PyPI - palimpzest - Versions diffs - 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

palimpzest/constants.py +13 -4
palimpzest/core/data/dataset.py +75 -5
palimpzest/core/elements/groupbysig.py +5 -1
palimpzest/core/elements/records.py +16 -7
palimpzest/core/lib/schemas.py +26 -3
palimpzest/core/models.py +4 -4
palimpzest/prompts/aggregate_prompts.py +99 -0
palimpzest/prompts/prompt_factory.py +162 -75
palimpzest/prompts/utils.py +38 -1
palimpzest/prompts/validator.py +24 -24
palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
palimpzest/query/execution/execution_strategy.py +8 -8
palimpzest/query/execution/mab_execution_strategy.py +30 -11
palimpzest/query/execution/parallel_execution_strategy.py +31 -7
palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
palimpzest/query/generators/generators.py +9 -7
palimpzest/query/operators/__init__.py +10 -6
palimpzest/query/operators/aggregate.py +394 -10
palimpzest/query/operators/convert.py +1 -1
palimpzest/query/operators/join.py +279 -23
palimpzest/query/operators/logical.py +36 -11
palimpzest/query/operators/mixture_of_agents.py +3 -1
palimpzest/query/operators/physical.py +5 -2
palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
palimpzest/query/optimizer/__init__.py +11 -3
palimpzest/query/optimizer/cost_model.py +5 -5
palimpzest/query/optimizer/optimizer.py +3 -2
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/rules.py +73 -13
palimpzest/query/optimizer/tasks.py +4 -4
palimpzest/utils/progress.py +19 -17
palimpzest/validator/validator.py +7 -7
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/RECORD +37 -36
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0

palimpzest/constants.py CHANGED Viewed

@@ -136,13 +136,17 @@ class PromptStrategy(str, Enum):
     performing some task with a specified Model.
     """
+    # aggregation prompt strategies
+    AGG = "aggregation"
+    AGG_NO_REASONING = "aggregation-no-reasoning"
     # filter prompt strategies
     FILTER = "filter"
     FILTER_NO_REASONING = "filter-no-reasoning"
     FILTER_CRITIC = "filter-critic"
     FILTER_REFINE = "filter-refine"
     FILTER_MOA_PROPOSER = "filter-mixture-of-agents-proposer"
-    FILTER_MOA_AGG = "filter-mixture-of-agents-aggregation"
+    FILTER_MOA_AGG = "filter-mixture-of-agents-aggregator"
     FILTER_SPLIT_PROPOSER = "filter-split-proposer"
     FILTER_SPLIT_MERGER = "filter-split-merger"
@@ -156,10 +160,13 @@ class PromptStrategy(str, Enum):
     MAP_CRITIC = "map-critic"
     MAP_REFINE = "map-refine"
     MAP_MOA_PROPOSER = "map-mixture-of-agents-proposer"
-    MAP_MOA_AGG = "map-mixture-of-agents-aggregation"
+    MAP_MOA_AGG = "map-mixture-of-agents-aggregator"
     MAP_SPLIT_PROPOSER = "map-split-proposer"
     MAP_SPLIT_MERGER = "map-split-merger"
+    def is_agg_prompt(self):
+        return "aggregation" in self.value
     def is_filter_prompt(self):
         return "filter" in self.value
@@ -179,7 +186,7 @@ class PromptStrategy(str, Enum):
         return "mixture-of-agents-proposer" in self.value
     def is_moa_aggregator_prompt(self):
-        return "mixture-of-agents-aggregation" in self.value
+        return "mixture-of-agents-aggregator" in self.value
     def is_split_proposer_prompt(self):
         return "split-proposer" in self.value
@@ -200,7 +207,9 @@ class Modality(str, Enum):
 class AggFunc(str, Enum):
     COUNT = "count"
     AVERAGE = "average"
+    SUM = "sum"
+    MIN = "min"
+    MAX = "max"
 class Cardinality(str, Enum):
     ONE_TO_ONE = "one-to-one"

palimpzest/core/data/dataset.py CHANGED Viewed

@@ -22,7 +22,7 @@ from palimpzest.query.operators.logical import (
     LimitScan,
     LogicalOperator,
     Project,
-    RetrieveScan,
+    TopKScan,
 )
 from palimpzest.query.processor.config import QueryProcessorConfig
 from palimpzest.utils.hash_helpers import hash_for_serialized_dict
@@ -243,7 +243,30 @@ class Dataset:
             id=self.id,
         )
-    def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
+    def join(self, other: Dataset, on: str | list[str], how: str = "inner") -> Dataset:
+        """
+        Perform the specified join on the specified (list of) column(s)
+        """
+        # enforce type for on
+        if isinstance(on, str):
+            on = [on]
+        # construct new output schema
+        combined_schema = union_schemas([self.schema, other.schema], join=True, on=on)
+        # construct logical operator
+        operator = JoinOp(
+            input_schema=combined_schema,
+            output_schema=combined_schema,
+            condition="",
+            on=on,
+            how=how,
+            depends_on=on,
+        )
+        return Dataset(sources=[self, other], operator=operator, schema=combined_schema)
+    def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None, how: str = "inner") -> Dataset:
         """
         Perform a semantic (inner) join on the specified join predicate
         """
@@ -259,6 +282,7 @@ class Dataset:
             input_schema=combined_schema,
             output_schema=combined_schema,
             condition=condition,
+            how=how,
             desc=desc,
             depends_on=depends_on,
         )
@@ -346,7 +370,6 @@ class Dataset:
         return Dataset(sources=[self], operator=operator, schema=new_output_schema)
     def sem_add_columns(self, cols: list[dict] | type[BaseModel],
                         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
                         desc: str | None = None,
@@ -534,12 +557,59 @@ class Dataset:
         operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.AVERAGE)
         return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
+    def sum(self) -> Dataset:
+        """Apply a summation to this set"""
+        operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.SUM)
+        return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
+    def min(self) -> Dataset:
+        """Apply an min operator to this set"""
+        operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MIN)
+        return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
+    def max(self) -> Dataset:
+        """Apply an max operator to this set"""
+        operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MAX)
+        return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
     def groupby(self, groupby: GroupBySig) -> Dataset:
         output_schema = groupby.output_schema()
         operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, group_by_sig=groupby)
         return Dataset(sources=[self], operator=operator, schema=output_schema)
-    def retrieve(
+    def sem_agg(self, col: dict | type[BaseModel], agg: str, depends_on: str | list[str] | None = None) -> Dataset:
+        """
+        Apply a semantic aggregation to this set. The `agg` string will be applied using an LLM
+        over the entire set of inputs' fields specified in `depends_on` to generate the output `col`.
+        Example:
+            sem_agg(
+                col={'name': 'overall_sentiment', 'desc': 'The overall sentiment of the reviews', 'type': str},
+                agg="Compute the overall sentiment of the reviews as POSITIVE or NEGATIVE.",
+                depends_on="review_text",
+            )
+        """
+        # construct new output schema
+        new_output_schema = None
+        if isinstance(col, dict):
+            col_schema = create_schema_from_fields([col])
+            new_output_schema = union_schemas([self.schema, col_schema])
+        elif issubclass(col, BaseModel):
+            assert len(col.model_fields) == 1, "For semantic aggregation, when passing a BaseModel to `col` it must have exactly one field."
+            new_output_schema = union_schemas([self.schema, col])
+        else:
+            raise ValueError("`col` must be a dictionary or a single-field BaseModel.")
+        # enforce type for depends_on
+        if isinstance(depends_on, str):
+            depends_on = [depends_on]
+        # construct logical operator
+        operator = Aggregate(input_schema=self.schema, output_schema=new_output_schema, agg_str=agg, depends_on=depends_on)
+        return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
+    def sem_topk(
         self,
         index: Collection,
         search_attr: str,
@@ -566,7 +636,7 @@ class Dataset:
         # index = index_factory(index)
         # construct logical operator
-        operator = RetrieveScan(
+        operator = TopKScan(
             input_schema=self.schema,
             output_schema=new_output_schema,
             index=index,

palimpzest/core/elements/groupbysig.py CHANGED Viewed

@@ -6,8 +6,11 @@ from pydantic import BaseModel
 from palimpzest.core.lib.schemas import create_schema_from_fields
+# TODO:
+# - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator
+# - construct the correct output schema using the input schema and the group by and aggregation fields
+# - remove/update all other references to GroupBySig in the codebase
-# TODO: need to rethink how group bys work
 # signature for a group by aggregate that applies
 # group and aggregation to an input tuple
 class GroupBySig:
@@ -50,6 +53,7 @@ class GroupBySig:
             ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")")
         return ops
+    # TODO: output schema needs to account for input schema types and create new output schema types
     def output_schema(self) -> type[BaseModel]:
         # the output class varies depending on the group by, so here
         # we dynamically construct this output

palimpzest/core/elements/records.py CHANGED Viewed

@@ -140,7 +140,7 @@ class DataRecord:
     def schema(self) -> type[BaseModel]:
         return type(self._data_item)
-    def copy(self):
+    def copy(self) -> DataRecord:
         # get the set of fields to copy from the parent record
         copy_field_names = [field.split(".")[-1] for field in self.get_field_names()]
@@ -228,18 +228,18 @@ class DataRecord:
     @staticmethod
     def from_join_parents(
         schema: type[BaseModel],
-        left_parent_record: DataRecord,
-        right_parent_record: DataRecord,
+        left_parent_record: DataRecord | None,
+        right_parent_record: DataRecord | None,
         project_cols: list[str] | None = None,
         cardinality_idx: int = None,
     ) -> DataRecord:
         # get the set of fields and field descriptions to copy from the parent record(s)
-        left_copy_field_names = (
+        left_copy_field_names = [] if left_parent_record is None else (
             left_parent_record.get_field_names()
             if project_cols is None
             else [col for col in project_cols if col in left_parent_record.get_field_names()]
         )
-        right_copy_field_names = (
+        right_copy_field_names = [] if right_parent_record is None else (
             right_parent_record.get_field_names()
             if project_cols is None
             else [col for col in project_cols if col in right_parent_record.get_field_names()]
@@ -255,11 +255,20 @@ class DataRecord:
                 new_field_name = f"{field_name}_right"
             data_item[new_field_name] = right_parent_record[field_name]
+        # for any missing fields in the schema, set them to None
+        for field_name in schema.model_fields:
+            if field_name not in data_item:
+                data_item[field_name] = None
         # make new record which has left and right parent record as its parents
+        left_parent_source_indices = [] if left_parent_record is None else list(left_parent_record._source_indices)
+        right_parent_source_indices = [] if right_parent_record is None else list(right_parent_record._source_indices)
+        left_parent_record_id = [] if left_parent_record is None else [left_parent_record._id]
+        right_parent_record_id = [] if right_parent_record is None else [right_parent_record._id]
         new_dr = DataRecord(
             schema(**data_item),
-            source_indices=list(left_parent_record._source_indices) + list(right_parent_record._source_indices),
-            parent_ids=[left_parent_record._id, right_parent_record._id],
+            source_indices=left_parent_source_indices + right_parent_source_indices,
+            parent_ids=left_parent_record_id + right_parent_record_id,
             cardinality_idx=cardinality_idx,
         )

palimpzest/core/lib/schemas.py CHANGED Viewed

@@ -142,16 +142,30 @@ def create_schema_from_df(df: pd.DataFrame) -> type[BaseModel]:
     return _create_pickleable_model(fields)
-def union_schemas(models: list[type[BaseModel]], join: bool = False) -> type[BaseModel]:
+def union_schemas(models: list[type[BaseModel]], join: bool = False, on: list[str] | None = None) -> type[BaseModel]:
     """Union multiple Pydantic models into a single model."""
+    # convert on to empty list if None
+    if on is None:
+        on = []
+    # build up the fields for the new schema
     fields = {}
     for model in models:
         for field_name, field in model.model_fields.items():
-            if field_name in fields and not join:
+            # for non-join unions, make sure duplicate fields have the same type
+            if not join and field_name in fields:
                 assert fields[field_name][0] == field.annotation, f"Field {field_name} has different types in different models"
-            elif field_name in fields and join:
+            # for joins with "on" specified, no need to rename fields in "on"
+            elif join and field_name in on and field_name in fields:
+                continue
+            # otherwise, rename duplicate fields by appending _right
+            elif join and field_name in fields:
                 while field_name in fields:
                     field_name = f"{field_name}_right"
+            # add the field to the new schema
             fields[field_name] = (field.annotation, field)
     # create and return the new schema
@@ -194,6 +208,15 @@ class Average(BaseModel):
 class Count(BaseModel):
     count: int = Field(description="The count of items in the dataset")
+class Sum(BaseModel):
+    sum: int = Field(description="The summation of items in the dataset")
+class Min(BaseModel):
+    min: int | float = Field(description="The minimum value of some items in the dataset")
+class Max(BaseModel):
+    max: int | float = Field(description="The maximum value of some items in the dataset")
 class OperatorDerivedSchema(BaseModel):
     """Schema defined by an operator, e.g., a join or a group by"""

palimpzest/core/models.py CHANGED Viewed

@@ -51,10 +51,10 @@ class GenerationStats(BaseModel):
     fn_call_duration_secs: float = 0.0
     # (if applicable) the total number of LLM calls made by this operator
-    total_llm_calls: int = 0
+    total_llm_calls: float = 0
     # (if applicable) the total number of embedding LLM calls made by this operator
-    total_embedding_llm_calls: int = 0
+    total_embedding_llm_calls: float = 0
     def __iadd__(self, other: GenerationStats) -> GenerationStats:
         # self.raw_answers.extend(other.raw_answers)
@@ -243,10 +243,10 @@ class RecordOpStats(BaseModel):
     fn_call_duration_secs: float = 0.0
     # (if applicable) the total number of LLM calls made by this operator
-    total_llm_calls: int = 0
+    total_llm_calls: float = 0
     # (if applicable) the total number of embedding LLM calls made by this operator
-    total_embedding_llm_calls: int = 0
+    total_embedding_llm_calls: float = 0
     # (if applicable) a boolean indicating whether this is the statistics captured from a failed convert operation
     failed_convert: bool | None = None

palimpzest/prompts/aggregate_prompts.py ADDED Viewed

@@ -0,0 +1,99 @@
+"""This file contains prompts for aggregation operations."""
+### BASE PROMPTS ###
+AGG_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
+You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
+{output_format_instruction} Finish your response with a newline character followed by ---
+An example is shown below:
+---
+INPUT FIELDS:
+{example_input_fields}
+OUTPUT FIELDS:
+{example_output_fields}
+CONTEXT:
+{{{example_context}}}
+{{{second_example_context}}}
+{{{third_example_context}}}{image_disclaimer}{audio_disclaimer}
+AGGREGATION INSTRUCTION: {example_agg_instruction}
+Let's think step-by-step in order to answer the question.
+REASONING: {example_reasoning}
+ANSWER:
+{{{example_answer}}}
+---
+"""
+AGG_NO_REASONING_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
+You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
+{output_format_instruction} Finish your response with a newline character followed by ---
+An example is shown below:
+---
+INPUT FIELDS:
+{example_input_fields}
+OUTPUT FIELDS:
+{example_output_fields}
+CONTEXT:
+{{{example_context}}}
+{{{second_example_context}}}
+{{{third_example_context}}}{image_disclaimer}{audio_disclaimer}
+AGGREGATION INSTRUCTION: {example_agg_instruction}
+ANSWER:
+{{{example_answer}}}
+---
+"""
+AGG_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
+You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
+{desc_section}
+{output_format_instruction} Finish your response with a newline character followed by ---
+---
+INPUT FIELDS:
+{input_fields_desc}
+OUTPUT FIELDS:
+{output_fields_desc}
+CONTEXT:
+{context}<<image-audio-placeholder>>
+AGGREGATION INSTRUCTION: {agg_instruction}
+Let's think step-by-step in order to answer the question.
+REASONING: """
+AGG_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
+You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
+{desc_section}
+{output_format_instruction} Finish your response with a newline character followed by ---
+---
+INPUT FIELDS:
+{input_fields_desc}
+OUTPUT FIELDS:
+{output_fields_desc}
+CONTEXT:
+{context}<<image-audio-placeholder>>
+AGGREGATION INSTRUCTION: {agg_instruction}
+ANSWER: """

palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl

palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl