PyPI - palimpzest - Versions diffs - 0.8.2__py3-none-any.whl → 0.8.3__py3-none-any.whl - Mend

palimpzest 0.8.2-py3-none-any.whl → 0.8.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

palimpzest/constants.py +38 -62
palimpzest/core/data/iter_dataset.py +5 -5
palimpzest/core/elements/groupbysig.py +1 -1
palimpzest/core/elements/records.py +91 -109
palimpzest/core/lib/schemas.py +23 -0
palimpzest/core/models.py +3 -3
palimpzest/prompts/__init__.py +2 -6
palimpzest/prompts/convert_prompts.py +10 -66
palimpzest/prompts/critique_and_refine_prompts.py +66 -0
palimpzest/prompts/filter_prompts.py +8 -46
palimpzest/prompts/join_prompts.py +12 -75
palimpzest/prompts/{moa_aggregator_convert_prompts.py → moa_aggregator_prompts.py} +51 -2
palimpzest/prompts/moa_proposer_prompts.py +87 -0
palimpzest/prompts/prompt_factory.py +351 -479
palimpzest/prompts/split_merge_prompts.py +51 -2
palimpzest/prompts/split_proposer_prompts.py +48 -16
palimpzest/prompts/utils.py +109 -0
palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
palimpzest/query/execution/execution_strategy.py +4 -4
palimpzest/query/execution/mab_execution_strategy.py +1 -2
palimpzest/query/execution/parallel_execution_strategy.py +3 -3
palimpzest/query/execution/single_threaded_execution_strategy.py +8 -8
palimpzest/query/generators/generators.py +31 -17
palimpzest/query/operators/__init__.py +15 -2
palimpzest/query/operators/aggregate.py +21 -19
palimpzest/query/operators/compute.py +6 -8
palimpzest/query/operators/convert.py +12 -37
palimpzest/query/operators/critique_and_refine.py +194 -0
palimpzest/query/operators/distinct.py +7 -7
palimpzest/query/operators/filter.py +13 -25
palimpzest/query/operators/join.py +321 -192
palimpzest/query/operators/limit.py +4 -4
palimpzest/query/operators/mixture_of_agents.py +246 -0
palimpzest/query/operators/physical.py +25 -2
palimpzest/query/operators/project.py +4 -4
palimpzest/query/operators/{rag_convert.py → rag.py} +202 -5
palimpzest/query/operators/retrieve.py +10 -9
palimpzest/query/operators/scan.py +9 -10
palimpzest/query/operators/search.py +18 -24
palimpzest/query/operators/split.py +321 -0
palimpzest/query/optimizer/__init__.py +12 -8
palimpzest/query/optimizer/optimizer.py +12 -10
palimpzest/query/optimizer/rules.py +201 -108
palimpzest/query/optimizer/tasks.py +18 -6
palimpzest/validator/validator.py +7 -9
{palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/METADATA +3 -8
palimpzest-0.8.3.dist-info/RECORD +95 -0
palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -216
palimpzest/prompts/moa_proposer_convert_prompts.py +0 -75
palimpzest/prompts/util_phrases.py +0 -19
palimpzest/query/operators/critique_and_refine_convert.py +0 -113
palimpzest/query/operators/mixture_of_agents_convert.py +0 -140
palimpzest/query/operators/split_convert.py +0 -170
palimpzest-0.8.2.dist-info/RECORD +0 -95
{palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/WHEEL +0 -0
{palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/top_level.txt +0 -0

palimpzest/prompts/split_merge_prompts.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """This file contains prompts for SplitAndMerge aggregator operations."""
 ### SYSTEM PROMPTS ###
-COT_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
+MAP_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
 You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
 It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
@@ -34,8 +34,38 @@ ANSWER:
 ---
 """
+FILTER_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to answer a TRUE/FALSE question.
+You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single TRUE/FALSE answer.
+It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
+You will be provided with a description of each input field and the filter condition.
+Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
+An example is shown below:
+---
+CHUNK 1 OUTPUT: The context describes Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, who is widely recognized as a foundational figure in computer science. Therefore, the answer is TRUE.
+CHUNK 2 OUTPUT: Based on the context provided, Ada Lovelace is indeed a foundational computer scientist, therefore the answer is TRUE.
+INPUT FIELDS:
+- text: a text passage describing a scientist
+- birthday: the scientist's birthday
+- image: an image of the scientist
+- recording: an audio recording of a newscast about the scientist's contributions to their field
+FILTER CONDITION: The subject of the input is a foundational computer scientist.
+Let's think step-by-step in order to answer the question.
+REASONING: Looking at both chunk outputs, they agree that the subject is a foundational computer scientist. Both outputs provide consistent evidence supporting this conclusion.
+ANSWER: TRUE
+---
+"""
 ### USER / INSTANCE-SPECIFIC PROMPTS ###
-COT_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
+MAP_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
 You will be presented with one or more outputs produced by a set of models. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
 It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
@@ -54,3 +84,22 @@ OUTPUT FIELDS:
 Let's think step-by-step in order to answer the question.
 REASONING: """
+FILTER_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to answer a TRUE/FALSE question.
+You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single TRUE/FALSE answer.
+It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
+You will be provided with a description of each input field and the filter condition.
+Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
+---
+{chunk_outputs}
+INPUT FIELDS:
+{input_fields_desc}
+FILTER CONDITION: {filter_condition}
+Let's think step-by-step in order to answer the question.
+REASONING: """

palimpzest/prompts/split_proposer_prompts.py CHANGED Viewed

@@ -1,8 +1,8 @@
-"""This file contains prompts for SplitConvert operations on text inputs."""
+"""This file contains prompts for SplitAndMerge operations."""
-### BASE PROMPTS ###
-COT_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
-You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
+### SYSTEM PROMPTS ###
+MAP_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and a set of output fields to generate. Your task is to generate a detailed and succinct analysis describing what you believe is the correct value for each output field.
 Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
 You will be provided with a description of each input field and each output field.
@@ -16,7 +16,7 @@ OUTPUT FIELDS:
 {example_output_fields}
 CONTEXT:
-{example_context}
+{{{example_context}}}{image_disclaimer}{audio_disclaimer}
 Let's think step-by-step in order to answer the question.
@@ -24,7 +24,30 @@ ANSWER: {example_answer}
 ---
 """
-COT_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+FILTER_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and a filter condition. Your task is to generate a detailed and succinct analysis describing whether you believe the input satisfies the filter condition.
+Be sure to cite information from the context as evidence of why your determination is correct. Do not hallucinate evidence.
+You will be provided with a description of each input field.
+An example is shown below:
+---
+INPUT FIELDS:
+{example_input_fields}
+CONTEXT:
+{{{example_context}}}{image_disclaimer}{audio_disclaimer}
+FILTER CONDITION: {example_filter_condition}
+Let's think step-by-step in order to answer the question.
+ANSWER: {example_answer}
+---
+"""
+### USER / INSTANCE-SPECIFIC PROMPTS ###
+MAP_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
 Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
 {desc_section}
@@ -37,19 +60,28 @@ OUTPUT FIELDS:
 {output_fields_desc}
 CONTEXT:
-{context}
+{context}<<image-audio-placeholder>>
 Let's think step-by-step in order to answer the question.
 ANSWER: """
+FILTER_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and a filter condition. Your task is to generate a detailed and succinct analysis describing whether you believe the input satisfies the filter condition.
+Be sure to cite information from the context as evidence of why your determination is correct. Do not hallucinate evidence.
+{desc_section}
+You will be provided with a description of each input field.
+An example is shown below:
+---
+INPUT FIELDS:
+{input_fields_desc}
+CONTEXT:
+{context}<<image-audio-placeholder>>
+FILTER CONDITION: {filter_condition}
-### TEMPLATE INPUTS ###
-SPLIT_PROPOSER_JOB_INSTRUCTION = "produce an answer to a question"
-SPLIT_PROPOSER_EXAMPLE_INPUT_FIELDS = """- text: a text passage describing scientists"""
-SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS = """- name: the list of names for each scientist mentioned in the text
-- field_of_study: a list with the field of study for each scientist"""
-SPLIT_PROPOSER_EXAMPLE_CONTEXT = """{{
-  "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, born December 10, 1815 was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation."
-}}"""
-SPLIT_PROPOSER_EXAMPLE_ANSWER = """the text passage mentions the scientists "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and "Charles Babbage", both of whom were mathematicians. Therefore, the name output should be ["Augusta Ada King", "Charles Babbage"] and the field_of_study output should be ["Mathematician", "Mathematician"]."""
+Let's think step-by-step in order to answer the question.
+ANSWER: """

palimpzest/prompts/utils.py ADDED Viewed

@@ -0,0 +1,109 @@
+"""This file contains utility format strings which are templated into many of our prompts."""
+### FORMATTING INSTRUCTIONS ###
+ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON dictionary. The dictionary should only have the specified output fields."
+ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON list of dictionaries. The list may contain one or more dictionaries, and each dictionary should only have the specified output fields."
+### USER-PROVIDED DESCRIPTION FOR MAPS / FILTERS / JOINS ###
+DESC_SECTION = """
+The user has additionally provided you with this description of the task you need to perform:
+{desc}
+"""
+### JOB INSTRUCTIONS ###
+MAP_JOB_INSTRUCTION = """analyze input {modalities} in order to produce a JSON object"""
+FILTER_JOB_INSTRUCTION = """analyze input {modalities} in order to answer a TRUE / FALSE question"""
+JOIN_JOB_INSTRUCTION = """analyze input {modalities} in order to determine whether two data records satisfy a join condition"""
+PROPOSER_JOB_INSTRUCTION = """analyze input {modalities} in order to produce an answer to a question"""
+### FILTER / JOIN CONDITIONS ###
+EXAMPLE_FILTER_CONDITION = "The subject of the input is a foundational computer scientist."
+EXAMPLE_JOIN_CONDITION = "The two inputs are scientists in the same academic field."
+### EXAMPLE INPUT FIELDS ###
+TEXT_EXAMPLE_INPUT_FIELDS = """
+- text: a text passage describing a scientist
+- birthday: the scientist's birthday
+"""
+IMAGE_EXAMPLE_INPUT_FIELDS = """
+- image: an image of the scientist
+- photographer: the photographer of the image
+"""
+AUDIO_EXAMPLE_INPUT_FIELDS = """
+- recording: an audio recording of a newscast about the scientist's contributions to their field
+- speaker: the speaker in the recording
+"""
+RIGHT_TEXT_EXAMPLE_INPUT_FIELDS = """
+- contents: the contents of a text file
+"""
+RIGHT_IMAGE_EXAMPLE_INPUT_FIELDS = """
+- headshot: a headshot of a famous scientist
+"""
+RIGHT_AUDIO_EXAMPLE_INPUT_FIELDS = """
+- podcast: an audio recording of a podcast about historic scientists
+"""
+### EXAMPLE OUTPUT FIELDS ###
+TEXT_EXAMPLE_OUTPUT_FIELDS = """- name: the name of the scientist
+- birth_year: the year the scientist was born"""
+IMAGE_EXAMPLE_OUTPUT_FIELDS = """- is_bald: true if the scientist is bald and false otherwise"""
+AUDIO_EXAMPLE_OUTPUT_FIELDS = """- birthplace: the city where the scientist was born"""
+### EXAMPLE CONTEXTS ###
+TEXT_EXAMPLE_CONTEXT = """
+  "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
+  "birthday": "December 10, 1815"
+"""
+IMAGE_EXAMPLE_CONTEXT = """
+  "image": <bytes>,
+  "photographer": "CameraEnthusiast1"
+"""
+AUDIO_EXAMPLE_CONTEXT = """
+  "recording": <bytes>,
+  "speaker": "Walter Cronkite"
+"""
+RIGHT_TEXT_EXAMPLE_CONTEXT = """
+  "content": "Alan Turing was a pioneering computer scientist and mathematician. He is widely considered to be the father of theoretical computer science and artificial intelligence."
+"""
+RIGHT_IMAGE_EXAMPLE_CONTEXT = """
+  "headshot": <bytes>
+"""
+RIGHT_AUDIO_EXAMPLE_CONTEXT = """
+  "podcast": <bytes>
+"""
+### DISCLAIMERS ###
+IMAGE_DISCLAIMER = """
+\n<image content provided here; assume in this example the image shows Ada Lovelace wearing a hat on top of her hair>
+"""
+AUDIO_DISCLAIMER = """
+\n<audio content provided here; assume in this example the recording is about Ada Lovelace's upbringing in London>
+"""
+RIGHT_IMAGE_DISCLAIMER = """
+\n<image content provided here; assume in this example the image shows Alan Turing working at his desk>
+"""
+RIGHT_AUDIO_DISCLAIMER = """
+\n<audio content provided here; assume in this example the podcast is discussing Alan Turing's work on the Enigma code>
+"""
+### EXAMPLE REASONINGS ###
+TEXT_EXAMPLE_REASONING = """The text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815."""
+IMAGE_EXAMPLE_REASONING = """The image shows hair on top of the scientist's head, so the is_bald field should be false."""
+AUDIO_EXAMPLE_REASONING = """The newscast recording discusses Ada Lovelace's upbringing in London, so the birthplace field should be "London"."""
+FILTER_EXAMPLE_REASONING = """Ada Lovelace is a foundational computer scientist, therefore the answer is TRUE."""
+JOIN_EXAMPLE_REASONING = """The subject of the left record is Ada Lovelace and the subject of the right record is Alan Turing. Since both inputs are about computer scientists, they satisfy the join condition. Therefore, the answer is TRUE."""
+### EXAMPLE ANSWERS ###
+TEXT_EXAMPLE_ANSWER = """
+  "name": "Augusta Ada King",
+  "birth_year": 1815
+"""
+IMAGE_EXAMPLE_ANSWER = """
+  "is_bald": false,
+"""
+AUDIO_EXAMPLE_ANSWER = """
+  "birthplace": "London",
+"""
+TEXT_SENTENCE_EXAMPLE_ANSWER = """the text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815."""
+IMAGE_SENTENCE_EXAMPLE_ANSWER = """The image shows hair on top of the woman's head, so the is_bald field should be false."""
+AUDIO_SENTENCE_EXAMPLE_ANSWER = """The newscast recording discusses Ada Lovelace's upbringing in London, so her birthplace is "London"."""

palimpzest/query/execution/all_sample_execution_strategy.py CHANGED Viewed

@@ -146,7 +146,7 @@ class OpSet:
             input = []
             max_quality_record_set = self.pick_highest_quality_output(record_sets)
             for record in max_quality_record_set:
-                input.append(record if record.passed_operator else None)
+                input.append(record if record._passed_operator else None)
             self.source_indices_to_inputs[source_idx] = input

palimpzest/query/execution/execution_strategy.py CHANGED Viewed

@@ -182,7 +182,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
                     elif isinstance(op, LLMFilter):
                         filter_str = op.filter_obj.filter_condition
                         input_record: DataRecord = record_set.input
-                        output = record_set.data_records[0].passed_operator
+                        output = record_set.data_records[0]._passed_operator
                         full_hash = f"{filter_str}{hash(input_record)}"
                         if full_hash not in full_hashes:
                             full_hash_to_bool_output[full_hash] = output
@@ -195,7 +195,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
                         for left_idx, left_input_record in enumerate(record_set.input[0]):
                             for right_idx, right_input_record in enumerate(record_set.input[1]):
                                 record_idx = left_idx * len(record_set.input[1]) + right_idx
-                                output = record_set.data_records[record_idx].passed_operator
+                                output = record_set.data_records[record_idx]._passed_operator
                                 full_hash = f"{condition}{hash(left_input_record)}{hash(right_input_record)}"
                                 if full_hash not in full_hashes:
                                     full_hash_to_bool_output[full_hash] = output
@@ -246,7 +246,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
                 elif isinstance(op, LLMFilter):
                     filter_str = op.filter_obj.filter_condition
                     input_record: DataRecord = record_set.input
-                    output = record_set.data_records[0].passed_operator
+                    output = record_set.data_records[0]._passed_operator
                     full_hash = f"{filter_str}{hash(input_record)}"
                     if output == full_hash_to_bool_output[full_hash]:
                         record_set.record_op_stats[0].quality = full_hash_to_score[full_hash]
@@ -258,7 +258,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
                     for left_idx, left_input_record in enumerate(record_set.input[0]):
                         for right_idx, right_input_record in enumerate(record_set.input[1]):
                             record_idx = left_idx * len(record_set.input[1]) + right_idx
-                            output = record_set.data_records[record_idx].passed_operator
+                            output = record_set.data_records[record_idx]._passed_operator
                             full_hash = f"{condition}{hash(left_input_record)}{hash(right_input_record)}"
                             if output == full_hash_to_bool_output[full_hash]:
                                 record_set.record_op_stats[record_idx].quality = full_hash_to_score[full_hash]

palimpzest/query/execution/mab_execution_strategy.py CHANGED Viewed

@@ -608,11 +608,10 @@ class OpFrontier:
             input = []
             max_quality_record_set = self.pick_highest_quality_output(record_sets)
             for record in max_quality_record_set:
-                input.append(record if record.passed_operator else None)
+                input.append(record if record._passed_operator else None)
             self.source_indices_to_inputs[source_unique_logical_op_id][source_indices] = input
 class MABExecutionStrategy(SentinelExecutionStrategy):
     """
     This class implements the Multi-Armed Bandit (MAB) execution strategy for SentinelQueryProcessors.

palimpzest/query/execution/parallel_execution_strategy.py CHANGED Viewed

@@ -61,8 +61,8 @@ class ParallelExecutionStrategy(ExecutionStrategy):
             output = future.result()
             record_set, num_inputs_processed = output if self.is_join_op[unique_full_op_id] else (output, 1)
-            # record set can be None if one side of join has no input records yet
-            if record_set is None:
+            # record set can be empty if one side of join has no input records yet
+            if len(record_set) == 0:
                 continue
             # otherwise, process records and their stats
@@ -77,7 +77,7 @@ class ParallelExecutionStrategy(ExecutionStrategy):
             plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
             # add records which aren't filtered to the output records
-            output_records.extend([record for record in records if record.passed_operator])
+            output_records.extend([record for record in records if record._passed_operator])
         # update the progress manager
         if total_inputs_processed > 0:

palimpzest/query/execution/single_threaded_execution_strategy.py CHANGED Viewed

@@ -52,7 +52,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
                 record_set = operator(candidates=input_queues[unique_full_op_id][source_unique_full_op_id])
                 records = record_set.data_records
                 record_op_stats = record_set.record_op_stats
-                num_outputs = sum(record.passed_operator for record in records)
+                num_outputs = sum(record._passed_operator for record in records)
                 # update the progress manager
                 self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -70,7 +70,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
                 record_set, num_inputs_processed = operator(left_input_records, right_input_records)
                 records = record_set.data_records
                 record_op_stats = record_set.record_op_stats
-                num_outputs = sum(record.passed_operator for record in records)
+                num_outputs = sum(record._passed_operator for record in records)
                 # update the progress manager
                 self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -82,7 +82,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
                     record_set = operator(input_record)
                     records.extend(record_set.data_records)
                     record_op_stats.extend(record_set.record_op_stats)
-                    num_outputs = sum(record.passed_operator for record in record_set.data_records)
+                    num_outputs = sum(record._passed_operator for record in record_set.data_records)
                     # update the progress manager
                     self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -95,7 +95,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
             plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
             # update next input_queue (if it exists)
-            output_records = [record for record in records if record.passed_operator]
+            output_records = [record for record in records if record._passed_operator]
             next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
             if next_unique_full_op_id is not None:
                 input_queues[next_unique_full_op_id][unique_full_op_id] = output_records
@@ -207,7 +207,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
                     record_set = operator(candidates=input_records)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
-                    num_outputs = sum(record.passed_operator for record in records)
+                    num_outputs = sum(record._passed_operator for record in records)
                     # update the progress manager
                     self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -225,7 +225,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
                     record_set, num_inputs_processed = operator(left_input_records, right_input_records)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
-                    num_outputs = sum(record.passed_operator for record in records)
+                    num_outputs = sum(record._passed_operator for record in records)
                     # update the progress manager
                     self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -237,7 +237,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
                     record_set = operator(input_record)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
-                    num_outputs = sum(record.passed_operator for record in records)
+                    num_outputs = sum(record._passed_operator for record in records)
                     # update the progress manager
                     self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -246,7 +246,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
                 plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
                 # update next input_queue or final_output_records
-                output_records = [record for record in records if record.passed_operator]
+                output_records = [record for record in records if record._passed_operator]
                 next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
                 if next_unique_full_op_id is not None:
                     input_queues[next_unique_full_op_id][unique_full_op_id].extend(output_records)

palimpzest/query/generators/generators.py CHANGED Viewed

@@ -101,7 +101,7 @@ def get_json_from_answer(answer: str, model: Model, cardinality: Cardinality) ->
 # TODO: make sure answer parsing works with custom prompts / parsers (can defer this)
 class Generator(Generic[ContextType, InputType]):
     """
-    Abstract base class for Generators.
+    Class for generating new fields for a record using an LLM.
     """
     def __init__(
@@ -181,11 +181,11 @@ class Generator(Generic[ContextType, InputType]):
         return None
-    def _check_bool_answer_text(self, answer_text: str) -> dict | None:
+    def _check_bool_answer_text(self, answer_text: str, throw_exception: bool=False) -> dict | None:
         """
         Return {"passed_operator": True} if and only if "true" is in the answer text.
         Return {"passed_operator": False} if and only if "false" is in the answer text.
-        Otherwise, return None.
+        Otherwise, raise an exception if throw_exception is True; if it is False, return None.
         """
         # NOTE: we may be able to eliminate this condition by specifying this JSON output in the prompt;
         # however, that would also need to coincide with a change to allow the parse_answer_fn to set "passed_operator"
@@ -194,6 +194,9 @@ class Generator(Generic[ContextType, InputType]):
         elif "false" in answer_text.lower():
             return {"passed_operator": False}
+        if throw_exception:
+            raise Exception(f"Could not parse answer from completion text: {answer_text}")
         return None
     def _parse_convert_answer(self, completion_text: str, fields: dict[str, FieldInfo], json_output: bool) -> dict[str, list]:
@@ -235,7 +238,7 @@ class Generator(Generic[ContextType, InputType]):
         return self._check_convert_answer_text(completion_text, fields, throw_exception=True)
-    def _parse_bool_answer(self, completion_text: str) -> dict[str, list]:
+    def _parse_bool_answer(self, completion_text: str, json_output: bool) -> dict[str, list]:
         """Extract the answer from the completion object for filter and join operations."""
         # if the model followed the default instructions, the completion text will place
         # its answer between "ANSWER:" and "---"
@@ -243,6 +246,12 @@ class Generator(Generic[ContextType, InputType]):
         matches = regex.findall(completion_text)
         if len(matches) > 0:
             answer_text = matches[0].strip()
+            # if we don't expect a JSON output, return the answer text as is
+            if not json_output:
+                return answer_text
+            # otherwise, try to parse the answer text into a JSON object
             field_answers = self._check_bool_answer_text(answer_text)
             if field_answers is not None:
                 return field_answers
@@ -252,16 +261,21 @@ class Generator(Generic[ContextType, InputType]):
         matches = regex.findall(completion_text)
         if len(matches) > 0:
             answer_text = matches[0].strip()
+            # if we don't expect a JSON output, return the answer text as is
+            if not json_output:
+                return answer_text
+            # otherwise, try to parse the answer text into a JSON object
             field_answers = self._check_bool_answer_text(answer_text)
             if field_answers is not None:
                 return field_answers
-        # finally, try taking all of the text; throw an exception if this doesn't work
-        field_answers = self._check_bool_answer_text(completion_text)
-        if field_answers is None:
-            raise Exception(f"Could not parse answer from completion text: {completion_text}")
+        # finally, try taking all of the text; for JSON output, throw an exception if parsing fails
+        if not json_output:
+            return completion_text
-        return field_answers
+        return self._check_bool_answer_text(completion_text, throw_exception=True)
     def _parse_answer(self, completion_text: str, fields: dict[str, FieldInfo] | None, json_output: bool, **kwargs) -> dict[str, list]:
         """Extract the answer from the completion object."""
@@ -275,8 +289,8 @@ class Generator(Generic[ContextType, InputType]):
         # extract the per-field answers from the completion text
         field_answers = (
-            self._parse_bool_answer(completion_text)
-            if self.prompt_strategy.is_bool_prompt() or self.prompt_strategy.is_join_prompt()
+            self._parse_bool_answer(completion_text, json_output)
+            if self.prompt_strategy.is_filter_prompt() or self.prompt_strategy.is_join_prompt()
             else self._parse_convert_answer(completion_text, fields, json_output)
         )
@@ -299,6 +313,7 @@ class Generator(Generic[ContextType, InputType]):
         # generate a list of messages which can be used to construct a payload
         messages = self.prompt_factory.create_messages(candidate, fields, right_candidate, **kwargs)
+        is_audio_op = any(msg.get("type") == "input_audio" for msg in messages)
         # generate the text completion
         start_time = time.time()
@@ -307,7 +322,7 @@ class Generator(Generic[ContextType, InputType]):
             completion_kwargs = {}
             if not self.model.is_o_model() and not self.model.is_gpt_5_model():
                 completion_kwargs = {"temperature": kwargs.get("temperature", 0.0), **completion_kwargs}
-            if self.prompt_strategy.is_audio_prompt():
+            if is_audio_op:
                 completion_kwargs = {"modalities": ["text"], **completion_kwargs}
             if self.model.is_reasoning_model():
                 if self.model.is_vertex_model():
@@ -330,11 +345,10 @@ class Generator(Generic[ContextType, InputType]):
         # if there's an error generating the completion, we have to return an empty answer
         # and can only account for the time spent performing the failed generation
         except Exception as e:
-            print(f"Error generating completion: {e}")
             logger.error(f"Error generating completion: {e}")
             field_answers = (
                 {"passed_operator": False}
-                if self.prompt_strategy.is_bool_prompt() or self.prompt_strategy.is_join_prompt()
+                if self.prompt_strategy.is_filter_prompt() or self.prompt_strategy.is_join_prompt()
                 else {field_name: None for field_name in fields}
             )
             reasoning = None
@@ -360,7 +374,7 @@ class Generator(Generic[ContextType, InputType]):
             #       for now, we only use tokens from prompt_token_details if it's an audio prompt
             # get output tokens (all text) and input tokens by modality
             output_tokens = usage["completion_tokens"]
-            if self.prompt_strategy.is_audio_prompt():
+            if is_audio_op:
                 input_audio_tokens = usage["prompt_tokens_details"].get("audio_tokens", 0)
                 input_text_tokens = usage["prompt_tokens_details"].get("text_tokens", 0)
                 input_image_tokens = 0
@@ -413,9 +427,9 @@ class Generator(Generic[ContextType, InputType]):
         # parse field answers
         field_answers = None
-        if fields is not None and (self.prompt_strategy.is_bool_prompt() or self.prompt_strategy.is_join_prompt()):
+        if fields is not None and (self.prompt_strategy.is_filter_prompt() or self.prompt_strategy.is_join_prompt()):
             field_answers = {"passed_operator": False}
-        elif fields is not None and not (self.prompt_strategy.is_bool_prompt() or self.prompt_strategy.is_join_prompt()):
+        elif fields is not None and not (self.prompt_strategy.is_filter_prompt() or self.prompt_strategy.is_join_prompt()):
             field_answers = {field_name: None for field_name in fields}
         try:
             field_answers = self._parse_answer(completion_text, fields, json_output, **kwargs)

palimpzest/query/operators/__init__.py CHANGED Viewed

@@ -6,6 +6,8 @@ from palimpzest.query.operators.convert import ConvertOp as _ConvertOp
 from palimpzest.query.operators.convert import LLMConvert as _LLMConvert
 from palimpzest.query.operators.convert import LLMConvertBonded as _LLMConvertBonded
 from palimpzest.query.operators.convert import NonLLMConvert as _NonLLMConvert
+from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert as _CritiqueAndRefineConvert
+from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineFilter as _CritiqueAndRefineFilter
 from palimpzest.query.operators.distinct import DistinctOp as _DistinctOp
 from palimpzest.query.operators.filter import FilterOp as _FilterOp
 from palimpzest.query.operators.filter import LLMFilter as _LLMFilter
@@ -46,12 +48,17 @@ from palimpzest.query.operators.logical import (
 from palimpzest.query.operators.logical import (
     RetrieveScan as _RetrieveScan,
 )
-from palimpzest.query.operators.mixture_of_agents_convert import MixtureOfAgentsConvert as _MixtureOfAgentsConvert
+from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert as _MixtureOfAgentsConvert
+from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsFilter as _MixtureOfAgentsFilter
 from palimpzest.query.operators.physical import PhysicalOperator as _PhysicalOperator
 from palimpzest.query.operators.project import ProjectOp as _ProjectOp
+from palimpzest.query.operators.rag import RAGConvert as _RAGConvert
+from palimpzest.query.operators.rag import RAGFilter as _RAGFilter
 from palimpzest.query.operators.retrieve import RetrieveOp as _RetrieveOp
 from palimpzest.query.operators.scan import MarshalAndScanDataOp as _MarshalAndScanDataOp
 from palimpzest.query.operators.scan import ScanPhysicalOp as _ScanPhysicalOp
+from palimpzest.query.operators.split import SplitConvert as _SplitConvert
+from palimpzest.query.operators.split import SplitFilter as _SplitFilter
 LOGICAL_OPERATORS = [
     _LogicalOperator,
@@ -72,6 +79,8 @@ PHYSICAL_OPERATORS = (
     [_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp]
     # convert
     + [_ConvertOp, _NonLLMConvert, _LLMConvert, _LLMConvertBonded]
+    # critique and refine
+    + [_CritiqueAndRefineConvert, _CritiqueAndRefineFilter]
     # distinct
     + [_DistinctOp]
     # scan
@@ -83,13 +92,17 @@ PHYSICAL_OPERATORS = (
     # limit
     + [_LimitScanOp]
     # mixture-of-agents
-    + [_MixtureOfAgentsConvert]
+    + [_MixtureOfAgentsConvert, _MixtureOfAgentsFilter]
     # physical
     + [_PhysicalOperator]
     # project
     + [_ProjectOp]
+    # rag
+    + [_RAGConvert, _RAGFilter]
     # retrieve
     + [_RetrieveOp]
+    # split
+    + [_SplitConvert, _SplitFilter]
 )
 __all__ = [

palimpzest 0.8.2__py3-none-any.whl → 0.8.3__py3-none-any.whl

palimpzest 0.8.2__py3-none-any.whl → 0.8.3__py3-none-any.whl