palimpzest 0.8.2__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. palimpzest/constants.py +38 -62
  2. palimpzest/core/data/iter_dataset.py +5 -5
  3. palimpzest/core/elements/groupbysig.py +1 -1
  4. palimpzest/core/elements/records.py +91 -109
  5. palimpzest/core/lib/schemas.py +23 -0
  6. palimpzest/core/models.py +3 -3
  7. palimpzest/prompts/__init__.py +2 -6
  8. palimpzest/prompts/convert_prompts.py +10 -66
  9. palimpzest/prompts/critique_and_refine_prompts.py +66 -0
  10. palimpzest/prompts/filter_prompts.py +8 -46
  11. palimpzest/prompts/join_prompts.py +12 -75
  12. palimpzest/prompts/{moa_aggregator_convert_prompts.py → moa_aggregator_prompts.py} +51 -2
  13. palimpzest/prompts/moa_proposer_prompts.py +87 -0
  14. palimpzest/prompts/prompt_factory.py +351 -479
  15. palimpzest/prompts/split_merge_prompts.py +51 -2
  16. palimpzest/prompts/split_proposer_prompts.py +48 -16
  17. palimpzest/prompts/utils.py +109 -0
  18. palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  19. palimpzest/query/execution/execution_strategy.py +4 -4
  20. palimpzest/query/execution/mab_execution_strategy.py +1 -2
  21. palimpzest/query/execution/parallel_execution_strategy.py +3 -3
  22. palimpzest/query/execution/single_threaded_execution_strategy.py +8 -8
  23. palimpzest/query/generators/generators.py +31 -17
  24. palimpzest/query/operators/__init__.py +15 -2
  25. palimpzest/query/operators/aggregate.py +21 -19
  26. palimpzest/query/operators/compute.py +6 -8
  27. palimpzest/query/operators/convert.py +12 -37
  28. palimpzest/query/operators/critique_and_refine.py +194 -0
  29. palimpzest/query/operators/distinct.py +7 -7
  30. palimpzest/query/operators/filter.py +13 -25
  31. palimpzest/query/operators/join.py +321 -192
  32. palimpzest/query/operators/limit.py +4 -4
  33. palimpzest/query/operators/mixture_of_agents.py +246 -0
  34. palimpzest/query/operators/physical.py +25 -2
  35. palimpzest/query/operators/project.py +4 -4
  36. palimpzest/query/operators/{rag_convert.py → rag.py} +202 -5
  37. palimpzest/query/operators/retrieve.py +10 -9
  38. palimpzest/query/operators/scan.py +9 -10
  39. palimpzest/query/operators/search.py +18 -24
  40. palimpzest/query/operators/split.py +321 -0
  41. palimpzest/query/optimizer/__init__.py +12 -8
  42. palimpzest/query/optimizer/optimizer.py +12 -10
  43. palimpzest/query/optimizer/rules.py +201 -108
  44. palimpzest/query/optimizer/tasks.py +18 -6
  45. palimpzest/validator/validator.py +7 -9
  46. {palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/METADATA +3 -8
  47. palimpzest-0.8.3.dist-info/RECORD +95 -0
  48. palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -216
  49. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -75
  50. palimpzest/prompts/util_phrases.py +0 -19
  51. palimpzest/query/operators/critique_and_refine_convert.py +0 -113
  52. palimpzest/query/operators/mixture_of_agents_convert.py +0 -140
  53. palimpzest/query/operators/split_convert.py +0 -170
  54. palimpzest-0.8.2.dist-info/RECORD +0 -95
  55. {palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/WHEEL +0 -0
  56. {palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/licenses/LICENSE +0 -0
  57. {palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  """This file contains prompts for SplitConvert aggregator operations."""
2
2
 
3
3
  ### SYSTEM PROMPTS ###
4
- COT_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
4
+ MAP_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
5
5
  You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
6
6
  It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
7
7
 
@@ -34,8 +34,38 @@ ANSWER:
34
34
  ---
35
35
  """
36
36
 
37
+ FILTER_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to answer a TRUE/FALSE question.
38
+ You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single TRUE/FALSE answer.
39
+ It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
40
+
41
+ You will be provided with a description of each input field and the filter condition.
42
+
43
+ Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
44
+
45
+ An example is shown below:
46
+ ---
47
+ CHUNK 1 OUTPUT: The context describes Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, who is widely recognized as a foundational figure in computer science. Therefore, the answer is TRUE.
48
+
49
+ CHUNK 2 OUTPUT: Based on the context provided, Ada Lovelace is indeed a foundational computer scientist, therefore the answer is TRUE.
50
+
51
+ INPUT FIELDS:
52
+ - text: a text passage describing a scientist
53
+ - birthday: the scientist's birthday
54
+ - image: an image of the scientist
55
+ - recording: an audio recording of a newscast about the scientist's contributions to their field
56
+
57
+ FILTER CONDITION: The subject of the input is a foundational computer scientist.
58
+
59
+ Let's think step-by-step in order to answer the question.
60
+
61
+ REASONING: Looking at both chunk outputs, they agree that the subject is a foundational computer scientist. Both outputs provide consistent evidence supporting this conclusion.
62
+
63
+ ANSWER: TRUE
64
+ ---
65
+ """
66
+
37
67
  ### USER / INSTANCE-SPECIFIC PROMPTS ###
38
- COT_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
68
+ MAP_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
39
69
  You will be presented with one or more outputs produced by a set of models. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
40
70
  It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
41
71
 
@@ -54,3 +84,22 @@ OUTPUT FIELDS:
54
84
  Let's think step-by-step in order to answer the question.
55
85
 
56
86
  REASONING: """
87
+
88
+ FILTER_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to answer a TRUE/FALSE question.
89
+ You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single TRUE/FALSE answer.
90
+ It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
91
+
92
+ You will be provided with a description of each input field and the filter condition.
93
+
94
+ Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
95
+ ---
96
+ {chunk_outputs}
97
+
98
+ INPUT FIELDS:
99
+ {input_fields_desc}
100
+
101
+ FILTER CONDITION: {filter_condition}
102
+
103
+ Let's think step-by-step in order to answer the question.
104
+
105
+ REASONING: """
@@ -1,8 +1,8 @@
1
- """This file contains prompts for SplitConvert operations on text inputs."""
1
+ """This file contains prompts for SplitAndMerge operations."""
2
2
 
3
- ### BASE PROMPTS ###
4
- COT_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
5
- You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
3
+ ### SYSTEM PROMPTS ###
4
+ MAP_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
5
+ You will be presented with a context and a set of output fields to generate. Your task is to generate a detailed and succinct analysis describing what you believe is the correct value for each output field.
6
6
  Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
7
7
 
8
8
  You will be provided with a description of each input field and each output field.
@@ -16,7 +16,7 @@ OUTPUT FIELDS:
16
16
  {example_output_fields}
17
17
 
18
18
  CONTEXT:
19
- {example_context}
19
+ {{{example_context}}}{image_disclaimer}{audio_disclaimer}
20
20
 
21
21
  Let's think step-by-step in order to answer the question.
22
22
 
@@ -24,7 +24,30 @@ ANSWER: {example_answer}
24
24
  ---
25
25
  """
26
26
 
27
- COT_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
27
+ FILTER_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
28
+ You will be presented with a context and a filter condition. Your task is to generate a detailed and succinct analysis describing whether you believe the input satisfies the filter condition.
29
+ Be sure to cite information from the context as evidence of why your determination is correct. Do not hallucinate evidence.
30
+
31
+ You will be provided with a description of each input field.
32
+
33
+ An example is shown below:
34
+ ---
35
+ INPUT FIELDS:
36
+ {example_input_fields}
37
+
38
+ CONTEXT:
39
+ {{{example_context}}}{image_disclaimer}{audio_disclaimer}
40
+
41
+ FILTER CONDITION: {example_filter_condition}
42
+
43
+ Let's think step-by-step in order to answer the question.
44
+
45
+ ANSWER: {example_answer}
46
+ ---
47
+ """
48
+
49
+ ### USER / INSTANCE-SPECIFIC PROMPTS ###
50
+ MAP_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
28
51
  You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
29
52
  Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
30
53
  {desc_section}
@@ -37,19 +60,28 @@ OUTPUT FIELDS:
37
60
  {output_fields_desc}
38
61
 
39
62
  CONTEXT:
40
- {context}
63
+ {context}<<image-audio-placeholder>>
41
64
 
42
65
  Let's think step-by-step in order to answer the question.
43
66
 
44
67
  ANSWER: """
45
68
 
69
+ FILTER_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
70
+ You will be presented with a context and a filter condition. Your task is to generate a detailed and succinct analysis describing whether you believe the input satisfies the filter condition.
71
+ Be sure to cite information from the context as evidence of why your determination is correct. Do not hallucinate evidence.
72
+ {desc_section}
73
+ You will be provided with a description of each input field.
74
+
75
+ An example is shown below:
76
+ ---
77
+ INPUT FIELDS:
78
+ {input_fields_desc}
79
+
80
+ CONTEXT:
81
+ {context}<<image-audio-placeholder>>
82
+
83
+ FILTER CONDITION: {filter_condition}
46
84
 
47
- ### TEMPLATE INPUTS ###
48
- SPLIT_PROPOSER_JOB_INSTRUCTION = "produce an answer to a question"
49
- SPLIT_PROPOSER_EXAMPLE_INPUT_FIELDS = """- text: a text passage describing scientists"""
50
- SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS = """- name: the list of names for each scientist mentioned in the text
51
- - field_of_study: a list with the field of study for each scientist"""
52
- SPLIT_PROPOSER_EXAMPLE_CONTEXT = """{{
53
- "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, born December 10, 1815 was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation."
54
- }}"""
55
- SPLIT_PROPOSER_EXAMPLE_ANSWER = """the text passage mentions the scientists "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and "Charles Babbage", both of whom were mathematicians. Therefore, the name output should be ["Augusta Ada King", "Charles Babbage"] and the field_of_study output should be ["Mathematician", "Mathematician"]."""
85
+ Let's think step-by-step in order to answer the question.
86
+
87
+ ANSWER: """
@@ -0,0 +1,109 @@
1
+ """This file contains utility format strings which are templated into many of our prompts."""
2
+
3
+ ### FORMATTING INSTRUCTIONS ###
4
+ ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON dictionary. The dictionary should only have the specified output fields."
5
+ ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON list of dictionaries. The list may contain one or more dictionaries, and each dictionary should only have the specified output fields."
6
+
7
+ ### USER-PROVIDED DESCRIPTION FOR MAPS / FILTERS / JOINS ###
8
+ DESC_SECTION = """
9
+ The user has additionally provided you with this description of the task you need to perform:
10
+ {desc}
11
+ """
12
+
13
+ ### JOB INSTRUCTIONS ###
14
+ MAP_JOB_INSTRUCTION = """analyze input {modalities} in order to produce a JSON object"""
15
+ FILTER_JOB_INSTRUCTION = """analyze input {modalities} in order to answer a TRUE / FALSE question"""
16
+ JOIN_JOB_INSTRUCTION = """analyze input {modalities} in order to determine whether two data records satisfy a join condition"""
17
+ PROPOSER_JOB_INSTRUCTION = """analyze input {modalities} in order to produce an answer to a question"""
18
+
19
+ ### FILTER / JOIN CONDITIONS ###
20
+ EXAMPLE_FILTER_CONDITION = "The subject of the input is a foundational computer scientist."
21
+ EXAMPLE_JOIN_CONDITION = "The two inputs are scientists in the same academic field."
22
+
23
+ ### EXAMPLE INPUT FIELDS ###
24
+ TEXT_EXAMPLE_INPUT_FIELDS = """
25
+ - text: a text passage describing a scientist
26
+ - birthday: the scientist's birthday
27
+ """
28
+ IMAGE_EXAMPLE_INPUT_FIELDS = """
29
+ - image: an image of the scientist
30
+ - photographer: the photographer of the image
31
+ """
32
+ AUDIO_EXAMPLE_INPUT_FIELDS = """
33
+ - recording: an audio recording of a newscast about the scientist's contributions to their field
34
+ - speaker: the speaker in the recording
35
+ """
36
+ RIGHT_TEXT_EXAMPLE_INPUT_FIELDS = """
37
+ - contents: the contents of a text file
38
+ """
39
+ RIGHT_IMAGE_EXAMPLE_INPUT_FIELDS = """
40
+ - headshot: a headshot of a famous scientist
41
+ """
42
+ RIGHT_AUDIO_EXAMPLE_INPUT_FIELDS = """
43
+ - podcast: an audio recording of a podcast about historic scientists
44
+ """
45
+
46
+ ### EXAMPLE OUTPUT FIELDS ###
47
+ TEXT_EXAMPLE_OUTPUT_FIELDS = """- name: the name of the scientist
48
+ - birth_year: the year the scientist was born"""
49
+ IMAGE_EXAMPLE_OUTPUT_FIELDS = """- is_bald: true if the scientist is bald and false otherwise"""
50
+ AUDIO_EXAMPLE_OUTPUT_FIELDS = """- birthplace: the city where the scientist was born"""
51
+
52
+ ### EXAMPLE CONTEXTS ###
53
+ TEXT_EXAMPLE_CONTEXT = """
54
+ "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
55
+ "birthday": "December 10, 1815"
56
+ """
57
+ IMAGE_EXAMPLE_CONTEXT = """
58
+ "image": <bytes>,
59
+ "photographer": "CameraEnthusiast1"
60
+ """
61
+ AUDIO_EXAMPLE_CONTEXT = """
62
+ "recording": <bytes>,
63
+ "speaker": "Walter Cronkite"
64
+ """
65
+ RIGHT_TEXT_EXAMPLE_CONTEXT = """
66
+ "content": "Alan Turing was a pioneering computer scientist and mathematician. He is widely considered to be the father of theoretical computer science and artificial intelligence."
67
+ """
68
+ RIGHT_IMAGE_EXAMPLE_CONTEXT = """
69
+ "headshot": <bytes>
70
+ """
71
+ RIGHT_AUDIO_EXAMPLE_CONTEXT = """
72
+ "podcast": <bytes>
73
+ """
74
+
75
+ ### DISCLAIMERS ###
76
+ IMAGE_DISCLAIMER = """
77
+ \n<image content provided here; assume in this example the image shows Ada Lovelace wearing a hat on top of her hair>
78
+ """
79
+ AUDIO_DISCLAIMER = """
80
+ \n<audio content provided here; assume in this example the recording is about Ada Lovelace's upbringing in London>
81
+ """
82
+ RIGHT_IMAGE_DISCLAIMER = """
83
+ \n<image content provided here; assume in this example the image shows Alan Turing working at his desk>
84
+ """
85
+ RIGHT_AUDIO_DISCLAIMER = """
86
+ \n<audio content provided here; assume in this example the podcast is discussing Alan Turing's work on the Enigma code>
87
+ """
88
+
89
+ ### EXAMPLE REASONINGS ###
90
+ TEXT_EXAMPLE_REASONING = """The text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815."""
91
+ IMAGE_EXAMPLE_REASONING = """The image shows hair on top of the scientist's head, so the is_bald field should be false."""
92
+ AUDIO_EXAMPLE_REASONING = """The newscast recording discusses Ada Lovelace's upbringing in London, so the birthplace field should be "London"."""
93
+ FILTER_EXAMPLE_REASONING = """Ada Lovelace is a foundational computer scientist, therefore the answer is TRUE."""
94
+ JOIN_EXAMPLE_REASONING = """The subject of the left record is Ada Lovelace and the subject of the right record is Alan Turing. Since both inputs are about computer scientists, they satisfy the join condition. Therefore, the answer is TRUE."""
95
+
96
+ ### EXAMPLE ANSWERS ###
97
+ TEXT_EXAMPLE_ANSWER = """
98
+ "name": "Augusta Ada King",
99
+ "birth_year": 1815
100
+ """
101
+ IMAGE_EXAMPLE_ANSWER = """
102
+ "is_bald": false,
103
+ """
104
+ AUDIO_EXAMPLE_ANSWER = """
105
+ "birthplace": "London",
106
+ """
107
+ TEXT_SENTENCE_EXAMPLE_ANSWER = """the text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815."""
108
+ IMAGE_SENTENCE_EXAMPLE_ANSWER = """The image shows hair on top of the woman's head, so the is_bald field should be false."""
109
+ AUDIO_SENTENCE_EXAMPLE_ANSWER = """The newscast recording discusses Ada Lovelace's upbringing in London, so her birthplace is "London"."""
@@ -146,7 +146,7 @@ class OpSet:
146
146
  input = []
147
147
  max_quality_record_set = self.pick_highest_quality_output(record_sets)
148
148
  for record in max_quality_record_set:
149
- input.append(record if record.passed_operator else None)
149
+ input.append(record if record._passed_operator else None)
150
150
 
151
151
  self.source_indices_to_inputs[source_idx] = input
152
152
 
@@ -182,7 +182,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
182
182
  elif isinstance(op, LLMFilter):
183
183
  filter_str = op.filter_obj.filter_condition
184
184
  input_record: DataRecord = record_set.input
185
- output = record_set.data_records[0].passed_operator
185
+ output = record_set.data_records[0]._passed_operator
186
186
  full_hash = f"{filter_str}{hash(input_record)}"
187
187
  if full_hash not in full_hashes:
188
188
  full_hash_to_bool_output[full_hash] = output
@@ -195,7 +195,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
195
195
  for left_idx, left_input_record in enumerate(record_set.input[0]):
196
196
  for right_idx, right_input_record in enumerate(record_set.input[1]):
197
197
  record_idx = left_idx * len(record_set.input[1]) + right_idx
198
- output = record_set.data_records[record_idx].passed_operator
198
+ output = record_set.data_records[record_idx]._passed_operator
199
199
  full_hash = f"{condition}{hash(left_input_record)}{hash(right_input_record)}"
200
200
  if full_hash not in full_hashes:
201
201
  full_hash_to_bool_output[full_hash] = output
@@ -246,7 +246,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
246
246
  elif isinstance(op, LLMFilter):
247
247
  filter_str = op.filter_obj.filter_condition
248
248
  input_record: DataRecord = record_set.input
249
- output = record_set.data_records[0].passed_operator
249
+ output = record_set.data_records[0]._passed_operator
250
250
  full_hash = f"{filter_str}{hash(input_record)}"
251
251
  if output == full_hash_to_bool_output[full_hash]:
252
252
  record_set.record_op_stats[0].quality = full_hash_to_score[full_hash]
@@ -258,7 +258,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
258
258
  for left_idx, left_input_record in enumerate(record_set.input[0]):
259
259
  for right_idx, right_input_record in enumerate(record_set.input[1]):
260
260
  record_idx = left_idx * len(record_set.input[1]) + right_idx
261
- output = record_set.data_records[record_idx].passed_operator
261
+ output = record_set.data_records[record_idx]._passed_operator
262
262
  full_hash = f"{condition}{hash(left_input_record)}{hash(right_input_record)}"
263
263
  if output == full_hash_to_bool_output[full_hash]:
264
264
  record_set.record_op_stats[record_idx].quality = full_hash_to_score[full_hash]
@@ -608,11 +608,10 @@ class OpFrontier:
608
608
  input = []
609
609
  max_quality_record_set = self.pick_highest_quality_output(record_sets)
610
610
  for record in max_quality_record_set:
611
- input.append(record if record.passed_operator else None)
611
+ input.append(record if record._passed_operator else None)
612
612
 
613
613
  self.source_indices_to_inputs[source_unique_logical_op_id][source_indices] = input
614
614
 
615
-
616
615
  class MABExecutionStrategy(SentinelExecutionStrategy):
617
616
  """
618
617
  This class implements the Multi-Armed Bandit (MAB) execution strategy for SentinelQueryProcessors.
@@ -61,8 +61,8 @@ class ParallelExecutionStrategy(ExecutionStrategy):
61
61
  output = future.result()
62
62
  record_set, num_inputs_processed = output if self.is_join_op[unique_full_op_id] else (output, 1)
63
63
 
64
- # record set can be None if one side of join has no input records yet
65
- if record_set is None:
64
+ # record set can be empty if one side of join has no input records yet
65
+ if len(record_set) == 0:
66
66
  continue
67
67
 
68
68
  # otherwise, process records and their stats
@@ -77,7 +77,7 @@ class ParallelExecutionStrategy(ExecutionStrategy):
77
77
  plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
78
78
 
79
79
  # add records which aren't filtered to the output records
80
- output_records.extend([record for record in records if record.passed_operator])
80
+ output_records.extend([record for record in records if record._passed_operator])
81
81
 
82
82
  # update the progress manager
83
83
  if total_inputs_processed > 0:
@@ -52,7 +52,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
52
52
  record_set = operator(candidates=input_queues[unique_full_op_id][source_unique_full_op_id])
53
53
  records = record_set.data_records
54
54
  record_op_stats = record_set.record_op_stats
55
- num_outputs = sum(record.passed_operator for record in records)
55
+ num_outputs = sum(record._passed_operator for record in records)
56
56
 
57
57
  # update the progress manager
58
58
  self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -70,7 +70,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
70
70
  record_set, num_inputs_processed = operator(left_input_records, right_input_records)
71
71
  records = record_set.data_records
72
72
  record_op_stats = record_set.record_op_stats
73
- num_outputs = sum(record.passed_operator for record in records)
73
+ num_outputs = sum(record._passed_operator for record in records)
74
74
 
75
75
  # update the progress manager
76
76
  self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -82,7 +82,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
82
82
  record_set = operator(input_record)
83
83
  records.extend(record_set.data_records)
84
84
  record_op_stats.extend(record_set.record_op_stats)
85
- num_outputs = sum(record.passed_operator for record in record_set.data_records)
85
+ num_outputs = sum(record._passed_operator for record in record_set.data_records)
86
86
 
87
87
  # update the progress manager
88
88
  self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -95,7 +95,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
95
95
  plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
96
96
 
97
97
  # update next input_queue (if it exists)
98
- output_records = [record for record in records if record.passed_operator]
98
+ output_records = [record for record in records if record._passed_operator]
99
99
  next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
100
100
  if next_unique_full_op_id is not None:
101
101
  input_queues[next_unique_full_op_id][unique_full_op_id] = output_records
@@ -207,7 +207,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
207
207
  record_set = operator(candidates=input_records)
208
208
  records = record_set.data_records
209
209
  record_op_stats = record_set.record_op_stats
210
- num_outputs = sum(record.passed_operator for record in records)
210
+ num_outputs = sum(record._passed_operator for record in records)
211
211
 
212
212
  # update the progress manager
213
213
  self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -225,7 +225,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
225
225
  record_set, num_inputs_processed = operator(left_input_records, right_input_records)
226
226
  records = record_set.data_records
227
227
  record_op_stats = record_set.record_op_stats
228
- num_outputs = sum(record.passed_operator for record in records)
228
+ num_outputs = sum(record._passed_operator for record in records)
229
229
 
230
230
  # update the progress manager
231
231
  self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -237,7 +237,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
237
237
  record_set = operator(input_record)
238
238
  records = record_set.data_records
239
239
  record_op_stats = record_set.record_op_stats
240
- num_outputs = sum(record.passed_operator for record in records)
240
+ num_outputs = sum(record._passed_operator for record in records)
241
241
 
242
242
  # update the progress manager
243
243
  self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
@@ -246,7 +246,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
246
246
  plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
247
247
 
248
248
  # update next input_queue or final_output_records
249
- output_records = [record for record in records if record.passed_operator]
249
+ output_records = [record for record in records if record._passed_operator]
250
250
  next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
251
251
  if next_unique_full_op_id is not None:
252
252
  input_queues[next_unique_full_op_id][unique_full_op_id].extend(output_records)
@@ -101,7 +101,7 @@ def get_json_from_answer(answer: str, model: Model, cardinality: Cardinality) ->
101
101
  # TODO: make sure answer parsing works with custom prompts / parsers (can defer this)
102
102
  class Generator(Generic[ContextType, InputType]):
103
103
  """
104
- Abstract base class for Generators.
104
+ Class for generating new fields for a record using an LLM.
105
105
  """
106
106
 
107
107
  def __init__(
@@ -181,11 +181,11 @@ class Generator(Generic[ContextType, InputType]):
181
181
 
182
182
  return None
183
183
 
184
- def _check_bool_answer_text(self, answer_text: str) -> dict | None:
184
+ def _check_bool_answer_text(self, answer_text: str, throw_exception: bool=False) -> dict | None:
185
185
  """
186
186
  Return {"passed_operator": True} if and only if "true" is in the answer text.
187
187
  Return {"passed_operator": False} if and only if "false" is in the answer text.
188
- Otherwise, return None.
188
+ Otherwise, return None, or raise an exception if throw_exception is True.
189
189
  """
190
190
  # NOTE: we may be able to eliminate this condition by specifying this JSON output in the prompt;
191
191
  # however, that would also need to coincide with a change to allow the parse_answer_fn to set "passed_operator"
@@ -194,6 +194,9 @@ class Generator(Generic[ContextType, InputType]):
194
194
  elif "false" in answer_text.lower():
195
195
  return {"passed_operator": False}
196
196
 
197
+ if throw_exception:
198
+ raise Exception(f"Could not parse answer from completion text: {answer_text}")
199
+
197
200
  return None
198
201
 
199
202
  def _parse_convert_answer(self, completion_text: str, fields: dict[str, FieldInfo], json_output: bool) -> dict[str, list]:
@@ -235,7 +238,7 @@ class Generator(Generic[ContextType, InputType]):
235
238
 
236
239
  return self._check_convert_answer_text(completion_text, fields, throw_exception=True)
237
240
 
238
- def _parse_bool_answer(self, completion_text: str) -> dict[str, list]:
241
+ def _parse_bool_answer(self, completion_text: str, json_output: bool) -> dict[str, list]:
239
242
  """Extract the answer from the completion object for filter and join operations."""
240
243
  # if the model followed the default instructions, the completion text will place
241
244
  # its answer between "ANSWER:" and "---"
@@ -243,6 +246,12 @@ class Generator(Generic[ContextType, InputType]):
243
246
  matches = regex.findall(completion_text)
244
247
  if len(matches) > 0:
245
248
  answer_text = matches[0].strip()
249
+
250
+ # if we don't expect a JSON output, return the answer text as is
251
+ if not json_output:
252
+ return answer_text
253
+
254
+ # otherwise, try to parse the answer text into a JSON object
246
255
  field_answers = self._check_bool_answer_text(answer_text)
247
256
  if field_answers is not None:
248
257
  return field_answers
@@ -252,16 +261,21 @@ class Generator(Generic[ContextType, InputType]):
252
261
  matches = regex.findall(completion_text)
253
262
  if len(matches) > 0:
254
263
  answer_text = matches[0].strip()
264
+
265
+ # if we don't expect a JSON output, return the answer text as is
266
+ if not json_output:
267
+ return answer_text
268
+
269
+ # otherwise, try to parse the answer text into a JSON object
255
270
  field_answers = self._check_bool_answer_text(answer_text)
256
271
  if field_answers is not None:
257
272
  return field_answers
258
273
 
259
- # finally, try taking all of the text; throw an exception if this doesn't work
260
- field_answers = self._check_bool_answer_text(completion_text)
261
- if field_answers is None:
262
- raise Exception(f"Could not parse answer from completion text: {completion_text}")
274
+ # finally, try taking all of the text; for JSON output, throw an exception if parsing fails
275
+ if not json_output:
276
+ return completion_text
263
277
 
264
- return field_answers
278
+ return self._check_bool_answer_text(completion_text, throw_exception=True)
265
279
 
266
280
  def _parse_answer(self, completion_text: str, fields: dict[str, FieldInfo] | None, json_output: bool, **kwargs) -> dict[str, list]:
267
281
  """Extract the answer from the completion object."""
@@ -275,8 +289,8 @@ class Generator(Generic[ContextType, InputType]):
275
289
 
276
290
  # extract the per-field answers from the completion text
277
291
  field_answers = (
278
- self._parse_bool_answer(completion_text)
279
- if self.prompt_strategy.is_bool_prompt() or self.prompt_strategy.is_join_prompt()
292
+ self._parse_bool_answer(completion_text, json_output)
293
+ if self.prompt_strategy.is_filter_prompt() or self.prompt_strategy.is_join_prompt()
280
294
  else self._parse_convert_answer(completion_text, fields, json_output)
281
295
  )
282
296
 
@@ -299,6 +313,7 @@ class Generator(Generic[ContextType, InputType]):
299
313
 
300
314
  # generate a list of messages which can be used to construct a payload
301
315
  messages = self.prompt_factory.create_messages(candidate, fields, right_candidate, **kwargs)
316
+ is_audio_op = any(msg.get("type") == "input_audio" for msg in messages)
302
317
 
303
318
  # generate the text completion
304
319
  start_time = time.time()
@@ -307,7 +322,7 @@ class Generator(Generic[ContextType, InputType]):
307
322
  completion_kwargs = {}
308
323
  if not self.model.is_o_model() and not self.model.is_gpt_5_model():
309
324
  completion_kwargs = {"temperature": kwargs.get("temperature", 0.0), **completion_kwargs}
310
- if self.prompt_strategy.is_audio_prompt():
325
+ if is_audio_op:
311
326
  completion_kwargs = {"modalities": ["text"], **completion_kwargs}
312
327
  if self.model.is_reasoning_model():
313
328
  if self.model.is_vertex_model():
@@ -330,11 +345,10 @@ class Generator(Generic[ContextType, InputType]):
330
345
  # if there's an error generating the completion, we have to return an empty answer
331
346
  # and can only account for the time spent performing the failed generation
332
347
  except Exception as e:
333
- print(f"Error generating completion: {e}")
334
348
  logger.error(f"Error generating completion: {e}")
335
349
  field_answers = (
336
350
  {"passed_operator": False}
337
- if self.prompt_strategy.is_bool_prompt() or self.prompt_strategy.is_join_prompt()
351
+ if self.prompt_strategy.is_filter_prompt() or self.prompt_strategy.is_join_prompt()
338
352
  else {field_name: None for field_name in fields}
339
353
  )
340
354
  reasoning = None
@@ -360,7 +374,7 @@ class Generator(Generic[ContextType, InputType]):
360
374
  # for now, we only use tokens from prompt_token_details if it's an audio prompt
361
375
  # get output tokens (all text) and input tokens by modality
362
376
  output_tokens = usage["completion_tokens"]
363
- if self.prompt_strategy.is_audio_prompt():
377
+ if is_audio_op:
364
378
  input_audio_tokens = usage["prompt_tokens_details"].get("audio_tokens", 0)
365
379
  input_text_tokens = usage["prompt_tokens_details"].get("text_tokens", 0)
366
380
  input_image_tokens = 0
@@ -413,9 +427,9 @@ class Generator(Generic[ContextType, InputType]):
413
427
 
414
428
  # parse field answers
415
429
  field_answers = None
416
- if fields is not None and (self.prompt_strategy.is_bool_prompt() or self.prompt_strategy.is_join_prompt()):
430
+ if fields is not None and (self.prompt_strategy.is_filter_prompt() or self.prompt_strategy.is_join_prompt()):
417
431
  field_answers = {"passed_operator": False}
418
- elif fields is not None and not (self.prompt_strategy.is_bool_prompt() or self.prompt_strategy.is_join_prompt()):
432
+ elif fields is not None and not (self.prompt_strategy.is_filter_prompt() or self.prompt_strategy.is_join_prompt()):
419
433
  field_answers = {field_name: None for field_name in fields}
420
434
  try:
421
435
  field_answers = self._parse_answer(completion_text, fields, json_output, **kwargs)
@@ -6,6 +6,8 @@ from palimpzest.query.operators.convert import ConvertOp as _ConvertOp
6
6
  from palimpzest.query.operators.convert import LLMConvert as _LLMConvert
7
7
  from palimpzest.query.operators.convert import LLMConvertBonded as _LLMConvertBonded
8
8
  from palimpzest.query.operators.convert import NonLLMConvert as _NonLLMConvert
9
+ from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert as _CritiqueAndRefineConvert
10
+ from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineFilter as _CritiqueAndRefineFilter
9
11
  from palimpzest.query.operators.distinct import DistinctOp as _DistinctOp
10
12
  from palimpzest.query.operators.filter import FilterOp as _FilterOp
11
13
  from palimpzest.query.operators.filter import LLMFilter as _LLMFilter
@@ -46,12 +48,17 @@ from palimpzest.query.operators.logical import (
46
48
  from palimpzest.query.operators.logical import (
47
49
  RetrieveScan as _RetrieveScan,
48
50
  )
49
- from palimpzest.query.operators.mixture_of_agents_convert import MixtureOfAgentsConvert as _MixtureOfAgentsConvert
51
+ from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert as _MixtureOfAgentsConvert
52
+ from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsFilter as _MixtureOfAgentsFilter
50
53
  from palimpzest.query.operators.physical import PhysicalOperator as _PhysicalOperator
51
54
  from palimpzest.query.operators.project import ProjectOp as _ProjectOp
55
+ from palimpzest.query.operators.rag import RAGConvert as _RAGConvert
56
+ from palimpzest.query.operators.rag import RAGFilter as _RAGFilter
52
57
  from palimpzest.query.operators.retrieve import RetrieveOp as _RetrieveOp
53
58
  from palimpzest.query.operators.scan import MarshalAndScanDataOp as _MarshalAndScanDataOp
54
59
  from palimpzest.query.operators.scan import ScanPhysicalOp as _ScanPhysicalOp
60
+ from palimpzest.query.operators.split import SplitConvert as _SplitConvert
61
+ from palimpzest.query.operators.split import SplitFilter as _SplitFilter
55
62
 
56
63
  LOGICAL_OPERATORS = [
57
64
  _LogicalOperator,
@@ -72,6 +79,8 @@ PHYSICAL_OPERATORS = (
72
79
  [_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp]
73
80
  # convert
74
81
  + [_ConvertOp, _NonLLMConvert, _LLMConvert, _LLMConvertBonded]
82
+ # critique and refine
83
+ + [_CritiqueAndRefineConvert, _CritiqueAndRefineFilter]
75
84
  # distinct
76
85
  + [_DistinctOp]
77
86
  # scan
@@ -83,13 +92,17 @@ PHYSICAL_OPERATORS = (
83
92
  # limit
84
93
  + [_LimitScanOp]
85
94
  # mixture-of-agents
86
- + [_MixtureOfAgentsConvert]
95
+ + [_MixtureOfAgentsConvert, _MixtureOfAgentsFilter]
87
96
  # physical
88
97
  + [_PhysicalOperator]
89
98
  # project
90
99
  + [_ProjectOp]
100
+ # rag
101
+ + [_RAGConvert, _RAGFilter]
91
102
  # retrieve
92
103
  + [_RetrieveOp]
104
+ # split
105
+ + [_SplitConvert, _SplitFilter]
93
106
  )
94
107
 
95
108
  __all__ = [