palimpzest 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/constants.py +38 -62
- palimpzest/core/data/dataset.py +1 -1
- palimpzest/core/data/iter_dataset.py +5 -5
- palimpzest/core/elements/groupbysig.py +1 -1
- palimpzest/core/elements/records.py +91 -109
- palimpzest/core/lib/schemas.py +23 -0
- palimpzest/core/models.py +3 -3
- palimpzest/prompts/__init__.py +2 -6
- palimpzest/prompts/convert_prompts.py +10 -66
- palimpzest/prompts/critique_and_refine_prompts.py +66 -0
- palimpzest/prompts/filter_prompts.py +8 -46
- palimpzest/prompts/join_prompts.py +12 -75
- palimpzest/prompts/{moa_aggregator_convert_prompts.py → moa_aggregator_prompts.py} +51 -2
- palimpzest/prompts/moa_proposer_prompts.py +87 -0
- palimpzest/prompts/prompt_factory.py +351 -479
- palimpzest/prompts/split_merge_prompts.py +51 -2
- palimpzest/prompts/split_proposer_prompts.py +48 -16
- palimpzest/prompts/utils.py +109 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
- palimpzest/query/execution/execution_strategy.py +4 -4
- palimpzest/query/execution/mab_execution_strategy.py +47 -23
- palimpzest/query/execution/parallel_execution_strategy.py +3 -3
- palimpzest/query/execution/single_threaded_execution_strategy.py +8 -8
- palimpzest/query/generators/generators.py +31 -17
- palimpzest/query/operators/__init__.py +15 -2
- palimpzest/query/operators/aggregate.py +21 -19
- palimpzest/query/operators/compute.py +6 -8
- palimpzest/query/operators/convert.py +12 -37
- palimpzest/query/operators/critique_and_refine.py +194 -0
- palimpzest/query/operators/distinct.py +7 -7
- palimpzest/query/operators/filter.py +13 -25
- palimpzest/query/operators/join.py +321 -192
- palimpzest/query/operators/limit.py +4 -4
- palimpzest/query/operators/mixture_of_agents.py +246 -0
- palimpzest/query/operators/physical.py +25 -2
- palimpzest/query/operators/project.py +4 -4
- palimpzest/query/operators/{rag_convert.py → rag.py} +202 -5
- palimpzest/query/operators/retrieve.py +10 -9
- palimpzest/query/operators/scan.py +9 -10
- palimpzest/query/operators/search.py +18 -24
- palimpzest/query/operators/split.py +321 -0
- palimpzest/query/optimizer/__init__.py +12 -8
- palimpzest/query/optimizer/optimizer.py +12 -10
- palimpzest/query/optimizer/rules.py +201 -108
- palimpzest/query/optimizer/tasks.py +18 -6
- palimpzest/query/processor/config.py +2 -2
- palimpzest/query/processor/query_processor.py +2 -2
- palimpzest/query/processor/query_processor_factory.py +9 -5
- palimpzest/validator/validator.py +7 -9
- {palimpzest-0.8.1.dist-info → palimpzest-0.8.3.dist-info}/METADATA +3 -8
- palimpzest-0.8.3.dist-info/RECORD +95 -0
- palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -216
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -75
- palimpzest/prompts/util_phrases.py +0 -19
- palimpzest/query/operators/critique_and_refine_convert.py +0 -113
- palimpzest/query/operators/mixture_of_agents_convert.py +0 -140
- palimpzest/query/operators/split_convert.py +0 -170
- palimpzest-0.8.1.dist-info/RECORD +0 -95
- {palimpzest-0.8.1.dist-info → palimpzest-0.8.3.dist-info}/WHEEL +0 -0
- {palimpzest-0.8.1.dist-info → palimpzest-0.8.3.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.8.1.dist-info → palimpzest-0.8.3.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""This file contains prompts for SplitConvert aggregator operations."""
|
|
2
2
|
|
|
3
3
|
### SYSTEM PROMPTS ###
|
|
4
|
-
|
|
4
|
+
MAP_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
|
|
5
5
|
You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
|
|
6
6
|
It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
|
|
7
7
|
|
|
@@ -34,8 +34,38 @@ ANSWER:
|
|
|
34
34
|
---
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
|
+
FILTER_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to answer a TRUE/FALSE question.
|
|
38
|
+
You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single TRUE/FALSE answer.
|
|
39
|
+
It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
|
|
40
|
+
|
|
41
|
+
You will be provided with a description of each input field and the filter condition.
|
|
42
|
+
|
|
43
|
+
Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
|
|
44
|
+
|
|
45
|
+
An example is shown below:
|
|
46
|
+
---
|
|
47
|
+
CHUNK 1 OUTPUT: The context describes Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, who is widely recognized as a foundational figure in computer science. Therefore, the answer is TRUE.
|
|
48
|
+
|
|
49
|
+
CHUNK 2 OUTPUT: Based on the context provided, Ada Lovelace is indeed a foundational computer scientist, therefore the answer is TRUE.
|
|
50
|
+
|
|
51
|
+
INPUT FIELDS:
|
|
52
|
+
- text: a text passage describing a scientist
|
|
53
|
+
- birthday: the scientist's birthday
|
|
54
|
+
- image: an image of the scientist
|
|
55
|
+
- recording: an audio recording of a newscast about the scientist's contributions to their field
|
|
56
|
+
|
|
57
|
+
FILTER CONDITION: The subject of the input is a foundational computer scientist.
|
|
58
|
+
|
|
59
|
+
Let's think step-by-step in order to answer the question.
|
|
60
|
+
|
|
61
|
+
REASONING: Looking at both chunk outputs, they agree that the subject is a foundational computer scientist. Both outputs provide consistent evidence supporting this conclusion.
|
|
62
|
+
|
|
63
|
+
ANSWER: TRUE
|
|
64
|
+
---
|
|
65
|
+
"""
|
|
66
|
+
|
|
37
67
|
### USER / INSTANCE-SPECIFIC PROMPTS ###
|
|
38
|
-
|
|
68
|
+
MAP_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
|
|
39
69
|
You will be presented with one or more outputs produced by a set of models. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
|
|
40
70
|
It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
|
|
41
71
|
|
|
@@ -54,3 +84,22 @@ OUTPUT FIELDS:
|
|
|
54
84
|
Let's think step-by-step in order to answer the question.
|
|
55
85
|
|
|
56
86
|
REASONING: """
|
|
87
|
+
|
|
88
|
+
FILTER_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to answer a TRUE/FALSE question.
|
|
89
|
+
You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single TRUE/FALSE answer.
|
|
90
|
+
It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
|
|
91
|
+
|
|
92
|
+
You will be provided with a description of each input field and the filter condition.
|
|
93
|
+
|
|
94
|
+
Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
|
|
95
|
+
---
|
|
96
|
+
{chunk_outputs}
|
|
97
|
+
|
|
98
|
+
INPUT FIELDS:
|
|
99
|
+
{input_fields_desc}
|
|
100
|
+
|
|
101
|
+
FILTER CONDITION: {filter_condition}
|
|
102
|
+
|
|
103
|
+
Let's think step-by-step in order to answer the question.
|
|
104
|
+
|
|
105
|
+
REASONING: """
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
"""This file contains prompts for
|
|
1
|
+
"""This file contains prompts for SplitAndMerge operations."""
|
|
2
2
|
|
|
3
|
-
###
|
|
4
|
-
|
|
5
|
-
You will be presented with a context and a set of output fields to generate. Your task is to generate a
|
|
3
|
+
### SYSTEM PROMPTS ###
|
|
4
|
+
MAP_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
|
|
5
|
+
You will be presented with a context and a set of output fields to generate. Your task is to generate a detailed and succinct analysis describing what you believe is the correct value for each output field.
|
|
6
6
|
Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
|
|
7
7
|
|
|
8
8
|
You will be provided with a description of each input field and each output field.
|
|
@@ -16,7 +16,7 @@ OUTPUT FIELDS:
|
|
|
16
16
|
{example_output_fields}
|
|
17
17
|
|
|
18
18
|
CONTEXT:
|
|
19
|
-
{example_context}
|
|
19
|
+
{{{example_context}}}{image_disclaimer}{audio_disclaimer}
|
|
20
20
|
|
|
21
21
|
Let's think step-by-step in order to answer the question.
|
|
22
22
|
|
|
@@ -24,7 +24,30 @@ ANSWER: {example_answer}
|
|
|
24
24
|
---
|
|
25
25
|
"""
|
|
26
26
|
|
|
27
|
-
|
|
27
|
+
FILTER_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
|
|
28
|
+
You will be presented with a context and a filter condition. Your task is to generate a detailed and succinct analysis describing whether you believe the input satisfies the filter condition.
|
|
29
|
+
Be sure to cite information from the context as evidence of why your determination is correct. Do not hallucinate evidence.
|
|
30
|
+
|
|
31
|
+
You will be provided with a description of each input field.
|
|
32
|
+
|
|
33
|
+
An example is shown below:
|
|
34
|
+
---
|
|
35
|
+
INPUT FIELDS:
|
|
36
|
+
{example_input_fields}
|
|
37
|
+
|
|
38
|
+
CONTEXT:
|
|
39
|
+
{{{example_context}}}{image_disclaimer}{audio_disclaimer}
|
|
40
|
+
|
|
41
|
+
FILTER CONDITION: {example_filter_condition}
|
|
42
|
+
|
|
43
|
+
Let's think step-by-step in order to answer the question.
|
|
44
|
+
|
|
45
|
+
ANSWER: {example_answer}
|
|
46
|
+
---
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
### USER / INSTANCE-SPECIFIC PROMPTS ###
|
|
50
|
+
MAP_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
|
|
28
51
|
You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
|
|
29
52
|
Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
|
|
30
53
|
{desc_section}
|
|
@@ -37,19 +60,28 @@ OUTPUT FIELDS:
|
|
|
37
60
|
{output_fields_desc}
|
|
38
61
|
|
|
39
62
|
CONTEXT:
|
|
40
|
-
{context}
|
|
63
|
+
{context}<<image-audio-placeholder>>
|
|
41
64
|
|
|
42
65
|
Let's think step-by-step in order to answer the question.
|
|
43
66
|
|
|
44
67
|
ANSWER: """
|
|
45
68
|
|
|
69
|
+
FILTER_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
|
|
70
|
+
You will be presented with a context and a filter condition. Your task is to generate a detailed and succinct analysis describing whether you believe the input satisfies the filter condition.
|
|
71
|
+
Be sure to cite information from the context as evidence of why your determination is correct. Do not hallucinate evidence.
|
|
72
|
+
{desc_section}
|
|
73
|
+
You will be provided with a description of each input field.
|
|
74
|
+
|
|
75
|
+
An example is shown below:
|
|
76
|
+
---
|
|
77
|
+
INPUT FIELDS:
|
|
78
|
+
{input_fields_desc}
|
|
79
|
+
|
|
80
|
+
CONTEXT:
|
|
81
|
+
{context}<<image-audio-placeholder>>
|
|
82
|
+
|
|
83
|
+
FILTER CONDITION: {filter_condition}
|
|
46
84
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS = """- name: the list of names for each scientist mentioned in the text
|
|
51
|
-
- field_of_study: a list with the field of study for each scientist"""
|
|
52
|
-
SPLIT_PROPOSER_EXAMPLE_CONTEXT = """{{
|
|
53
|
-
"text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, born December 10, 1815 was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation."
|
|
54
|
-
}}"""
|
|
55
|
-
SPLIT_PROPOSER_EXAMPLE_ANSWER = """the text passage mentions the scientists "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and "Charles Babbage", both of whom were mathematicians. Therefore, the name output should be ["Augusta Ada King", "Charles Babbage"] and the field_of_study output should be ["Mathematician", "Mathematician"]."""
|
|
85
|
+
Let's think step-by-step in order to answer the question.
|
|
86
|
+
|
|
87
|
+
ANSWER: """
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""This file contains utility format strings which are templated into many of our prompts."""
|
|
2
|
+
|
|
3
|
+
### FORMATTING INSTRUCTIONS ###
|
|
4
|
+
ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON dictionary. The dictionary should only have the specified output fields."
|
|
5
|
+
ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON list of dictionaries. The list may contain one or more dictionaries, and each dictionary should only have the specified output fields."
|
|
6
|
+
|
|
7
|
+
### USER-PROVIDED DESCRIPTION FOR MAPS / FILTERS / JOINS ###
|
|
8
|
+
DESC_SECTION = """
|
|
9
|
+
The user has additionally provided you with this description of the task you need to perform:
|
|
10
|
+
{desc}
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
### JOB INSTRUCTIONS ###
|
|
14
|
+
MAP_JOB_INSTRUCTION = """analyze input {modalities} in order to produce a JSON object"""
|
|
15
|
+
FILTER_JOB_INSTRUCTION = """analyze input {modalities} in order to answer a TRUE / FALSE question"""
|
|
16
|
+
JOIN_JOB_INSTRUCTION = """analyze input {modalities} in order to determine whether two data records satisfy a join condition"""
|
|
17
|
+
PROPOSER_JOB_INSTRUCTION = """analyze input {modalities} in order to produce an answer to a question"""
|
|
18
|
+
|
|
19
|
+
### FILTER / JOIN CONDITIONS ###
|
|
20
|
+
EXAMPLE_FILTER_CONDITION = "The subject of the input is a foundational computer scientist."
|
|
21
|
+
EXAMPLE_JOIN_CONDITION = "The two inputs are scientists in the same academic field."
|
|
22
|
+
|
|
23
|
+
### EXAMPLE INPUT FIELDS ###
|
|
24
|
+
TEXT_EXAMPLE_INPUT_FIELDS = """
|
|
25
|
+
- text: a text passage describing a scientist
|
|
26
|
+
- birthday: the scientist's birthday
|
|
27
|
+
"""
|
|
28
|
+
IMAGE_EXAMPLE_INPUT_FIELDS = """
|
|
29
|
+
- image: an image of the scientist
|
|
30
|
+
- photographer: the photographer of the image
|
|
31
|
+
"""
|
|
32
|
+
AUDIO_EXAMPLE_INPUT_FIELDS = """
|
|
33
|
+
- recording: an audio recording of a newscast about the scientist's contributions to their field
|
|
34
|
+
- speaker: the speaker in the recording
|
|
35
|
+
"""
|
|
36
|
+
RIGHT_TEXT_EXAMPLE_INPUT_FIELDS = """
|
|
37
|
+
- contents: the contents of a text file
|
|
38
|
+
"""
|
|
39
|
+
RIGHT_IMAGE_EXAMPLE_INPUT_FIELDS = """
|
|
40
|
+
- headshot: a headshot of a famous scientist
|
|
41
|
+
"""
|
|
42
|
+
RIGHT_AUDIO_EXAMPLE_INPUT_FIELDS = """
|
|
43
|
+
- podcast: an audio recording of a podcast about historic scientists
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
### EXAMPLE OUTPUT FIELDS ###
|
|
47
|
+
TEXT_EXAMPLE_OUTPUT_FIELDS = """- name: the name of the scientist
|
|
48
|
+
- birth_year: the year the scientist was born"""
|
|
49
|
+
IMAGE_EXAMPLE_OUTPUT_FIELDS = """- is_bald: true if the scientist is bald and false otherwise"""
|
|
50
|
+
AUDIO_EXAMPLE_OUTPUT_FIELDS = """- birthplace: the city where the scientist was born"""
|
|
51
|
+
|
|
52
|
+
### EXAMPLE CONTEXTS ###
|
|
53
|
+
TEXT_EXAMPLE_CONTEXT = """
|
|
54
|
+
"text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
|
|
55
|
+
"birthday": "December 10, 1815"
|
|
56
|
+
"""
|
|
57
|
+
IMAGE_EXAMPLE_CONTEXT = """
|
|
58
|
+
"image": <bytes>,
|
|
59
|
+
"photographer": "CameraEnthusiast1"
|
|
60
|
+
"""
|
|
61
|
+
AUDIO_EXAMPLE_CONTEXT = """
|
|
62
|
+
"recording": <bytes>,
|
|
63
|
+
"speaker": "Walter Cronkite"
|
|
64
|
+
"""
|
|
65
|
+
RIGHT_TEXT_EXAMPLE_CONTEXT = """
|
|
66
|
+
"content": "Alan Turing was a pioneering computer scientist and mathematician. He is widely considered to be the father of theoretical computer science and artificial intelligence."
|
|
67
|
+
"""
|
|
68
|
+
RIGHT_IMAGE_EXAMPLE_CONTEXT = """
|
|
69
|
+
"headshot": <bytes>
|
|
70
|
+
"""
|
|
71
|
+
RIGHT_AUDIO_EXAMPLE_CONTEXT = """
|
|
72
|
+
"podcast": <bytes>
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
### DISCLAIMERS ###
|
|
76
|
+
IMAGE_DISCLAIMER = """
|
|
77
|
+
\n<image content provided here; assume in this example the image shows Ada Lovelace wearing a hat on top of her hair>
|
|
78
|
+
"""
|
|
79
|
+
AUDIO_DISCLAIMER = """
|
|
80
|
+
\n<audio content provided here; assume in this example the recording is about Ada Lovelace's upbringing in London>
|
|
81
|
+
"""
|
|
82
|
+
RIGHT_IMAGE_DISCLAIMER = """
|
|
83
|
+
\n<image content provided here; assume in this example the image shows Alan Turing working at his desk>
|
|
84
|
+
"""
|
|
85
|
+
RIGHT_AUDIO_DISCLAIMER = """
|
|
86
|
+
\n<audio content provided here; assume in this example the podcast is discussing Alan Turing's work on the Enigma code>
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
### EXAMPLE REASONINGS ###
|
|
90
|
+
TEXT_EXAMPLE_REASONING = """The text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815."""
|
|
91
|
+
IMAGE_EXAMPLE_REASONING = """The image shows hair on top of the scientist's head, so the is_bald field should be false."""
|
|
92
|
+
AUDIO_EXAMPLE_REASONING = """The newscast recording discusses Ada Lovelace's upbringing in London, so the birthplace field should be "London"."""
|
|
93
|
+
FILTER_EXAMPLE_REASONING = """Ada Lovelace is a foundational computer scientist, therefore the answer is TRUE."""
|
|
94
|
+
JOIN_EXAMPLE_REASONING = """The subject of the left record is Ada Lovelace and the subject of the right record is Alan Turing. Since both inputs are about computer scientists, they satisfy the join condition. Therefore, the answer is TRUE."""
|
|
95
|
+
|
|
96
|
+
### EXAMPLE ANSWERS ###
|
|
97
|
+
TEXT_EXAMPLE_ANSWER = """
|
|
98
|
+
"name": "Augusta Ada King",
|
|
99
|
+
"birth_year": 1815
|
|
100
|
+
"""
|
|
101
|
+
IMAGE_EXAMPLE_ANSWER = """
|
|
102
|
+
"is_bald": false,
|
|
103
|
+
"""
|
|
104
|
+
AUDIO_EXAMPLE_ANSWER = """
|
|
105
|
+
"birthplace": "London",
|
|
106
|
+
"""
|
|
107
|
+
TEXT_SENTENCE_EXAMPLE_ANSWER = """the text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815."""
|
|
108
|
+
IMAGE_SENTENCE_EXAMPLE_ANSWER = """The image shows hair on top of the woman's head, so the is_bald field should be false."""
|
|
109
|
+
AUDIO_SENTENCE_EXAMPLE_ANSWER = """The newscast recording discusses Ada Lovelace's upbringing in London, so her birthplace is "London"."""
|
|
@@ -146,7 +146,7 @@ class OpSet:
|
|
|
146
146
|
input = []
|
|
147
147
|
max_quality_record_set = self.pick_highest_quality_output(record_sets)
|
|
148
148
|
for record in max_quality_record_set:
|
|
149
|
-
input.append(record if record.
|
|
149
|
+
input.append(record if record._passed_operator else None)
|
|
150
150
|
|
|
151
151
|
self.source_indices_to_inputs[source_idx] = input
|
|
152
152
|
|
|
@@ -182,7 +182,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
|
|
|
182
182
|
elif isinstance(op, LLMFilter):
|
|
183
183
|
filter_str = op.filter_obj.filter_condition
|
|
184
184
|
input_record: DataRecord = record_set.input
|
|
185
|
-
output = record_set.data_records[0].
|
|
185
|
+
output = record_set.data_records[0]._passed_operator
|
|
186
186
|
full_hash = f"{filter_str}{hash(input_record)}"
|
|
187
187
|
if full_hash not in full_hashes:
|
|
188
188
|
full_hash_to_bool_output[full_hash] = output
|
|
@@ -195,7 +195,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
|
|
|
195
195
|
for left_idx, left_input_record in enumerate(record_set.input[0]):
|
|
196
196
|
for right_idx, right_input_record in enumerate(record_set.input[1]):
|
|
197
197
|
record_idx = left_idx * len(record_set.input[1]) + right_idx
|
|
198
|
-
output = record_set.data_records[record_idx].
|
|
198
|
+
output = record_set.data_records[record_idx]._passed_operator
|
|
199
199
|
full_hash = f"{condition}{hash(left_input_record)}{hash(right_input_record)}"
|
|
200
200
|
if full_hash not in full_hashes:
|
|
201
201
|
full_hash_to_bool_output[full_hash] = output
|
|
@@ -246,7 +246,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
|
|
|
246
246
|
elif isinstance(op, LLMFilter):
|
|
247
247
|
filter_str = op.filter_obj.filter_condition
|
|
248
248
|
input_record: DataRecord = record_set.input
|
|
249
|
-
output = record_set.data_records[0].
|
|
249
|
+
output = record_set.data_records[0]._passed_operator
|
|
250
250
|
full_hash = f"{filter_str}{hash(input_record)}"
|
|
251
251
|
if output == full_hash_to_bool_output[full_hash]:
|
|
252
252
|
record_set.record_op_stats[0].quality = full_hash_to_score[full_hash]
|
|
@@ -258,7 +258,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
|
|
|
258
258
|
for left_idx, left_input_record in enumerate(record_set.input[0]):
|
|
259
259
|
for right_idx, right_input_record in enumerate(record_set.input[1]):
|
|
260
260
|
record_idx = left_idx * len(record_set.input[1]) + right_idx
|
|
261
|
-
output = record_set.data_records[record_idx].
|
|
261
|
+
output = record_set.data_records[record_idx]._passed_operator
|
|
262
262
|
full_hash = f"{condition}{hash(left_input_record)}{hash(right_input_record)}"
|
|
263
263
|
if output == full_hash_to_bool_output[full_hash]:
|
|
264
264
|
record_set.record_op_stats[record_idx].quality = full_hash_to_score[full_hash]
|
|
@@ -2,16 +2,19 @@
|
|
|
2
2
|
import logging
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
|
+
from chromadb.api.models.Collection import Collection
|
|
5
6
|
|
|
6
7
|
from palimpzest.core.data.dataset import Dataset
|
|
7
8
|
from palimpzest.core.elements.records import DataRecord, DataRecordSet
|
|
8
|
-
from palimpzest.core.models import OperatorStats, RecordOpStats, SentinelPlanStats
|
|
9
|
+
from palimpzest.core.models import OperatorCostEstimates, OperatorStats, RecordOpStats, SentinelPlanStats
|
|
9
10
|
from palimpzest.policy import Policy
|
|
10
11
|
from palimpzest.query.execution.execution_strategy import SentinelExecutionStrategy
|
|
11
12
|
from palimpzest.query.operators.aggregate import AggregateOp
|
|
12
|
-
from palimpzest.query.operators.
|
|
13
|
+
from palimpzest.query.operators.convert import LLMConvert
|
|
14
|
+
from palimpzest.query.operators.filter import FilterOp, LLMFilter
|
|
13
15
|
from palimpzest.query.operators.join import JoinOp
|
|
14
16
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
17
|
+
from palimpzest.query.operators.retrieve import RetrieveOp
|
|
15
18
|
from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
|
|
16
19
|
from palimpzest.query.optimizer.plan import SentinelPlan
|
|
17
20
|
from palimpzest.utils.progress import create_progress_manager
|
|
@@ -55,6 +58,17 @@ class OpFrontier:
|
|
|
55
58
|
# store the prior beliefs on operator performance (if provided)
|
|
56
59
|
self.priors = priors
|
|
57
60
|
|
|
61
|
+
# boolean indication of the type of operator in this OpFrontier
|
|
62
|
+
sample_op = op_set[0]
|
|
63
|
+
self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
|
|
64
|
+
self.is_filter_op = isinstance(sample_op, FilterOp)
|
|
65
|
+
self.is_aggregate_op = isinstance(sample_op, AggregateOp)
|
|
66
|
+
self.is_llm_join = isinstance(sample_op, JoinOp)
|
|
67
|
+
is_llm_convert = isinstance(sample_op, LLMConvert)
|
|
68
|
+
is_llm_filter = isinstance(sample_op, LLMFilter)
|
|
69
|
+
is_llm_retrieve = isinstance(sample_op, RetrieveOp) and isinstance(sample_op.index, Collection)
|
|
70
|
+
self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_retrieve or self.is_llm_join
|
|
71
|
+
|
|
58
72
|
# get order in which we will sample physical operators for this logical operator
|
|
59
73
|
sample_op_indices = self._get_op_index_order(op_set, seed)
|
|
60
74
|
|
|
@@ -68,13 +82,6 @@ class OpFrontier:
|
|
|
68
82
|
self.full_op_id_to_sources_not_processed = {op.get_full_op_id(): source_indices for op in op_set}
|
|
69
83
|
self.max_inputs = len(source_indices)
|
|
70
84
|
|
|
71
|
-
# boolean indication of the type of operator in this OpFrontier
|
|
72
|
-
sample_op = op_set[0]
|
|
73
|
-
self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
|
|
74
|
-
self.is_filter_op = isinstance(sample_op, FilterOp)
|
|
75
|
-
self.is_aggregate_op = isinstance(sample_op, AggregateOp)
|
|
76
|
-
self.is_llm_join = isinstance(sample_op, JoinOp)
|
|
77
|
-
|
|
78
85
|
# set the initial inputs for this logical operator; we maintain a mapping from source_unique_logical_op_id --> source_indices --> input;
|
|
79
86
|
# for each unique source and (tuple of) source indices, we store its output, which is an input to this operator
|
|
80
87
|
# for scan operators, we use the default name "source" since these operators have no source
|
|
@@ -149,16 +156,44 @@ class OpFrontier:
|
|
|
149
156
|
|
|
150
157
|
return op_id_to_pareto_distance
|
|
151
158
|
|
|
159
|
+
def _compute_naive_priors(self, op_set: list[PhysicalOperator]) -> dict[str, dict[str, float]]:
|
|
160
|
+
naive_priors = {}
|
|
161
|
+
for op in op_set:
|
|
162
|
+
# use naive cost estimates with dummy source estimates to compute priors
|
|
163
|
+
source_op_estimates = OperatorCostEstimates(quality=1.0, cost_per_record=0.0, time_per_record=0.0, cardinality=100)
|
|
164
|
+
op_estimates = (
|
|
165
|
+
op.naive_cost_estimates(source_op_estimates, source_op_estimates)
|
|
166
|
+
if self.is_llm_join
|
|
167
|
+
else op.naive_cost_estimates(source_op_estimates)
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# get op_id for this operator
|
|
171
|
+
op_id = op.get_op_id()
|
|
172
|
+
|
|
173
|
+
# set the naive quality, cost, and time priors for this operator
|
|
174
|
+
naive_priors[op_id] = {
|
|
175
|
+
"quality": op_estimates.quality,
|
|
176
|
+
"cost": op_estimates.cost_per_record,
|
|
177
|
+
"time": op_estimates.time_per_record,
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
return naive_priors
|
|
181
|
+
|
|
152
182
|
def _get_op_index_order(self, op_set: list[PhysicalOperator], seed: int) -> list[int]:
|
|
153
183
|
"""
|
|
154
184
|
Returns a list of indices for the operators in the op_set.
|
|
155
185
|
"""
|
|
156
|
-
if
|
|
186
|
+
# if this is not an llm-operator, we simply return the indices in random order
|
|
187
|
+
if not self.is_llm_op:
|
|
157
188
|
rng = np.random.default_rng(seed=seed)
|
|
158
189
|
op_indices = np.arange(len(op_set))
|
|
159
190
|
rng.shuffle(op_indices)
|
|
160
191
|
return op_indices
|
|
161
192
|
|
|
193
|
+
# if this is an llm-operator, but we do not have priors, we first compute naive priors
|
|
194
|
+
if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
|
|
195
|
+
self.priors = self._compute_naive_priors(op_set)
|
|
196
|
+
|
|
162
197
|
# NOTE: self.priors is a dictionary with format:
|
|
163
198
|
# {op_id: {"quality": quality, "cost": cost, "time": time}}
|
|
164
199
|
|
|
@@ -215,7 +250,7 @@ class OpFrontier:
|
|
|
215
250
|
op_source_indices_pairs = []
|
|
216
251
|
|
|
217
252
|
# if this operator is not being optimized: we don't request inputs, but simply process what we are given / told to (in the case of scans)
|
|
218
|
-
if not self.
|
|
253
|
+
if not self.is_llm_op and len(self.frontier_ops) == 1:
|
|
219
254
|
return [(self.frontier_ops[0], None)]
|
|
220
255
|
|
|
221
256
|
# otherwise, sample (operator, source_indices) pairs
|
|
@@ -255,16 +290,6 @@ class OpFrontier:
|
|
|
255
290
|
all_inputs.extend(inputs)
|
|
256
291
|
return [(op, tuple(), all_inputs)]
|
|
257
292
|
|
|
258
|
-
# if this is an un-optimized (non-scan, non-join) operator, flatten inputs and run on each one
|
|
259
|
-
elif not self.is_scan_op and not self.is_llm_join and len(self.frontier_ops) == 1:
|
|
260
|
-
op_inputs = []
|
|
261
|
-
op = self.frontier_ops[0]
|
|
262
|
-
for _, source_indices_to_inputs in self.source_indices_to_inputs.items():
|
|
263
|
-
for source_indices, inputs in source_indices_to_inputs.items():
|
|
264
|
-
for input in inputs:
|
|
265
|
-
op_inputs.append((op, source_indices, input))
|
|
266
|
-
return op_inputs
|
|
267
|
-
|
|
268
293
|
### for optimized operators
|
|
269
294
|
# get the list of (op, source_indices) pairs which this operator needs to execute
|
|
270
295
|
op_source_indices_pairs = self._get_op_source_indices_pairs()
|
|
@@ -583,11 +608,10 @@ class OpFrontier:
|
|
|
583
608
|
input = []
|
|
584
609
|
max_quality_record_set = self.pick_highest_quality_output(record_sets)
|
|
585
610
|
for record in max_quality_record_set:
|
|
586
|
-
input.append(record if record.
|
|
611
|
+
input.append(record if record._passed_operator else None)
|
|
587
612
|
|
|
588
613
|
self.source_indices_to_inputs[source_unique_logical_op_id][source_indices] = input
|
|
589
614
|
|
|
590
|
-
|
|
591
615
|
class MABExecutionStrategy(SentinelExecutionStrategy):
|
|
592
616
|
"""
|
|
593
617
|
This class implements the Multi-Armed Bandit (MAB) execution strategy for SentinelQueryProcessors.
|
|
@@ -61,8 +61,8 @@ class ParallelExecutionStrategy(ExecutionStrategy):
|
|
|
61
61
|
output = future.result()
|
|
62
62
|
record_set, num_inputs_processed = output if self.is_join_op[unique_full_op_id] else (output, 1)
|
|
63
63
|
|
|
64
|
-
# record set can be
|
|
65
|
-
if record_set
|
|
64
|
+
# record set can be empty if one side of join has no input records yet
|
|
65
|
+
if len(record_set) == 0:
|
|
66
66
|
continue
|
|
67
67
|
|
|
68
68
|
# otherwise, process records and their stats
|
|
@@ -77,7 +77,7 @@ class ParallelExecutionStrategy(ExecutionStrategy):
|
|
|
77
77
|
plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
|
|
78
78
|
|
|
79
79
|
# add records which aren't filtered to the output records
|
|
80
|
-
output_records.extend([record for record in records if record.
|
|
80
|
+
output_records.extend([record for record in records if record._passed_operator])
|
|
81
81
|
|
|
82
82
|
# update the progress manager
|
|
83
83
|
if total_inputs_processed > 0:
|
|
@@ -52,7 +52,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
52
52
|
record_set = operator(candidates=input_queues[unique_full_op_id][source_unique_full_op_id])
|
|
53
53
|
records = record_set.data_records
|
|
54
54
|
record_op_stats = record_set.record_op_stats
|
|
55
|
-
num_outputs = sum(record.
|
|
55
|
+
num_outputs = sum(record._passed_operator for record in records)
|
|
56
56
|
|
|
57
57
|
# update the progress manager
|
|
58
58
|
self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
|
|
@@ -70,7 +70,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
70
70
|
record_set, num_inputs_processed = operator(left_input_records, right_input_records)
|
|
71
71
|
records = record_set.data_records
|
|
72
72
|
record_op_stats = record_set.record_op_stats
|
|
73
|
-
num_outputs = sum(record.passed_operator for record in records)
|
|
73
|
+
num_outputs = sum(record._passed_operator for record in records)
|
|
74
74
|
|
|
75
75
|
# update the progress manager
|
|
76
76
|
self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
|
|
@@ -82,7 +82,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
82
82
|
record_set = operator(input_record)
|
|
83
83
|
records.extend(record_set.data_records)
|
|
84
84
|
record_op_stats.extend(record_set.record_op_stats)
|
|
85
|
-
num_outputs = sum(record.passed_operator for record in record_set.data_records)
|
|
85
|
+
num_outputs = sum(record._passed_operator for record in record_set.data_records)
|
|
86
86
|
|
|
87
87
|
# update the progress manager
|
|
88
88
|
self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
|
|
@@ -95,7 +95,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
95
95
|
plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
|
|
96
96
|
|
|
97
97
|
# update next input_queue (if it exists)
|
|
98
|
-
output_records = [record for record in records if record.passed_operator]
|
|
98
|
+
output_records = [record for record in records if record._passed_operator]
|
|
99
99
|
next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
|
|
100
100
|
if next_unique_full_op_id is not None:
|
|
101
101
|
input_queues[next_unique_full_op_id][unique_full_op_id] = output_records
|
|
@@ -207,7 +207,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
207
207
|
record_set = operator(candidates=input_records)
|
|
208
208
|
records = record_set.data_records
|
|
209
209
|
record_op_stats = record_set.record_op_stats
|
|
210
|
-
num_outputs = sum(record.passed_operator for record in records)
|
|
210
|
+
num_outputs = sum(record._passed_operator for record in records)
|
|
211
211
|
|
|
212
212
|
# update the progress manager
|
|
213
213
|
self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
|
|
@@ -225,7 +225,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
225
225
|
record_set, num_inputs_processed = operator(left_input_records, right_input_records)
|
|
226
226
|
records = record_set.data_records
|
|
227
227
|
record_op_stats = record_set.record_op_stats
|
|
228
|
-
num_outputs = sum(record.passed_operator for record in records)
|
|
228
|
+
num_outputs = sum(record._passed_operator for record in records)
|
|
229
229
|
|
|
230
230
|
# update the progress manager
|
|
231
231
|
self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
|
|
@@ -237,7 +237,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
237
237
|
record_set = operator(input_record)
|
|
238
238
|
records = record_set.data_records
|
|
239
239
|
record_op_stats = record_set.record_op_stats
|
|
240
|
-
num_outputs = sum(record.passed_operator for record in records)
|
|
240
|
+
num_outputs = sum(record._passed_operator for record in records)
|
|
241
241
|
|
|
242
242
|
# update the progress manager
|
|
243
243
|
self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
|
|
@@ -246,7 +246,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
246
246
|
plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
|
|
247
247
|
|
|
248
248
|
# update next input_queue or final_output_records
|
|
249
|
-
output_records = [record for record in records if record.passed_operator]
|
|
249
|
+
output_records = [record for record in records if record._passed_operator]
|
|
250
250
|
next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
|
|
251
251
|
if next_unique_full_op_id is not None:
|
|
252
252
|
input_queues[next_unique_full_op_id][unique_full_op_id].extend(output_records)
|