palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +343 -209
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +639 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +62 -6
  19. palimpzest/prompts/filter_prompts.py +51 -6
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
  22. palimpzest/prompts/prompt_factory.py +375 -47
  23. palimpzest/prompts/split_proposer_prompts.py +1 -1
  24. palimpzest/prompts/util_phrases.py +5 -0
  25. palimpzest/prompts/validator.py +239 -0
  26. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  27. palimpzest/query/execution/execution_strategy.py +210 -317
  28. palimpzest/query/execution/execution_strategy_type.py +5 -7
  29. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  30. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  31. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  32. palimpzest/query/generators/generators.py +160 -331
  33. palimpzest/query/operators/__init__.py +15 -5
  34. palimpzest/query/operators/aggregate.py +50 -33
  35. palimpzest/query/operators/compute.py +201 -0
  36. palimpzest/query/operators/convert.py +33 -19
  37. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  38. palimpzest/query/operators/distinct.py +62 -0
  39. palimpzest/query/operators/filter.py +26 -16
  40. palimpzest/query/operators/join.py +403 -0
  41. palimpzest/query/operators/limit.py +3 -3
  42. palimpzest/query/operators/logical.py +205 -77
  43. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  44. palimpzest/query/operators/physical.py +27 -21
  45. palimpzest/query/operators/project.py +3 -3
  46. palimpzest/query/operators/rag_convert.py +7 -7
  47. palimpzest/query/operators/retrieve.py +9 -9
  48. palimpzest/query/operators/scan.py +81 -42
  49. palimpzest/query/operators/search.py +524 -0
  50. palimpzest/query/operators/split_convert.py +10 -8
  51. palimpzest/query/optimizer/__init__.py +7 -9
  52. palimpzest/query/optimizer/cost_model.py +108 -441
  53. palimpzest/query/optimizer/optimizer.py +123 -181
  54. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  55. palimpzest/query/optimizer/plan.py +352 -67
  56. palimpzest/query/optimizer/primitives.py +43 -19
  57. palimpzest/query/optimizer/rules.py +484 -646
  58. palimpzest/query/optimizer/tasks.py +127 -58
  59. palimpzest/query/processor/config.py +42 -76
  60. palimpzest/query/processor/query_processor.py +73 -18
  61. palimpzest/query/processor/query_processor_factory.py +46 -38
  62. palimpzest/schemabuilder/schema_builder.py +15 -28
  63. palimpzest/utils/model_helpers.py +32 -77
  64. palimpzest/utils/progress.py +114 -102
  65. palimpzest/validator/__init__.py +0 -0
  66. palimpzest/validator/validator.py +306 -0
  67. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
  68. palimpzest-0.8.1.dist-info/RECORD +95 -0
  69. palimpzest/core/lib/fields.py +0 -141
  70. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  71. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  72. palimpzest/query/generators/api_client_factory.py +0 -30
  73. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  74. palimpzest/query/operators/map.py +0 -130
  75. palimpzest/query/processor/nosentinel_processor.py +0 -33
  76. palimpzest/query/processor/processing_strategy_type.py +0 -28
  77. palimpzest/query/processor/sentinel_processor.py +0 -88
  78. palimpzest/query/processor/streaming_processor.py +0 -149
  79. palimpzest/sets.py +0 -405
  80. palimpzest/utils/datareader_helpers.py +0 -61
  81. palimpzest/utils/demo_helpers.py +0 -75
  82. palimpzest/utils/field_helpers.py +0 -69
  83. palimpzest/utils/generation_helpers.py +0 -69
  84. palimpzest/utils/sandbox.py +0 -183
  85. palimpzest-0.7.21.dist-info/RECORD +0 -95
  86. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
  88. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
  89. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
@@ -27,7 +27,7 @@ ANSWER: {example_answer}
27
27
  COT_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
28
28
  You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
29
29
  Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
30
-
30
+ {desc_section}
31
31
  You will be provided with a description of each input field and each output field.
32
32
  ---
33
33
  INPUT FIELDS:
@@ -12,3 +12,8 @@ REASONING: """
12
12
  COT_ANSWER_INSTRUCTION = """Let's think step-by-step in order to answer the question.
13
13
 
14
14
  ANSWER: """
15
+
16
+ DESC_SECTION = """
17
+ The user has additionally provided you with this description of the task you need to perform:
18
+ {desc}
19
+ """
@@ -0,0 +1,239 @@
1
+ ### MAP ###
2
+ MAP_VALIDATOR_PROMPT = """You are an intelligent judge whose job is to evaluate how successfully an agent executed a given instruction.
3
+ You will be presented with the input(s) provided to the agent followed by the output produced by the agent.
4
+
5
+ Each output will be a dictionary. The keys will be **output fields** which were computed by the agent.
6
+
7
+ Your job will be to assign a score of 1.0 to every output field which was computed correctly, and a score of 0.0 to every output field which was computed incorrectly. If the output for a field is a list, you may give a score in between 0.0 and 1.0 representing the fraction of correct items in the list.
8
+
9
+ Here is an example evaluation:
10
+
11
+ INPUT MESSAGES:
12
+ ---------------
13
+ You are a helpful assistant whose job is to generate a JSON object. You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
14
+ You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
15
+
16
+ INPUT FIELDS:
17
+ - text: a text passage describing a scientist
18
+ - birthday: the scientist's birthday
19
+
20
+ OUTPUT FIELDS:
21
+ - name: the name of the scientist
22
+ - birth_year: the year the scientist was born
23
+
24
+ CONTEXT:
25
+ {{
26
+ "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
27
+ "birthday": "December 10, 1815"
28
+ }}
29
+
30
+ OUTPUT:
31
+ --------
32
+ {{
33
+ "name": "Charles Babbage",
34
+ "birth_year": 1815
35
+ }}
36
+
37
+ EVALUATION: {"name": 0.0, "birth_year": 1.0}
38
+
39
+ Remember, be sure to output your evaluation as a dictionary where each value contains a 0.0 or 1.0 score for each output field (or a score within [0.0, 1.0] for list output fields).
40
+
41
+ INPUT MESSAGES:
42
+ ---------------
43
+
44
+ """
45
+
46
+ MAP_IMAGE_VALIDATOR_PROMPT = """You are an intelligent judge whose job is to evaluate how successfully an agent executed a given instruction.
47
+ You will be presented with the input(s) provided to the agent followed by the output produced by the agent.
48
+
49
+ Each output will be a dictionary. The keys will be **output fields** which were computed by the agent.
50
+
51
+ Your job will be to assign a score of 1.0 to every output field which was computed correctly, and a score of 0.0 to every output field which was computed incorrectly. If the output for a field is a list, you may give a score in between 0.0 and 1.0 representing the fraction of correct items in the list.
52
+
53
+ Here is an example evaluation:
54
+
55
+ INPUT MESSAGES:
56
+ ---------------
57
+ You are a helpful assistant whose job is to analyze input image(s) and/or text in order to produce a JSON object. You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
58
+ You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
59
+
60
+ INPUT FIELDS:
61
+ - image: an image of a scene
62
+ - photographer: the photographer of the image
63
+
64
+ OUTPUT FIELDS:
65
+ - dog_in_image: true if a dog is in the image and false otherwise
66
+ - person_in_image: true if a person is in the image and false otherwise
67
+
68
+ CONTEXT:
69
+ {{
70
+ "image": <bytes>,
71
+ "photographer": "CameraEnthusiast1"
72
+ }}
73
+ <image content provided here; assume in this example the image shows a dog and a cat playing>
74
+
75
+ OUTPUT:
76
+ --------
77
+ {{
78
+ "dog_in_image": true,
79
+ "person_in_image": true
80
+ }}
81
+
82
+ EVALUATION: {"dog_in_image": 1.0, "person_in_image": 0.0}
83
+
84
+ Remember, be sure to output your evaluation as a dictionary where each value contains a 0.0 or 1.0 score for each output field (or a score within [0.0, 1.0] for list output fields).
85
+
86
+ INPUT MESSAGES:
87
+ ---------------
88
+
89
+ """
90
+
91
+
92
+ ### FLAT MAP ###
93
+ FLAT_MAP_VALIDATOR_PROMPT = """You are an intelligent judge whose job is to evaluate how successfully an agent executed a given instruction.
94
+ You will be presented with the input(s) provided to the agent followed by the output(s) produced by the agent.
95
+
96
+ Each output will be a list of dictionaries. The keys of each dictionary will be **output fields** which were computed by the agent.
97
+
98
+ Your job will be to assign a score of 1.0 to every output field which was computed correctly, and a score of 0.0 to every output field which was computed incorrectly. If the output for a field is a list, you may give a score in between 0.0 and 1.0 representing the fraction of correct items in the list.
99
+
100
+ Here is an example evaluation:
101
+
102
+ INPUT MESSAGES:
103
+ ---------------
104
+ You are a helpful assistant whose job is to generate a JSON object. You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
105
+ You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
106
+
107
+ INPUT FIELDS:
108
+ - text: a text passage describing scientists
109
+ - birthdays: text containing birth dates
110
+
111
+ OUTPUT FIELDS:
112
+ - name: the name of the scientist
113
+ - birth_year: the year the scientist was born
114
+
115
+ CONTEXT:
116
+ {{
117
+ "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
118
+ "birthdays": "...Lovelace was born on December 10, 1815, almost exactly 24 years after Babbage's birth on 26 December 1791..."
119
+ }}
120
+
121
+ OUTPUTS:
122
+ --------
123
+ [
124
+ {{
125
+ "name": "Ada Lovelace",
126
+ "birth_year": 1815
127
+ }},
128
+ {{
129
+ "name": "Charles Babbage",
130
+ "birth_year": 1790
131
+ }}
132
+ ]
133
+
134
+ EVALUATION: [{"name": 1.0, "birth_year": 1.0}, {"name": 1.0, "birth_year": 0.0}]
135
+
136
+ Remember, be sure to output your evaluation as a list of dictionaries where each dictionary contains a 0.0 or 1.0 score for each output field (or a score within [0.0, 1.0] for list output fields).
137
+
138
+ INPUT MESSAGES:
139
+ ---------------
140
+
141
+ """
142
+
143
+ FLAT_MAP_IMAGE_VALIDATOR_PROMPT = """You are an intelligent judge whose job is to evaluate how successfully an agent executed a given instruction.
144
+ You will be presented with the input(s) provided to the agent followed by the output(s) produced by the agent.
145
+
146
+ Each output will be a list of dictionaries. The keys of each dictionary will be **output fields** which were computed by the agent.
147
+
148
+ Your job will be to assign a score of 1.0 to every output field which was computed correctly, and a score of 0.0 to every output field which was computed incorrectly. If the output for a field is a list, you may give a score in between 0.0 and 1.0 representing the fraction of correct items in the list.
149
+
150
+ Here is an example evaluation:
151
+
152
+ INPUT MESSAGES:
153
+ ---------------
154
+ You are a helpful assistant whose job is to analyze input image(s) and/or text in order to produce a JSON object. You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
155
+ You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
156
+
157
+ INPUT FIELDS:
158
+ - image: an image of a scene
159
+ - photographer: the photographer of the image
160
+
161
+ OUTPUT FIELDS:
162
+ - animal: the type of animal in the image
163
+ - animal_is_canine: true if the animal is a canine and false otherwise
164
+
165
+ CONTEXT:
166
+ {{
167
+ "image": <bytes>,
168
+ "photographer": "CameraEnthusiast1"
169
+ }}
170
+ <image content provided here; assume in this example the image shows a dog and a cat playing>
171
+
172
+ OUTPUT:
173
+ --------
174
+ [
175
+ {{
176
+ "animal": "dog",
177
+ "animal_is_canine": true
178
+ }},
179
+ {{
180
+ "animal": "cat",
181
+ "animal_is_canine": true
182
+ }}
183
+ ]
184
+
185
+ EVALUATION: [{"animal": 1.0, "animal_is_canine": 1.0}, {"animal": 1.0, "animal_is_canine": 0.0}]
186
+
187
+ Remember, be sure to output your evaluation as a list of dictionaries where each dictionary contains a 0.0 or 1.0 score for each output field (or a score within [0.0, 1.0] for list output fields).
188
+
189
+ INPUT MESSAGES:
190
+ ---------------
191
+
192
+ """
193
+
194
+
195
+ ### RETRIEVE ###
196
+ RETRIEVE_VALIDATOR_PROMPT = """You are an intelligent judge whose job is to evaluate how successfully an agent executed a given instruction.
197
+ You will be presented with the input(s) provided to the agent followed by the output produced by the agent.
198
+
199
+ Each output will be a dictionary. The keys will be **output fields** which were computed by the agent.
200
+
201
+ Your job will be to assign a score of 1.0 to every output field which was computed correctly, and a score of 0.0 to every output field which was computed incorrectly. If the output for a field is a list, you may give a score in between 0.0 and 1.0 representing the fraction of correct items in the list.
202
+
203
+ Here is an example evaluation:
204
+
205
+ INPUT MESSAGES:
206
+ ---------------
207
+ You are a helpful assistant whose job is to generate a JSON object. You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
208
+ You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
209
+
210
+ INPUT FIELDS:
211
+ - text: a text passage describing a scientist
212
+
213
+ OUTPUT FIELDS:
214
+ - related_scientists: list of scientists who perform similar work as the scientist described in the text
215
+
216
+ CONTEXT:
217
+ {{
218
+ "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
219
+ }}
220
+
221
+ OUTPUT:
222
+ --------
223
+ {{
224
+ "related_scientists": [
225
+ "Charles Babbage",
226
+ "Alan Turing",
227
+ "Charles Darwin",
228
+ "John von Neumann",
229
+ ]
230
+ }}
231
+
232
+ EVALUATION: {"related_scientists": 0.75}
233
+
234
+ Remember, be sure to output your evaluation as a dictionary where each value contains a 0.0 or 1.0 score for each output field (or a score within [0.0, 1.0] for list output fields).
235
+
236
+ INPUT MESSAGES:
237
+ ---------------
238
+
239
+ """
@@ -2,13 +2,18 @@ import logging
2
2
 
3
3
  import numpy as np
4
4
 
5
- from palimpzest.core.data.dataclasses import SentinelPlanStats
5
+ from palimpzest.core.data.dataset import Dataset
6
6
  from palimpzest.core.elements.records import DataRecord, DataRecordSet
7
+ from palimpzest.core.models import SentinelPlanStats
7
8
  from palimpzest.query.execution.execution_strategy import SentinelExecutionStrategy
9
+ from palimpzest.query.operators.aggregate import AggregateOp
10
+ from palimpzest.query.operators.filter import FilterOp
11
+ from palimpzest.query.operators.join import JoinOp
8
12
  from palimpzest.query.operators.physical import PhysicalOperator
9
- from palimpzest.query.operators.scan import ScanPhysicalOp
13
+ from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
10
14
  from palimpzest.query.optimizer.plan import SentinelPlan
11
15
  from palimpzest.utils.progress import create_progress_manager
16
+ from palimpzest.validator.validator import Validator
12
17
 
13
18
  logger = logging.getLogger(__name__)
14
19
 
@@ -21,36 +26,79 @@ class OpSet:
21
26
  2. has been sampled fewer than j times
22
27
  """
23
28
 
24
- def __init__(self, op_set: list[PhysicalOperator], source_indices: list[int]):
29
+ def __init__(self, op_set: list[PhysicalOperator], source_unique_logical_op_ids: list[str], source_indices: list[int]):
25
30
  # construct the set of operators
26
31
  self.ops = op_set
27
32
 
28
33
  # store the order in which we will sample the source records
29
34
  self.source_indices = source_indices
30
35
 
36
+ # boolean flags indicating the type of operator in this OpSet
37
+ sample_op = op_set[0]
38
+ self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
39
+ self.is_filter_op = isinstance(sample_op, FilterOp)
40
+ self.is_aggregate_op = isinstance(sample_op, AggregateOp)
41
+ self.is_llm_join = isinstance(sample_op, JoinOp)
42
+
31
43
  # set the initial inputs for this logical operator
32
- is_scan_op = isinstance(op_set[0], ScanPhysicalOp)
33
- self.source_idx_to_input = {source_idx: [source_idx] for source_idx in self.source_indices} if is_scan_op else {}
44
+ self.source_indices_to_inputs = {source_unique_logical_op_id: {} for source_unique_logical_op_id in source_unique_logical_op_ids}
45
+ if self.is_scan_op:
46
+ self.source_indices_to_inputs["source"] = {source_idx: [int(source_idx.split("-")[-1])] for source_idx in self.source_indices}
34
47
 
35
- def get_op_input_pairs(self) -> list[PhysicalOperator, DataRecord | int | None]:
48
+ def get_op_inputs(self) -> list[PhysicalOperator, DataRecord | int | None]:
36
49
  """
37
- Returns the list of frontier operators and their next input to process. If there are
38
- any indices in `source_indices_to_sample` which this operator does not sample on its own, then
39
- we also have this frontier process that source_idx's input with its max quality operator.
50
+ Returns the list of frontier operators and their next input to process.
40
51
  """
41
- # get the list of (op, source_idx) pairs which this operator needs to execute
42
- op_source_idx_pairs = []
52
+ # if this is an aggregate, run on every input
53
+ if self.is_aggregate_op:
54
+ op = self.ops[0]
55
+ all_inputs = []
56
+ for _, source_indices_to_inputs in self.source_indices_to_inputs.items():
57
+ for _, inputs in source_indices_to_inputs.items():
58
+ all_inputs.extend(inputs)
59
+ return [(op, tuple(), all_inputs)]
60
+
61
+ # if this is an un-optimized (non-scan, non-join) operator, flatten inputs and run on each one
62
+ elif not self.is_scan_op and not self.is_llm_join and len(self.ops) == 1:
63
+ op_inputs = []
64
+ op = self.ops[0]
65
+ for _, source_indices_to_inputs in self.source_indices_to_inputs.items():
66
+ for source_indices, inputs in source_indices_to_inputs.items():
67
+ for input in inputs:
68
+ op_inputs.append((op, source_indices, input))
69
+ return op_inputs
70
+
71
+ # get the list of (op, source_indices) pairs which this operator needs to execute
72
+ op_source_indices_pairs = []
43
73
  for op in self.ops:
44
- # construct list of inputs by looking up the input for the given source_idx
45
- for source_idx in self.source_indices:
46
- op_source_idx_pairs.append((op, source_idx))
47
-
48
- # fetch the corresponding (op, input) pairs
49
- op_input_pairs = []
50
- for op, source_idx in op_source_idx_pairs:
51
- op_input_pairs.extend([(op, input_record) for input_record in self.source_idx_to_input[source_idx]])
52
-
53
- return op_input_pairs
74
+ # construct list of inputs by looking up the input for the given source_indices
75
+ for source_indices in self.source_indices:
76
+ op_source_indices_pairs.append((op, source_indices))
77
+
78
+ # construct the op inputs
79
+ op_inputs = []
80
+ if self.is_llm_join:
81
+ left_source_unique_logical_op_id, right_source_unique_logical_op_id = list(self.source_indices_to_inputs)
82
+ left_source_indices_to_inputs = self.source_indices_to_inputs[left_source_unique_logical_op_id]
83
+ right_source_indices_to_inputs = self.source_indices_to_inputs[right_source_unique_logical_op_id]
84
+ for op, source_indices in op_source_indices_pairs:
85
+ left_source_indices = source_indices[0]
86
+ right_source_indices = source_indices[1]
87
+ left_inputs = left_source_indices_to_inputs.get(left_source_indices, [])
88
+ right_inputs = right_source_indices_to_inputs.get(right_source_indices, [])
89
+ if len(left_inputs) > 0 and len(right_inputs) > 0:
90
+ op_inputs.append((op, (left_source_indices, right_source_indices), (left_inputs, right_inputs)))
91
+ return op_inputs
92
+
93
+ # if operator is not a join
94
+ source_unique_logical_op_id = list(self.source_indices_to_inputs)[0]
95
+ op_inputs = [
96
+ (op, source_indices, input)
97
+ for op, source_indices in op_source_indices_pairs
98
+ for input in self.source_indices_to_inputs[source_unique_logical_op_id].get(source_indices, [])
99
+ ]
100
+
101
+ return op_inputs
54
102
 
55
103
  def pick_highest_quality_output(self, record_sets: list[DataRecordSet]) -> DataRecordSet:
56
104
  # if there's only one operator in the set, we return its record_set
@@ -100,75 +148,64 @@ class OpSet:
100
148
  for record in max_quality_record_set:
101
149
  input.append(record if record.passed_operator else None)
102
150
 
103
- self.source_idx_to_input[source_idx] = input
104
-
151
+ self.source_indices_to_inputs[source_idx] = input
105
152
 
106
153
  class AllSamplingExecutionStrategy(SentinelExecutionStrategy):
107
154
 
108
- def _get_source_indices(self):
109
- """Get the list of source indices which the sentinel plan should execute over."""
110
- # create list of all source indices and shuffle it
111
- total_num_samples = len(self.val_datasource)
112
- source_indices = list(np.arange(total_num_samples))
113
-
114
- return source_indices
115
-
116
155
  def _execute_sentinel_plan(self,
117
156
  plan: SentinelPlan,
118
157
  op_sets: dict[str, OpSet],
119
- expected_outputs: dict[int, dict] | None,
158
+ validator: Validator,
120
159
  plan_stats: SentinelPlanStats,
121
160
  ) -> SentinelPlanStats:
122
161
  # execute operator sets in sequence
123
- for op_idx, (logical_op_id, op_set) in enumerate(plan):
162
+ for topo_idx, (logical_op_id, _) in enumerate(plan):
163
+ # compute unique logical op id within plan
164
+ unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
165
+
124
166
  # get frontier ops and their next input
125
- op_input_pairs = op_sets[logical_op_id].get_op_input_pairs()
167
+ op_inputs = op_sets[logical_op_id].get_op_inputs()
126
168
 
127
- # break out of the loop if op_input_pairs is empty, as this means all records have been filtered out
128
- if len(op_input_pairs) == 0:
169
+ # break out of the loop if op_inputs is empty, as this means all records have been filtered out
170
+ if len(op_inputs) == 0:
129
171
  break
130
172
 
131
173
  # run sampled operators on sampled inputs
132
- source_idx_to_record_sets_and_ops, _ = self._execute_op_set(op_input_pairs)
133
-
134
- # FUTURE TODO: have this return the highest quality record set simply based on our posterior (or prior) belief on operator quality
135
- # get the target record set for each source_idx
136
- source_idx_to_target_record_set = self._get_target_record_sets(logical_op_id, source_idx_to_record_sets_and_ops, expected_outputs)
174
+ source_indices_to_record_set_tuples, _ = self._execute_op_set(unique_logical_op_id, op_inputs)
137
175
 
138
- # TODO: make consistent across here and RandomSampling
139
- # FUTURE TODO: move this outside of the loop (i.e. assume we only get quality label(s) after executing full program)
140
176
  # score the quality of each generated output
141
- physical_op_cls = op_set[0].__class__
142
- source_idx_to_record_sets = {
143
- source_idx: list(map(lambda tup: tup[0], record_sets_and_ops))
144
- for source_idx, record_sets_and_ops in source_idx_to_record_sets_and_ops.items()
145
- }
146
- source_idx_to_record_sets = self._score_quality(physical_op_cls, source_idx_to_record_sets, source_idx_to_target_record_set)
147
-
148
- # flatten the lists of records and record_op_stats
149
- all_records, all_record_op_stats = self._flatten_record_sets(source_idx_to_record_sets)
177
+ source_indices_to_all_record_sets = {
178
+ source_indices: [(record_set, op) for record_set, op, _ in record_set_tuples]
179
+ for source_indices, record_set_tuples in source_indices_to_record_set_tuples.items()
180
+ }
181
+ source_indices_to_all_record_sets, val_gen_stats = self._score_quality(validator, source_indices_to_all_record_sets)
182
+
183
+ # remove records that were read from the execution cache before adding to record op stats
184
+ new_record_op_stats = []
185
+ for _, record_set_tuples in source_indices_to_record_set_tuples.items():
186
+ for record_set, _, is_new in record_set_tuples:
187
+ if is_new:
188
+ new_record_op_stats.extend(record_set.record_op_stats)
150
189
 
151
190
  # update plan stats
152
- plan_stats.add_record_op_stats(all_record_op_stats)
153
-
154
- # add records (which are not filtered) to the cache, if allowed
155
- self._add_records_to_cache(logical_op_id, all_records)
156
-
157
- # FUTURE TODO: simply set input based on source_idx_to_target_record_set (b/c we won't have scores computed)
158
- # provide the champion record sets as inputs to the next logical operator
159
- if op_idx + 1 < len(plan):
160
- next_logical_op_id = plan.logical_op_ids[op_idx + 1]
161
- op_sets[next_logical_op_id].update_inputs(source_idx_to_record_sets)
162
-
163
- # close the cache
164
- self._close_cache(plan.logical_op_ids)
191
+ plan_stats.add_record_op_stats(unique_logical_op_id, new_record_op_stats)
192
+ plan_stats.add_validation_gen_stats(unique_logical_op_id, val_gen_stats)
193
+
194
+ # provide the best record sets as inputs to the next logical operator
195
+ next_unique_logical_op_id = plan.get_next_unique_logical_op_id(unique_logical_op_id)
196
+ if next_unique_logical_op_id is not None:
197
+ source_indices_to_all_record_sets = {
198
+ source_indices: [record_set for record_set, _ in record_set_tuples]
199
+ for source_indices, record_set_tuples in source_indices_to_all_record_sets.items()
200
+ }
201
+ op_sets[next_unique_logical_op_id].update_inputs(unique_logical_op_id, source_indices_to_all_record_sets)
165
202
 
166
203
  # finalize plan stats
167
204
  plan_stats.finish()
168
205
 
169
206
  return plan_stats
170
207
 
171
- def execute_sentinel_plan(self, plan: SentinelPlan, expected_outputs: dict[int, dict] | None):
208
+ def execute_sentinel_plan(self, plan: SentinelPlan, train_dataset: dict[str, Dataset], validator: Validator): # expected_outputs: dict[int, dict] | None):
172
209
  """
173
210
  NOTE: this function currently requires us to set k and j properly in order to make
174
211
  comparison in our research against the corresponding sample budget in MAB.
@@ -177,8 +214,6 @@ class AllSamplingExecutionStrategy(SentinelExecutionStrategy):
177
214
  calls does not perfectly match the sample_budget. This may cause some minor discrepancies with
178
215
  the progress manager as a result.
179
216
  """
180
- # for now, assert that the first operator in the plan is a ScanPhysicalOp
181
- assert all(isinstance(op, ScanPhysicalOp) for op in plan.operator_sets[0]), "First operator in physical plan must be a ScanPhysicalOp"
182
217
  logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
183
218
  logger.info(f"Plan Details: {plan}")
184
219
 
@@ -186,25 +221,48 @@ class AllSamplingExecutionStrategy(SentinelExecutionStrategy):
186
221
  plan_stats = SentinelPlanStats.from_plan(plan)
187
222
  plan_stats.start()
188
223
 
189
- # get list of source indices which can be sampled from
190
- source_indices = self._get_source_indices()
224
+ # get lists of source indices
225
+ dataset_id_to_source_indices = {}
226
+ for dataset_id, dataset in train_dataset.items():
227
+ total_num_samples = len(dataset)
228
+ source_indices = [f"{dataset_id}-{int(idx)}" for idx in np.arange(total_num_samples)]
229
+ dataset_id_to_source_indices[dataset_id] = source_indices
191
230
 
192
231
  # initialize set of physical operators for each logical operator
193
- op_sets = {
194
- logical_op_id: OpSet(op_set, source_indices)
195
- for logical_op_id, op_set in plan
196
- }
232
+ op_sets = {}
233
+ for topo_idx, (logical_op_id, op_set) in enumerate(plan):
234
+ unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
235
+ source_unique_logical_op_ids = plan.get_source_unique_logical_op_ids(unique_logical_op_id)
236
+ sample_op = op_set[0]
237
+ if isinstance(sample_op, (ScanPhysicalOp, ContextScanOp)):
238
+ root_dataset_ids = plan.get_root_dataset_ids(unique_logical_op_id)
239
+ assert len(root_dataset_ids) == 1, f"Scan for {sample_op} has {len(root_dataset_ids)} > 1 root dataset ids"
240
+ root_dataset_id = root_dataset_ids[0]
241
+ source_indices = dataset_id_to_source_indices[root_dataset_id]
242
+ op_sets[unique_logical_op_id] = OpSet(op_set, source_unique_logical_op_ids, source_indices)
243
+ elif isinstance(sample_op, JoinOp):
244
+ assert len(source_unique_logical_op_ids) == 2, f"Join for {sample_op} has {len(source_unique_logical_op_ids)} != 2 source logical operators"
245
+ left_source_indices = op_sets[source_unique_logical_op_ids[0]].source_indices
246
+ right_source_indices = op_sets[source_unique_logical_op_ids[1]].source_indices
247
+ source_indices = []
248
+ for left_source_idx in left_source_indices:
249
+ for right_source_idx in right_source_indices:
250
+ source_indices.append((left_source_idx, right_source_idx))
251
+ op_sets[unique_logical_op_id] = OpSet(op_set, source_unique_logical_op_ids, source_indices)
252
+ else:
253
+ source_indices = op_sets[source_unique_logical_op_ids[0]].source_indices
254
+ op_sets[unique_logical_op_id] = OpSet(op_set, source_unique_logical_op_ids, source_indices)
197
255
 
198
256
  # initialize and start the progress manager
199
257
  self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, progress=self.progress)
200
258
  self.progress_manager.start()
201
259
 
202
- # NOTE: we must handle progress manager outside of _exeecute_sentinel_plan to ensure that it is shut down correctly;
260
+ # NOTE: we must handle progress manager outside of _execute_sentinel_plan to ensure that it is shut down correctly;
203
261
  # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail because
204
262
  # the progress manager cannot get a handle to the console
205
263
  try:
206
264
  # execute sentinel plan by sampling records and operators
207
- plan_stats = self._execute_sentinel_plan(plan, op_sets, expected_outputs, plan_stats)
265
+ plan_stats = self._execute_sentinel_plan(plan, op_sets, validator, plan_stats)
208
266
 
209
267
  finally:
210
268
  # finish progress tracking