palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. palimpzest/constants.py +13 -4
  2. palimpzest/core/data/dataset.py +75 -5
  3. palimpzest/core/elements/groupbysig.py +5 -1
  4. palimpzest/core/elements/records.py +16 -7
  5. palimpzest/core/lib/schemas.py +26 -3
  6. palimpzest/core/models.py +4 -4
  7. palimpzest/prompts/aggregate_prompts.py +99 -0
  8. palimpzest/prompts/prompt_factory.py +162 -75
  9. palimpzest/prompts/utils.py +38 -1
  10. palimpzest/prompts/validator.py +24 -24
  11. palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  12. palimpzest/query/execution/execution_strategy.py +8 -8
  13. palimpzest/query/execution/mab_execution_strategy.py +30 -11
  14. palimpzest/query/execution/parallel_execution_strategy.py +31 -7
  15. palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
  16. palimpzest/query/generators/generators.py +9 -7
  17. palimpzest/query/operators/__init__.py +10 -6
  18. palimpzest/query/operators/aggregate.py +394 -10
  19. palimpzest/query/operators/convert.py +1 -1
  20. palimpzest/query/operators/join.py +279 -23
  21. palimpzest/query/operators/logical.py +36 -11
  22. palimpzest/query/operators/mixture_of_agents.py +3 -1
  23. palimpzest/query/operators/physical.py +5 -2
  24. palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
  25. palimpzest/query/optimizer/__init__.py +11 -3
  26. palimpzest/query/optimizer/cost_model.py +5 -5
  27. palimpzest/query/optimizer/optimizer.py +3 -2
  28. palimpzest/query/optimizer/plan.py +2 -3
  29. palimpzest/query/optimizer/rules.py +73 -13
  30. palimpzest/query/optimizer/tasks.py +4 -4
  31. palimpzest/utils/progress.py +19 -17
  32. palimpzest/validator/validator.py +7 -7
  33. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
  34. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/RECORD +37 -36
  35. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
  36. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0
@@ -22,17 +22,17 @@ OUTPUT FIELDS:
  - birth_year: the year the scientist was born

  CONTEXT:
- {{
+ {
  "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
  "birthday": "December 10, 1815"
- }}
+ }

  OUTPUT:
  --------
- {{
+ {
  "name": "Charles Babbage",
  "birth_year": 1815
- }}
+ }

  EVALUATION: {"name": 0.0, "birth_year": 1.0}

@@ -66,18 +66,18 @@ OUTPUT FIELDS:
  - person_in_image: true if a person is in the image and false otherwise

  CONTEXT:
- {{
+ {
  "image": <bytes>,
  "photographer": "CameraEnthusiast1"
- }}
+ }
  <image content provided here; assume in this example the image shows a dog and a cat playing>

  OUTPUT:
  --------
- {{
+ {
  "dog_in_image": true,
  "person_in_image": true
- }}
+ }

  EVALUATION: {"dog_in_image": 1.0, "person_in_image": 0.0}

@@ -113,22 +113,22 @@ OUTPUT FIELDS:
  - birth_year: the year the scientist was born

  CONTEXT:
- {{
+ {
  "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
  "birthdays": "...Lovelace was born on December 10, 1815, almost exactly 24 years after Babbage's birth on 26 December 1791..."
- }}
+ }

  OUTPUTS:
  --------
  [
- {{
+ {
  "name": "Ada Lovelace",
  "birth_year": 1815
- }},
- {{
+ },
+ {
  "name": "Charles Babbage",
  "birth_year": 1790
- }}
+ }
  ]

  EVALUATION: [{"name": 1.0, "birth_year": 1.0}, {"name": 1.0, "birth_year": 0.0}]
@@ -163,23 +163,23 @@ OUTPUT FIELDS:
  - animal_is_canine: true if the animal is a canine and false otherwise

  CONTEXT:
- {{
+ {
  "image": <bytes>,
  "photographer": "CameraEnthusiast1"
- }}
+ }
  <image content provided here; assume in this example the image shows a dog and a cat playing>

  OUTPUT:
  --------
  [
- {{
+ {
  "animal": "dog",
  "animal_is_canine": true
- }},
- {{
+ },
+ {
  "animal": "cat",
  "animal_is_canine": true
- }}
+ }
  ]

  EVALUATION: [{"animal": 1.0, "animal_is_canine": 1.0}, {"animal": 1.0, "animal_is_canine": 0.0}]
@@ -214,20 +214,20 @@ OUTPUT FIELDS:
  - related_scientists: list of scientists who perform similar work as the scientist described in the text

  CONTEXT:
- {{
+ {
  "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
- }}
+ }

  OUTPUT:
  --------
- {{
+ {
  "related_scientists": [
  "Charles Babbage",
  "Alan Turing",
  "Charles Darwin",
  "John von Neumann",
  ]
- }}
+ }

  EVALUATION: {"related_scientists": 0.75}

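Note: the hunks above replace doubled braces ({{ and }}) with single braces in the few-shot examples. Doubled braces are how literal braces are escaped in Python str.format templates, so the change suggests these prompt strings are no longer run through .format(). A minimal sketch of the escaping behavior; the template text is illustrative:

    # In a str.format template, "{{" renders as a literal "{".
    template = 'CONTEXT:\n{{\n  "text": "{text}"\n}}'
    print(template.format(text="Ada Lovelace"))
    # CONTEXT:
    # {
    #   "text": "Ada Lovelace"
    # }
    #
    # If the same string is used verbatim (no .format call), the doubled braces
    # would leak into the prompt as "{{" -- hence single braces once formatting
    # is no longer applied.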
@@ -225,7 +225,7 @@ class AllSamplingExecutionStrategy(SentinelExecutionStrategy):
  dataset_id_to_source_indices = {}
  for dataset_id, dataset in train_dataset.items():
  total_num_samples = len(dataset)
- source_indices = [f"{dataset_id}-{int(idx)}" for idx in np.arange(total_num_samples)]
+ source_indices = [f"{dataset_id}---{int(idx)}" for idx in np.arange(total_num_samples)]
  dataset_id_to_source_indices[dataset_id] = source_indices

  # initialize set of physical operators for each logical operator
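Note: the sample-index separator changes from "-" to "---" here (and again in MABExecutionStrategy below), matching the source_indices.split("---")[0] lookup later in this diff. A plausible motivation, shown with a hypothetical dataset id: a single-dash split truncates ids that themselves contain hyphens.

    # Old separator: splitting on "-" mangles hyphenated dataset ids.
    old_index = "my-dataset-0"
    print(old_index.split("-")[0])    # "my" -- dataset id is lost

    # New separator: "---" is far less likely to collide with a real dataset id.
    new_index = "my-dataset---0"
    print(new_index.split("---")[0])  # "my-dataset"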
@@ -14,8 +14,8 @@ from palimpzest.query.operators.convert import LLMConvert
  from palimpzest.query.operators.filter import LLMFilter
  from palimpzest.query.operators.join import JoinOp
  from palimpzest.query.operators.physical import PhysicalOperator
- from palimpzest.query.operators.retrieve import RetrieveOp
  from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
+ from palimpzest.query.operators.topk import TopKOp
  from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
  from palimpzest.utils.progress import PZSentinelProgressManager
  from palimpzest.validator.validator import Validator
@@ -123,7 +123,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
  return (
  not isinstance(op, LLMConvert)
  and not isinstance(op, LLMFilter)
- and not isinstance(op, RetrieveOp)
+ and not isinstance(op, TopKOp)
  and not isinstance(op, JoinOp)
  )

@@ -167,8 +167,8 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
  full_hashes.add(full_hash)
  futures.append(executor.submit(validator._score_flat_map, op, fields, input_record, output, full_hash))

- # create future for retrieve
- elif isinstance(op, RetrieveOp):
+ # create future for top-k
+ elif isinstance(op, TopKOp):
  fields = op.generated_fields
  input_record: DataRecord = record_set.input
  output = record_set.data_records[0].to_dict(project_cols=fields)
@@ -176,7 +176,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
  full_hash = f"{hash(input_record)}{hash(output_str)}"
  if full_hash not in full_hashes:
  full_hashes.add(full_hash)
- futures.append(executor.submit(validator._score_retrieve, op, fields, input_record, output, full_hash))
+ futures.append(executor.submit(validator._score_topk, op, fields, input_record, output, full_hash))

  # create future for filter
  elif isinstance(op, LLMFilter):
@@ -235,7 +235,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):

  # TODO: this scoring function will (likely) bias towards small values of k since it
  # measures precision and not recall / F1; will need to revisit this in the future
- elif isinstance(op, RetrieveOp):
+ elif isinstance(op, TopKOp):
  fields = op.generated_fields
  input_record: DataRecord = record_set.input
  output_str = record_set.data_records[0].to_json_str(project_cols=fields, bytes_to_str=True, sorted=True)
@@ -341,9 +341,9 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
  def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
  is_llm_convert = isinstance(physical_op, LLMConvert)
  is_llm_filter = isinstance(physical_op, LLMFilter)
- is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
+ is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
  is_llm_join = isinstance(physical_op, JoinOp)
- return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join
+ return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join

  @abstractmethod
  def execute_sentinel_plan(self, sentinel_plan: SentinelPlan, train_dataset: dict[str, Dataset], validator: Validator) -> SentinelPlanStats:
@@ -14,8 +14,8 @@ from palimpzest.query.operators.convert import LLMConvert
  from palimpzest.query.operators.filter import FilterOp, LLMFilter, NonLLMFilter
  from palimpzest.query.operators.join import JoinOp
  from palimpzest.query.operators.physical import PhysicalOperator
- from palimpzest.query.operators.retrieve import RetrieveOp
  from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
+ from palimpzest.query.operators.topk import TopKOp
  from palimpzest.query.optimizer.plan import SentinelPlan
  from palimpzest.utils.progress import create_progress_manager
  from palimpzest.validator.validator import Validator
@@ -66,8 +66,8 @@ class OpFrontier:
  self.is_llm_join = isinstance(sample_op, JoinOp)
  is_llm_convert = isinstance(sample_op, LLMConvert)
  is_llm_filter = isinstance(sample_op, LLMFilter)
- is_llm_retrieve = isinstance(sample_op, RetrieveOp) and isinstance(sample_op.index, Collection)
- self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_retrieve or self.is_llm_join
+ is_llm_topk = isinstance(sample_op, TopKOp) and isinstance(sample_op.index, Collection)
+ self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_topk or self.is_llm_join

  # get order in which we will sample physical operators for this logical operator
  sample_op_indices = self._get_op_index_order(op_set, seed)
@@ -96,6 +96,12 @@ class OpFrontier:
  """
  return self.frontier_ops

+ def get_off_frontier_ops(self) -> list[PhysicalOperator]:
+ """
+ Returns the set of off-frontier operators for this OpFrontier.
+ """
+ return self.off_frontier_ops
+
  def _compute_op_id_to_pareto_distance(self, priors: dict[str, dict[str, float]]) -> dict[str, float]:
  """
  Return l2-distance for each operator from the pareto frontier.
@@ -298,7 +304,7 @@ class OpFrontier:
  def remove_unavailable_root_datasets(source_indices: str | tuple) -> str | tuple | None:
  # base case: source_indices is a string
  if isinstance(source_indices, str):
- return source_indices if source_indices.split("-")[0] in self.root_dataset_ids else None
+ return source_indices if source_indices.split("---")[0] in self.root_dataset_ids else None

  # recursive case: source_indices is a tuple
  left_indices = source_indices[0]
@@ -383,6 +389,12 @@ class OpFrontier:
  # compute final list of record op stats
  full_op_id_to_record_op_stats[full_op_id] = list(record_id_to_max_quality_record_op_stats.values())

+ # NOTE: it is possible for the full_op_id_to_record_op_stats to be empty if there is a duplicate operator
+ # (e.g. a scan of the same dataset) which has all of its results cached and no new_record_op_stats;
+ # in this case, we do not update the frontier
+ if full_op_id_to_record_op_stats == {}:
+ return
+
  # update the set of source indices processed by each physical operator
  for full_op_id, source_indices_processed in full_op_id_to_source_indices_processed.items():
  # update the set of source indices processed
@@ -641,8 +653,8 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
  """
  Returns the operator in the frontier with the highest (estimated) quality.
  """
- # get the operators in the frontier set for this logical_op_id
- frontier_ops = op_frontiers[unique_logical_op_id].get_frontier_ops()
+ # get the (off) frontier operators for this logical_op_id
+ frontier_ops = op_frontiers[unique_logical_op_id].get_frontier_ops() + op_frontiers[unique_logical_op_id].get_off_frontier_ops()

  # get a mapping from full_op_id --> list[RecordOpStats]
  full_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(unique_logical_op_id, {})
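Note: with get_off_frontier_ops added above, the highest-quality operator is now chosen over the union of frontier and off-frontier operators. A simplified sketch of that selection; the stats shapes here are assumed (the real code aggregates RecordOpStats per full_op_id):

    # Hypothetical, simplified: pick the op with the highest mean observed
    # quality across both frontier and off-frontier candidates.
    def get_max_quality_op(frontier_ops, off_frontier_ops, op_id_to_qualities):
        candidates = frontier_ops + off_frontier_ops

        def mean_quality(op):
            qualities = op_id_to_qualities.get(op.get_full_op_id(), [])
            return sum(qualities) / len(qualities) if qualities else 0.0

        return max(candidates, key=mean_quality, default=None)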
@@ -693,14 +705,21 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
  max_quality_op = self._get_max_quality_op(unique_logical_op_id, op_frontiers, plan_stats)

  # get frontier ops and their next input
- def is_filtered_out(tup: tuple) -> bool:
- return tup[-1] is None or isinstance(tup[-1], list) and all([record is None for record in tup[-1]])
+ def filter_and_clean_inputs(frontier_op_inputs: list[tuple]) -> list[tuple]:
+ cleaned_inputs = []
+ for tup in frontier_op_inputs:
+ input = tup[-1]
+ if isinstance(input, list):
+ input = [record for record in input if record is not None]
+ if input is not None and input != []:
+ cleaned_inputs.append((tup[0], tup[1], input))
+ return cleaned_inputs
  frontier_op_inputs = op_frontiers[unique_logical_op_id].get_frontier_op_inputs(source_indices_to_sample, max_quality_op)
- frontier_op_inputs = list(filter(lambda tup: not is_filtered_out(tup), frontier_op_inputs))
+ frontier_op_inputs = filter_and_clean_inputs(frontier_op_inputs)

  # break out of the loop if frontier_op_inputs is empty, as this means all records have been filtered out
  if len(frontier_op_inputs) == 0:
- break
+ continue

  # run sampled operators on sampled inputs and update the number of samples drawn
  source_indices_to_record_set_tuples, num_llm_ops = self._execute_op_set(unique_logical_op_id, frontier_op_inputs)
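Note: filter_and_clean_inputs is stricter than the is_filtered_out predicate it replaces: besides dropping tuples whose input is None (or a list of all Nones), it also strips None entries out of partially populated list inputs. A standalone demonstration of the same logic on made-up tuples:

    inputs = [
        ("op1", "src0", None),             # dropped: no input at all
        ("op1", "src1", [None, None]),     # dropped: only Nones
        ("op1", "src2", ["rec_a", None]),  # kept, cleaned to ["rec_a"]
        ("op1", "src3", "rec_b"),          # kept as-is (non-list input)
    ]

    def filter_and_clean_inputs(tuples):
        cleaned = []
        for op, src, inp in tuples:
            if isinstance(inp, list):
                inp = [r for r in inp if r is not None]
            if inp is not None and inp != []:
                cleaned.append((op, src, inp))
        return cleaned

    print(filter_and_clean_inputs(inputs))
    # [('op1', 'src2', ['rec_a']), ('op1', 'src3', 'rec_b')]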
@@ -764,7 +783,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
  dataset_id_to_shuffled_source_indices = {}
  for dataset_id, dataset in train_dataset.items():
  total_num_samples = len(dataset)
- shuffled_source_indices = [f"{dataset_id}-{int(idx)}" for idx in np.arange(total_num_samples)]
+ shuffled_source_indices = [f"{dataset_id}---{int(idx)}" for idx in np.arange(total_num_samples)]
  self.rng.shuffle(shuffled_source_indices)
  dataset_id_to_shuffled_source_indices[dataset_id] = shuffled_source_indices

@@ -9,7 +9,6 @@ from palimpzest.query.operators.aggregate import AggregateOp
  from palimpzest.query.operators.distinct import DistinctOp
  from palimpzest.query.operators.join import JoinOp
  from palimpzest.query.operators.limit import LimitScanOp
- from palimpzest.query.operators.physical import PhysicalOperator
  from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
  from palimpzest.query.optimizer.plan import PhysicalPlan
  from palimpzest.utils.progress import create_progress_manager
@@ -35,14 +34,27 @@ class ParallelExecutionStrategy(ExecutionStrategy):
  return True
  return False

- def _upstream_ops_finished(self, plan: PhysicalPlan, topo_idx: int, operator: PhysicalOperator, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> bool:
+ def _upstream_ops_finished(self, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> bool:
  """Helper function to check if agg / join operator is ready to process its inputs."""
- # for agg / join operator, we can only process it when all upstream operators have finished processing their inputs
- upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+ upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(unique_full_op_id)
  upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
  upstream_future_queues = {upstream_unique_full_op_id: future_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
  return not (self._any_queue_not_empty(upstream_input_queues) or self._any_queue_not_empty(upstream_future_queues))

+ def _finish_outer_join(self, executor: ThreadPoolExecutor, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> None:
+ join_op_upstream_finished = self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
+ join_input_queues_empty = all(len(inputs) == 0 for inputs in input_queues[unique_full_op_id].values())
+ join_future_queue_empty = len(future_queues[unique_full_op_id]) == 0
+ if join_op_upstream_finished and join_input_queues_empty and join_future_queue_empty:
+ # process the join one last time with final=True to handle any left/right/outer join logic
+ operator = self.unique_full_op_id_to_operator[unique_full_op_id]
+ if not operator.finished:
+ def finalize_op(operator):
+ return operator([], [], final=True)
+ future = executor.submit(finalize_op, operator)
+ future_queues[unique_full_op_id].append(future)
+ operator.set_finished()
+
  def _process_future_results(self, unique_full_op_id: str, future_queues: dict[str, list], plan_stats: PlanStats) -> list[DataRecord]:
  """
  Helper function which takes a full operator id, the future queues, and plan stats, and performs
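Note: _finish_outer_join runs the join one final time with final=True so that left/right/outer joins can emit rows that never found a match. A minimal, self-contained sketch of the pattern; the class and record shapes are illustrative, not Palimpzest's actual JoinOp API:

    # Illustrative left outer join with a finalize pass.
    class LeftOuterJoin:
        def __init__(self, key):
            self.key = key
            self.left_rows = []
            self.matched = set()

        def __call__(self, left_batch, right_batch, final=False):
            if final:
                # emit left rows that never matched, padded with None
                return [{**row, "right": None}
                        for i, row in enumerate(self.left_rows)
                        if i not in self.matched]
            self.left_rows.extend(left_batch)
            out = []
            for right in right_batch:
                for i, left in enumerate(self.left_rows):
                    if left[self.key] == right[self.key]:
                        self.matched.add(i)
                        out.append({**left, "right": right})
            return out

    join = LeftOuterJoin(key="id")
    print(join([{"id": 1}, {"id": 2}], [{"id": 1}]))  # [{'id': 1, 'right': {'id': 1}}]
    print(join([], [], final=True))                   # [{'id': 2, 'right': None}]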
@@ -117,15 +129,23 @@ class ParallelExecutionStrategy(ExecutionStrategy):
  records = self._process_future_results(source_unique_full_op_id, future_queues, plan_stats)
  input_queues[unique_full_op_id][source_unique_full_op_id].extend(records)

+ # if the source is a left/right/outer join operator with no more inputs to process, then finish it
+ if self.is_outer_join_op[source_unique_full_op_id]:
+ self._finish_outer_join(executor, plan, source_unique_full_op_id, input_queues, future_queues)
+
  # for the final operator, add any finished futures to the output_records
  if unique_full_op_id == f"{topo_idx}-{final_op.get_full_op_id()}":
  records = self._process_future_results(unique_full_op_id, future_queues, plan_stats)
  output_records.extend(records)

+ # if this is a left/right/outer join operator with no more inputs to process, then finish it
+ if self.is_outer_join_op[unique_full_op_id]:
+ self._finish_outer_join(executor, plan, unique_full_op_id, input_queues, future_queues)
+
  # if this operator does not have enough inputs to execute, then skip it
  num_inputs = sum(len(inputs) for inputs in input_queues[unique_full_op_id].values())
- agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues, future_queues)
- join_op_not_ready = isinstance(operator, JoinOp) and not self.join_has_downstream_limit_op[unique_full_op_id] and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues, future_queues)
+ agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
+ join_op_not_ready = isinstance(operator, JoinOp) and not self.join_has_downstream_limit_op[unique_full_op_id] and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
  if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
  continue

@@ -225,8 +245,9 @@ class ParallelExecutionStrategy(ExecutionStrategy):
  input_queues = self._create_input_queues(plan)
  future_queues = {f"{topo_idx}-{op.get_full_op_id()}": [] for topo_idx, op in enumerate(plan)}

- # precompute which operators are joins and which joins have downstream limit ops
+ # precompute which operators are (outer) joins and which joins have downstream limit ops
  self.is_join_op = {f"{topo_idx}-{op.get_full_op_id()}": isinstance(op, JoinOp) for topo_idx, op in enumerate(plan)}
+ self.is_outer_join_op = {f"{topo_idx}-{op.get_full_op_id()}": isinstance(op, JoinOp) and op.how in ("left", "right", "outer") for topo_idx, op in enumerate(plan)}
  self.join_has_downstream_limit_op = {}
  for topo_idx, op in enumerate(plan):
  if isinstance(op, JoinOp):
@@ -240,6 +261,9 @@ class ParallelExecutionStrategy(ExecutionStrategy):
  break
  self.join_has_downstream_limit_op[unique_full_op_id] = has_downstream_limit_op

+ # precompute mapping from unique_full_op_id to operator instance
+ self.unique_full_op_id_to_operator = {f"{topo_idx}-{op.get_full_op_id()}": op for topo_idx, op in enumerate(plan)}
+
  # initialize and start the progress manager
  self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
  self.progress_manager.start()
@@ -6,7 +6,6 @@ from palimpzest.query.execution.execution_strategy import ExecutionStrategy
  from palimpzest.query.operators.aggregate import AggregateOp
  from palimpzest.query.operators.join import JoinOp
  from palimpzest.query.operators.limit import LimitScanOp
- from palimpzest.query.operators.physical import PhysicalOperator
  from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
  from palimpzest.query.optimizer.plan import PhysicalPlan
  from palimpzest.utils.progress import create_progress_manager
@@ -70,6 +69,13 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
  record_set, num_inputs_processed = operator(left_input_records, right_input_records)
  records = record_set.data_records
  record_op_stats = record_set.record_op_stats
+
+ # process the join one last time with final=True to handle any left/right/outer join logic
+ if operator.how in ("left", "right", "outer"):
+ record_set, num_inputs_processed = operator([], [], final=True)
+ records.extend(record_set.data_records)
+ record_op_stats.extend(record_set.record_op_stats)
+
  num_outputs = sum(record._passed_operator for record in records)

  # update the progress manager
@@ -168,10 +174,9 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
  return True
  return False

- def _upstream_ops_finished(self, plan: PhysicalPlan, topo_idx: int, operator: PhysicalOperator, input_queues: dict[str, dict[str, list]]) -> bool:
+ def _upstream_ops_finished(self, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]]) -> bool:
  """Helper function to check if agg / join operator is ready to process its inputs."""
- # for agg / join operator, we can only process it when all upstream operators have finished processing their inputs
- upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+ upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(unique_full_op_id)
  upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
  return not self._any_queue_not_empty(upstream_input_queues)

@@ -192,8 +197,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
  unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"

  num_inputs = sum(len(input_queues[unique_full_op_id][source_unique_full_op_id]) for source_unique_full_op_id in source_unique_full_op_ids)
- agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues)
- join_op_not_ready = isinstance(operator, JoinOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues)
+ agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
+ join_op_not_ready = isinstance(operator, JoinOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
  if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
  continue

@@ -242,6 +247,18 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
  # update the progress manager
  self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

+ # if this is a join operator with no more inputs to process, then finish it
+ if isinstance(operator, JoinOp) and operator.how in ("left", "right", "outer"):
+ join_op_upstream_finished = self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
+ join_input_queues_empty = all(len(inputs) == 0 for inputs in input_queues[unique_full_op_id].values())
+ if join_op_upstream_finished and join_input_queues_empty and not operator.finished:
+ # process the join one last time with final=True to handle any left/right/outer join logic
+ record_set, num_inputs_processed = operator([], [], final=True)
+ records.extend(record_set.data_records)
+ record_op_stats.extend(record_set.record_op_stats)
+ num_outputs += sum(record._passed_operator for record in record_set.data_records)
+ operator.set_finished()
+
  # update plan stats
  plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)

@@ -296,9 +296,9 @@ class Generator(Generic[ContextType, InputType]):

  return field_answers

- def __call__(self, candidate: DataRecord, fields: dict[str, FieldInfo] | None, right_candidate: DataRecord | None = None, json_output: bool=True, **kwargs) -> GenerationOutput:
- """Take the input record (`candidate`), generate the output `fields`, and return the generated output."""
- logger.debug(f"Generating for candidate {candidate} with fields {fields}")
+ def __call__(self, candidate: DataRecord | list[DataRecord], fields: dict[str, FieldInfo] | None, right_candidate: DataRecord | None = None, json_output: bool=True, **kwargs) -> GenerationOutput:
+ """Take the input record(s) (`candidate`), generate the output `fields`, and return the generated output."""
+ logger.debug(f"Generating for candidate(s) {candidate} with fields {fields}")

  # fields can only be None if the user provides an answer parser
  fields_check = fields is not None or "parse_answer" in kwargs
@@ -338,7 +338,7 @@ class Generator(Generic[ContextType, InputType]):
  reasoning_effort = "minimal" if self.reasoning_effort is None else self.reasoning_effort
  completion_kwargs = {"reasoning_effort": reasoning_effort, **completion_kwargs}
  if self.model.is_vllm_model():
- completion_kwargs = {"api_base": self.api_base, **completion_kwargs}
+ completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key"), **completion_kwargs}
  completion = litellm.completion(model=self.model_name, messages=messages, **completion_kwargs)
  end_time = time.time()
  logger.debug(f"Generated completion in {end_time - start_time:.2f} seconds")
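Note: for vLLM-served models, the completion call now forwards an api_key alongside api_base (vLLM's OpenAI-compatible server accepts any key unless one is configured, hence the placeholder default). A minimal standalone sketch of the same call pattern; the model name and endpoint are placeholders:

    import os

    import litellm

    # vLLM exposes an OpenAI-compatible endpoint; litellm routes to it via api_base.
    completion = litellm.completion(
        model="hosted_vllm/meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
        messages=[{"role": "user", "content": "Hello!"}],
        api_base="http://localhost:8000/v1",                   # placeholder endpoint
        api_key=os.environ.get("VLLM_API_KEY", "fake-api-key"),
    )
    print(completion.choices[0].message.content)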
@@ -405,15 +405,17 @@ class Generator(Generic[ContextType, InputType]):

  # pretty print prompt + full completion output for debugging
  completion_text = completion.choices[0].message.content
- prompt = ""
+ prompt, system_prompt = "", ""
  for message in messages:
+ if message["role"] == "system":
+ system_prompt += message["content"] + "\n"
  if message["role"] == "user":
  if message["type"] == "text":
  prompt += message["content"] + "\n"
  elif message["type"] == "image":
- prompt += "<image>\n"
+ prompt += "<image>\n" * len(message["content"])
  elif message["type"] == "input_audio":
- prompt += "<audio>\n"
+ prompt += "<audio>\n" * len(message["content"])
  logger.debug(f"PROMPT:\n{prompt}")
  logger.debug(Fore.GREEN + f"{completion_text}\n" + Style.RESET_ALL)

@@ -2,6 +2,10 @@ from palimpzest.query.operators.aggregate import AggregateOp as _AggregateOp
  from palimpzest.query.operators.aggregate import ApplyGroupByOp as _ApplyGroupByOp
  from palimpzest.query.operators.aggregate import AverageAggregateOp as _AverageAggregateOp
  from palimpzest.query.operators.aggregate import CountAggregateOp as _CountAggregateOp
+ from palimpzest.query.operators.aggregate import MaxAggregateOp as _MaxAggregateOp
+ from palimpzest.query.operators.aggregate import MinAggregateOp as _MinAggregateOp
+ from palimpzest.query.operators.aggregate import SemanticAggregate as _SemanticAggregate
+ from palimpzest.query.operators.aggregate import SumAggregateOp as _SumAggregateOp
  from palimpzest.query.operators.convert import ConvertOp as _ConvertOp
  from palimpzest.query.operators.convert import LLMConvert as _LLMConvert
  from palimpzest.query.operators.convert import LLMConvertBonded as _LLMConvertBonded
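Note: 1.0.0 registers four new aggregate physical operators (max, min, sum, and a semantic aggregate). As a rough sketch of what the non-semantic ones compute, assuming records expose field values by name (the actual operator interface is not shown in this diff):

    from typing import Any, Callable

    # Hypothetical fold underlying min/max/sum aggregates; the real operators
    # wrap this in Palimpzest's physical-operator interface.
    def fold_aggregate(records: list[dict[str, Any]], field: str,
                       agg_fn: Callable[[list[float]], float]) -> float | None:
        values = [r[field] for r in records if r.get(field) is not None]
        return agg_fn(values) if values else None

    records = [{"price": 3.0}, {"price": 5.0}, {"price": None}]
    print(fold_aggregate(records, "price", min))  # 3.0
    print(fold_aggregate(records, "price", max))  # 5.0
    print(fold_aggregate(records, "price", sum))  # 8.0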
@@ -47,7 +51,7 @@ from palimpzest.query.operators.logical import (
  Project as _Project,
  )
  from palimpzest.query.operators.logical import (
- RetrieveScan as _RetrieveScan,
+ TopKScan as _TopKScan,
  )
  from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert as _MixtureOfAgentsConvert
  from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsFilter as _MixtureOfAgentsFilter
@@ -55,11 +59,11 @@ from palimpzest.query.operators.physical import PhysicalOperator as _PhysicalOpe
  from palimpzest.query.operators.project import ProjectOp as _ProjectOp
  from palimpzest.query.operators.rag import RAGConvert as _RAGConvert
  from palimpzest.query.operators.rag import RAGFilter as _RAGFilter
- from palimpzest.query.operators.retrieve import RetrieveOp as _RetrieveOp
  from palimpzest.query.operators.scan import MarshalAndScanDataOp as _MarshalAndScanDataOp
  from palimpzest.query.operators.scan import ScanPhysicalOp as _ScanPhysicalOp
  from palimpzest.query.operators.split import SplitConvert as _SplitConvert
  from palimpzest.query.operators.split import SplitFilter as _SplitFilter
+ from palimpzest.query.operators.topk import TopKOp as _TopKOp

  LOGICAL_OPERATORS = [
  _LogicalOperator,
@@ -72,12 +76,12 @@ LOGICAL_OPERATORS = [
  _LogicalJoinOp,
  _LimitScan,
  _Project,
- _RetrieveScan,
+ _TopKScan,
  ]

  PHYSICAL_OPERATORS = (
  # aggregate
- [_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp]
+ [_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp, _MaxAggregateOp, _MinAggregateOp, _SemanticAggregate, _SumAggregateOp]
  # convert
  + [_ConvertOp, _NonLLMConvert, _LLMConvert, _LLMConvertBonded]
  # critique and refine
@@ -100,8 +104,8 @@ PHYSICAL_OPERATORS = (
  + [_ProjectOp]
  # rag
  + [_RAGConvert, _RAGFilter]
- # retrieve
- + [_RetrieveOp]
+ # top-k
+ + [_TopKOp]
  # split
  + [_SplitConvert, _SplitFilter]
  )