palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (87)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.20.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
palimpzest/query/execution/single_threaded_execution_strategy.py

@@ -1,11 +1,13 @@
 import logging
 
-from palimpzest.core.data.dataclasses import PlanStats
 from palimpzest.core.elements.records import DataRecord
+from palimpzest.core.models import PlanStats
 from palimpzest.query.execution.execution_strategy import ExecutionStrategy
 from palimpzest.query.operators.aggregate import AggregateOp
+from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
-from palimpzest.query.operators.scan import ScanPhysicalOp
+from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.utils.progress import create_progress_manager
 
@@ -25,61 +27,80 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
         super().__init__(*args, **kwargs)
         self.max_workers = 1
 
-    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, list], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
+    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, dict[str, list]], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
         # execute the plan one operator at a time
         output_records = []
-        for op_idx, operator in enumerate(plan.operators):
+        for topo_idx, operator in enumerate(plan):
             # if we've filtered out all records, terminate early
-            full_op_id = operator.get_full_op_id()
-            num_inputs = len(input_queues[full_op_id])
+            source_unique_full_op_ids = (
+                [f"source_{operator.get_full_op_id()}"]
+                if isinstance(operator, (ContextScanOp, ScanPhysicalOp))
+                else plan.get_source_unique_full_op_ids(topo_idx, operator)
+            )
+            unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+            num_inputs = sum(len(input_queues[unique_full_op_id][source_unique_full_op_id]) for source_unique_full_op_id in source_unique_full_op_ids)
             if num_inputs == 0:
                 break
 
             # begin to process this operator
             records, record_op_stats = [], []
-            logger.info(f"Processing operator {operator.op_name()} ({full_op_id})")
+            logger.info(f"Processing operator {operator.op_name()} ({unique_full_op_id})")
 
             # if this operator is an aggregate, process all the records in the input_queue
             if isinstance(operator, AggregateOp):
-                record_set = operator(candidates=input_queues[full_op_id])
+                source_unique_full_op_id = source_unique_full_op_ids[0]
+                record_set = operator(candidates=input_queues[unique_full_op_id][source_unique_full_op_id])
                 records = record_set.data_records
                 record_op_stats = record_set.record_op_stats
                 num_outputs = sum(record.passed_operator for record in records)
 
                 # update the progress manager
-                self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+                self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+
+            # if this operator is a join, process all pairs of records from the two input queues
+            elif isinstance(operator, JoinOp):
+                left_full_source_op_id = source_unique_full_op_ids[0]
+                left_num_inputs = len(input_queues[unique_full_op_id][left_full_source_op_id])
+                left_input_records = [input_queues[unique_full_op_id][left_full_source_op_id].pop(0) for _ in range(left_num_inputs)]
+
+                right_full_source_op_id = source_unique_full_op_ids[1]
+                right_num_inputs = len(input_queues[unique_full_op_id][right_full_source_op_id])
+                right_input_records = [input_queues[unique_full_op_id][right_full_source_op_id].pop(0) for _ in range(right_num_inputs)]
+
+                record_set, num_inputs_processed = operator(left_input_records, right_input_records)
+                records = record_set.data_records
+                record_op_stats = record_set.record_op_stats
+                num_outputs = sum(record.passed_operator for record in records)
+
+                # update the progress manager
+                self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
             # otherwise, process the records in the input queue for this operator one at a time
             else:
-                for input_record in input_queues[full_op_id]:
+                source_unique_full_op_id = source_unique_full_op_ids[0]
+                for input_record in input_queues[unique_full_op_id][source_unique_full_op_id]:
                     record_set = operator(input_record)
                     records.extend(record_set.data_records)
                     record_op_stats.extend(record_set.record_op_stats)
                     num_outputs = sum(record.passed_operator for record in record_set.data_records)
 
                     # update the progress manager
-                    self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+                    self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
                     # finish early if this is a limit
                     if isinstance(operator, LimitScanOp) and len(records) == operator.limit:
                         break
 
             # update plan stats
-            plan_stats.add_record_op_stats(record_op_stats)
-
-            # add records to the cache
-            self._add_records_to_cache(operator.target_cache_id, records)
+            plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
 
             # update next input_queue (if it exists)
-            output_records = [record for record in records if record.passed_operator]
-            if op_idx + 1 < len(plan.operators):
-                next_full_op_id = plan.operators[op_idx + 1].get_full_op_id()
-                input_queues[next_full_op_id] = output_records
+            output_records = [record for record in records if record.passed_operator]
+            next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
+            if next_unique_full_op_id is not None:
+                input_queues[next_unique_full_op_id][unique_full_op_id] = output_records
 
-            logger.info(f"Finished processing operator {operator.op_name()} ({operator.get_full_op_id()}), and generated {len(records)} records")
-
-        # close the cache
-        self._close_cache([op.target_cache_id for op in plan.operators])
+            logger.info(f"Finished processing operator {operator.op_name()} ({unique_full_op_id}), and generated {len(records)} records")
 
         # finalize plan stats
         plan_stats.finish()
@@ -88,8 +109,6 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
 
     def execute_plan(self, plan: PhysicalPlan) -> tuple[list[DataRecord], PlanStats]:
         """Initialize the stats and execute the plan."""
-        # for now, assert that the first operator in the plan is a ScanPhysicalOp
-        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
         logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
         logger.info(f"Plan Details: {plan}")
 
@@ -104,7 +123,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
         self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
         self.progress_manager.start()
 
-        # NOTE: we must handle progress manager outside of _exeecute_plan to ensure that it is shut down correctly;
+        # NOTE: we must handle progress manager outside of _execute_plan to ensure that it is shut down correctly;
         # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
         # because the progress manager cannot get a handle to the console
         try:
@@ -139,31 +158,43 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
         super().__init__(*args, **kwargs)
         self.max_workers = 1
 
-    def _any_queue_not_empty(self, queues: dict[str, list]) -> bool:
+    def _any_queue_not_empty(self, queues: dict[str, list] | dict[str, dict[str, list]]) -> bool:
         """Helper function to check if any queue is not empty."""
-        return any(len(queue) > 0 for queue in queues.values())
-
-    def _upstream_ops_finished(self, plan: PhysicalPlan, op_idx: int, input_queues: dict[str, list]) -> bool:
-        """Helper function to check if all upstream operators have finished processing their inputs."""
-        for upstream_op_idx in range(op_idx):
-            upstream_full_op_id = plan.operators[upstream_op_idx].get_full_op_id()
-            if len(input_queues[upstream_full_op_id]) > 0:
-                return False
-
-        return True
-
-    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, list], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
+        for _, value in queues.items():
+            if isinstance(value, dict):
+                if any(len(subqueue) > 0 for subqueue in value.values()):
+                    return True
+            elif len(value) > 0:
+                return True
+        return False
+
+    def _upstream_ops_finished(self, plan: PhysicalPlan, topo_idx: int, operator: PhysicalOperator, input_queues: dict[str, dict[str, list]]) -> bool:
+        """Helper function to check if agg / join operator is ready to process its inputs."""
+        # for agg / join operator, we can only process it when all upstream operators have finished processing their inputs
+        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+        upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
+        return not self._any_queue_not_empty(upstream_input_queues)
+
+
+    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, dict[str, list]], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
         # execute the plan until either:
         # 1. all records have been processed, or
         # 2. the final limit operation has completed (we break out of the loop if this happens)
         final_output_records = []
         while self._any_queue_not_empty(input_queues):
-            for op_idx, operator in enumerate(plan.operators):
+            for topo_idx, operator in enumerate(plan):
                 # if this operator does not have enough inputs to execute, then skip it
-                full_op_id = operator.get_full_op_id()
-                num_inputs = len(input_queues[full_op_id])
-                agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, op_idx, input_queues)
-                if num_inputs == 0 or agg_op_not_ready:
+                source_unique_full_op_ids = (
+                    [f"source_{operator.get_full_op_id()}"]
+                    if isinstance(operator, (ContextScanOp, ScanPhysicalOp))
+                    else plan.get_source_unique_full_op_ids(topo_idx, operator)
+                )
+                unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+
+                num_inputs = sum(len(input_queues[unique_full_op_id][source_unique_full_op_id]) for source_unique_full_op_id in source_unique_full_op_ids)
+                agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues)
+                join_op_not_ready = isinstance(operator, JoinOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues)
+                if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
                     continue
 
                 # create empty lists for records and execution stats generated by executing this operator on its next input(s)
@@ -171,49 +202,63 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
 
                 # if the next operator is an aggregate, process all the records in the input_queue
                 if isinstance(operator, AggregateOp):
-                    input_records = [input_queues[full_op_id].pop(0) for _ in range(num_inputs)]
+                    source_unique_full_op_id = source_unique_full_op_ids[0]
+                    input_records = [input_queues[unique_full_op_id][source_unique_full_op_id].pop(0) for _ in range(num_inputs)]
                     record_set = operator(candidates=input_records)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
                     num_outputs = sum(record.passed_operator for record in records)
 
                     # update the progress manager
-                    self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+                    self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+
+                # if this operator is a join, process all pairs of records from the two input queues
+                elif isinstance(operator, JoinOp):
+                    left_full_source_op_id = source_unique_full_op_ids[0]
+                    left_num_inputs = len(input_queues[unique_full_op_id][left_full_source_op_id])
+                    left_input_records = [input_queues[unique_full_op_id][left_full_source_op_id].pop(0) for _ in range(left_num_inputs)]
+
+                    right_full_source_op_id = source_unique_full_op_ids[1]
+                    right_num_inputs = len(input_queues[unique_full_op_id][right_full_source_op_id])
+                    right_input_records = [input_queues[unique_full_op_id][right_full_source_op_id].pop(0) for _ in range(right_num_inputs)]
+
+                    record_set, num_inputs_processed = operator(left_input_records, right_input_records)
+                    records = record_set.data_records
+                    record_op_stats = record_set.record_op_stats
+                    num_outputs = sum(record.passed_operator for record in records)
+
+                    # update the progress manager
+                    self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
                 # otherwise, process the next record in the input queue for this operator
                 else:
-                    input_record = input_queues[full_op_id].pop(0)
+                    source_unique_full_op_id = source_unique_full_op_ids[0]
+                    input_record = input_queues[unique_full_op_id][source_unique_full_op_id].pop(0)
                     record_set = operator(input_record)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
                     num_outputs = sum(record.passed_operator for record in records)
 
                     # update the progress manager
-                    self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+                    self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
                 # update plan stats
-                plan_stats.add_record_op_stats(record_op_stats)
-
-                # add records to the cache
-                self._add_records_to_cache(operator.target_cache_id, records)
+                plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
 
                 # update next input_queue or final_output_records
-                output_records = [record for record in records if record.passed_operator]
-                if op_idx + 1 < len(plan.operators):
-                    next_full_op_id = plan.operators[op_idx + 1].get_full_op_id()
-                    input_queues[next_full_op_id].extend(output_records)
+                output_records = [record for record in records if record.passed_operator]
+                next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
+                if next_unique_full_op_id is not None:
+                    input_queues[next_unique_full_op_id][unique_full_op_id].extend(output_records)
                 else:
                     final_output_records.extend(output_records)
 
-                logger.info(f"Finished processing operator {operator.op_name()} ({operator.get_full_op_id()}) on {num_inputs} records")
+                logger.info(f"Finished processing operator {operator.op_name()} ({unique_full_op_id}) on {num_inputs} records")
 
             # break out of loop if the final operator is a LimitScanOp and we've reached its limit
-            if isinstance(plan.operators[-1], LimitScanOp) and len(final_output_records) == plan.operators[-1].limit:
+            if isinstance(plan.operator, LimitScanOp) and len(final_output_records) == plan.operator.limit:
                 break
 
-        # close the cache
-        self._close_cache([op.target_cache_id for op in plan.operators])
-
         # finalize plan stats
         plan_stats.finish()
 
@@ -221,8 +266,6 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
 
     def execute_plan(self, plan: PhysicalPlan):
         """Initialize the stats and execute the plan."""
-        # for now, assert that the first operator in the plan is a ScanPhysicalOp
-        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
        logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
        logger.info(f"Plan Details: {plan}")
 
@@ -237,7 +280,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
         self.progress_manager = create_progress_manager(plan, self.num_samples, self.progress)
         self.progress_manager.start()
 
-        # NOTE: we must handle progress manager outside of _exeecute_plan to ensure that it is shut down correctly;
+        # NOTE: we must handle progress manager outside of _execute_plan to ensure that it is shut down correctly;
         # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
         # because the progress manager cannot get a handle to the console
         try:
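
The through-line of these hunks is the new shape of input_queues: it changes from a flat dict[str, list] keyed by full_op_id to a nested dict[str, dict[str, list]], keyed first by a topology-unique operator ID (f"{topo_idx}-{operator.get_full_op_id()}") and then by the ID of the producing operator, with scans reading from a synthetic "source_..." sub-queue. Keeping one sub-queue per producer is what lets the new JoinOp hold its left and right inputs apart. The following is a minimal sketch of that indexing convention, not palimpzest's actual API; the operator IDs are hypothetical placeholders standing in for the real ones:

# sketch of the nested input-queue layout introduced in this diff
# (illustrative only; IDs below are made up for the example)
input_queues: dict[str, dict[str, list]] = {
    "0-scan_users": {"source_scan_users": [{"id": 1}, {"id": 2}]},   # scans read a synthetic "source_" queue
    "1-scan_orders": {"source_scan_orders": [{"user_id": 1}]},
    "2-join_users_orders": {"0-scan_users": [], "1-scan_orders": []},  # joins keep one sub-queue per input
}

def route_outputs(producer_id: str, consumer_id: str, outputs: list) -> None:
    """Deliver a producer's surviving records to its own sub-queue under the consumer."""
    input_queues[consumer_id][producer_id].extend(outputs)

# after the left scan runs, its outputs land in the join's left sub-queue
route_outputs("0-scan_users", "2-join_users_orders", [{"id": 1}, {"id": 2}])

# num_inputs mirrors the sum the diff computes over all source sub-queues
num_inputs = sum(len(q) for q in input_queues["2-join_users_orders"].values())
assert num_inputs == 2

Under this layout, a join (like an aggregate) only fires once every upstream queue has drained, which is what the new _upstream_ops_finished check enforces in the pipelined strategy.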