palimpzest 0.5.4__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (71)
  1. palimpzest/__init__.py +7 -9
  2. palimpzest/constants.py +47 -7
  3. palimpzest/core/__init__.py +20 -26
  4. palimpzest/core/data/dataclasses.py +9 -2
  5. palimpzest/core/data/datareaders.py +497 -0
  6. palimpzest/core/elements/records.py +29 -37
  7. palimpzest/core/lib/fields.py +14 -12
  8. palimpzest/core/lib/schemas.py +80 -94
  9. palimpzest/policy.py +58 -0
  10. palimpzest/prompts/__init__.py +22 -0
  11. palimpzest/prompts/code_synthesis_prompts.py +28 -0
  12. palimpzest/prompts/convert_prompts.py +87 -0
  13. palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
  14. palimpzest/prompts/filter_prompts.py +69 -0
  15. palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
  16. palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
  17. palimpzest/prompts/prompt_factory.py +732 -0
  18. palimpzest/prompts/util_phrases.py +14 -0
  19. palimpzest/query/execution/execution_strategy.py +0 -3
  20. palimpzest/query/execution/parallel_execution_strategy.py +12 -25
  21. palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
  22. palimpzest/query/generators/generators.py +71 -347
  23. palimpzest/query/operators/__init__.py +5 -5
  24. palimpzest/query/operators/aggregate.py +10 -5
  25. palimpzest/query/operators/code_synthesis_convert.py +4 -48
  26. palimpzest/query/operators/convert.py +5 -2
  27. palimpzest/query/operators/critique_and_refine_convert.py +112 -0
  28. palimpzest/query/operators/filter.py +1 -1
  29. palimpzest/query/operators/limit.py +1 -1
  30. palimpzest/query/operators/logical.py +28 -27
  31. palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
  32. palimpzest/query/operators/physical.py +32 -20
  33. palimpzest/query/operators/project.py +1 -1
  34. palimpzest/query/operators/rag_convert.py +6 -3
  35. palimpzest/query/operators/retrieve.py +13 -31
  36. palimpzest/query/operators/scan.py +150 -0
  37. palimpzest/query/optimizer/__init__.py +5 -1
  38. palimpzest/query/optimizer/cost_model.py +18 -34
  39. palimpzest/query/optimizer/optimizer.py +40 -25
  40. palimpzest/query/optimizer/optimizer_strategy.py +26 -0
  41. palimpzest/query/optimizer/plan.py +2 -2
  42. palimpzest/query/optimizer/rules.py +118 -27
  43. palimpzest/query/processor/config.py +12 -1
  44. palimpzest/query/processor/mab_sentinel_processor.py +125 -112
  45. palimpzest/query/processor/nosentinel_processor.py +46 -62
  46. palimpzest/query/processor/query_processor.py +10 -20
  47. palimpzest/query/processor/query_processor_factory.py +12 -5
  48. palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
  49. palimpzest/query/processor/streaming_processor.py +11 -17
  50. palimpzest/sets.py +170 -94
  51. palimpzest/tools/pdfparser.py +5 -64
  52. palimpzest/utils/datareader_helpers.py +61 -0
  53. palimpzest/utils/field_helpers.py +69 -0
  54. palimpzest/utils/hash_helpers.py +3 -2
  55. palimpzest/utils/udfs.py +0 -28
  56. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/METADATA +49 -49
  57. palimpzest-0.6.1.dist-info/RECORD +87 -0
  58. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/top_level.txt +0 -1
  59. cli/README.md +0 -156
  60. cli/__init__.py +0 -0
  61. cli/cli_main.py +0 -390
  62. palimpzest/config.py +0 -89
  63. palimpzest/core/data/datasources.py +0 -369
  64. palimpzest/datamanager/__init__.py +0 -0
  65. palimpzest/datamanager/datamanager.py +0 -300
  66. palimpzest/prompts.py +0 -397
  67. palimpzest/query/operators/datasource.py +0 -202
  68. palimpzest-0.5.4.dist-info/RECORD +0 -83
  69. palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
  70. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/LICENSE +0 -0
  71. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/WHEEL +0 -0
palimpzest/prompts/util_phrases.py
@@ -0,0 +1,14 @@
+ """This file contains utility phrases which are templated into many of our prompts."""
+
+ ### FORMATTING INSTRUCTIONS ###
+ ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON dictionary. The dictionary should only have the specified output fields."
+ ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON list of dictionaries. The list may contain one or more dictionaries, and each dictionary should only have the specified output fields."
+
+ ### REASONING INSTRUCTION FOR IMAGE PROMPTS ###
+ COT_REASONING_INSTRUCTION = """Let's think step-by-step in order to answer the question.
+
+ REASONING: """
+
+ COT_ANSWER_INSTRUCTION = """Let's think step-by-step in order to answer the question.
+
+ ANSWER: """
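
These constants are plain strings meant to be templated into larger prompts (presumably by the new prompt_factory.py added in this release). A minimal usage sketch: the base prompt wording and field names below are invented for illustration; only the imported constants come from the file above.

    from palimpzest.prompts.util_phrases import (
        COT_REASONING_INSTRUCTION,
        ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION,
    )

    # Illustrative base prompt; not from the package.
    base_prompt = "Extract the following output fields from the input text: {fields}."

    prompt = "\n\n".join([
        base_prompt.format(fields="title, author"),
        ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION,  # constrain the answer to a JSON dict
        COT_REASONING_INSTRUCTION,             # elicit step-by-step reasoning
    ])
    print(prompt)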
palimpzest/query/execution/execution_strategy.py
@@ -4,7 +4,6 @@ from enum import Enum
 
  from palimpzest.core.data.dataclasses import ExecutionStats, PlanStats
  from palimpzest.core.elements.records import DataRecord
- from palimpzest.datamanager.datamanager import DataDirectory
  from palimpzest.query.optimizer.plan import PhysicalPlan
 
 
@@ -23,12 +22,10 @@ class ExecutionStrategy(ABC):
      """
      def __init__(self,
          scan_start_idx: int = 0,
-         datadir: DataDirectory | None = None,
          max_workers: int | None = None,
          nocache: bool = True,
          verbose: bool = False):
          self.scan_start_idx = scan_start_idx
-         self.datadir = datadir
          self.nocache = nocache
          self.verbose = verbose
          self.max_workers = max_workers
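
For orientation, a sketch of constructing a strategy after this change. It assumes the subclasses shown in the hunks below accept the base-class arguments unchanged; their own __init__ signatures are not visible in this diff.

    from palimpzest.query.execution.parallel_execution_strategy import (
        PipelinedParallelExecutionStrategy,
    )

    # No DataDirectory handle anymore; the strategy only needs scan/worker/cache flags.
    strategy = PipelinedParallelExecutionStrategy(
        scan_start_idx=0,  # index into the datareader at which scanning begins
        max_workers=4,     # thread count for the ThreadPoolExecutor
        nocache=True,      # caching is stubbed out in 0.6.1 (see the hunks below)
        verbose=False,
    )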
palimpzest/query/execution/parallel_execution_strategy.py
@@ -4,13 +4,11 @@ from concurrent.futures import ThreadPoolExecutor, wait
 
  from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
  from palimpzest.core.data.dataclasses import OperatorStats, PlanStats
- from palimpzest.core.elements.records import DataRecord
- from palimpzest.core.lib.schemas import SourceRecord
  from palimpzest.query.execution.execution_strategy import ExecutionStrategy
  from palimpzest.query.operators.aggregate import AggregateOp
- from palimpzest.query.operators.datasource import DataSourcePhysicalOp
  from palimpzest.query.operators.limit import LimitScanOp
  from palimpzest.query.operators.physical import PhysicalOperator
+ from palimpzest.query.operators.scan import ScanPhysicalOp
  from palimpzest.query.optimizer.plan import PhysicalPlan
 
 
@@ -72,12 +70,11 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
          }
          op_id_to_op_idx = {op.get_op_id(): idx for idx, op in enumerate(plan.operators)}
 
-         # get handle to DataSource and pre-compute its op_id and size
+         # get handle to scan operator and pre-compute its op_id and size
          source_operator = plan.operators[0]
-         assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp"
+         assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
          source_op_id = source_operator.get_op_id()
-         datasource = source_operator.get_datasource()
-         datasource_len = len(datasource)
+         datareader_len = len(source_operator.datareader)
 
          # get limit of final limit operator (if one exists)
          final_limit = plan.operators[-1].limit if isinstance(plan.operators[-1], LimitScanOp) else None
@@ -87,13 +84,7 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
          current_scan_idx = self.scan_start_idx
          with ThreadPoolExecutor(max_workers=plan_workers) as executor:
              # create initial (set of) future(s) to read first source record;
-             # construct input DataRecord for DataSourcePhysicalOp
-             # NOTE: this DataRecord will be discarded and replaced by the scan_operator;
-             # it is simply a vessel to inform the scan_operator which record to fetch
-             candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
-             candidate.idx = current_scan_idx
-             candidate.get_item_fn = datasource.get_item
-             futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, candidate))
+             futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, current_scan_idx))
              op_id_to_futures_in_flight[source_op_id] += 1
              current_scan_idx += 1
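
The new palimpzest/query/operators/scan.py (a 150-line file whose body is not shown in this diff) is what makes the plain-integer calling convention above work. As a rough mental model only, a minimal operator matching that convention might look like the following; everything except the ScanPhysicalOp name, the datareader attribute, and the index-based call is invented.

    # Hypothetical sketch of the new scan calling convention; not palimpzest's code.
    class ScanPhysicalOpSketch:
        def __init__(self, datareader):
            # any sequence-like reader supporting __len__ and __getitem__
            self.datareader = datareader

        def __call__(self, scan_idx: int):
            # fetch the item by index directly; no throwaway DataRecord "vessel"
            # (the pre-0.6 pattern removed above) is required anymore
            return self.datareader[scan_idx]

    reader = ["doc-0", "doc-1", "doc-2"]  # stand-in for a DataReader
    scan_op = ScanPhysicalOpSketch(reader)
    print([scan_op(i) for i in range(len(scan_op.datareader))])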
@@ -131,7 +122,8 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
 
                  # add records (which are not filtered) to the cache, if allowed
                  if not self.nocache:
-                     self.datadir.append_cache(operator.target_cache_id, record)
+                     # self.datadir.append_cache(operator.target_cache_id, record)
+                     pass
 
                  # add records to processing queue if there is a next_operator; otherwise add to output_records
                  next_operator = op_id_to_next_operator[op_id]
@@ -145,14 +137,8 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
                      source_records_scanned += len(record_set)
 
                      # scan next record if we can still draw records from source
-                     if source_records_scanned < num_samples and current_scan_idx < datasource_len:
-                         # construct input DataRecord for DataSourcePhysicalOp
-                         # NOTE: this DataRecord will be discarded and replaced by the scan_operator;
-                         # it is simply a vessel to inform the scan_operator which record to fetch
-                         candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
-                         candidate.idx = current_scan_idx
-                         candidate.get_item_fn = datasource.get_item
-                         new_futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, candidate))
+                     if source_records_scanned < num_samples and current_scan_idx < datareader_len:
+                         new_futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, current_scan_idx))
                          op_id_to_futures_in_flight[source_op_id] += 1
                          current_scan_idx += 1
 
@@ -217,8 +203,9 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
 
          # if caching was allowed, close the cache
          if not self.nocache:
-             for operator in plan.operators:
-                 self.datadir.close_cache(operator.target_cache_id)
+             for _ in plan.operators:
+                 # self.datadir.close_cache(operator.target_cache_id)
+                 pass
 
          # finalize plan stats
          total_plan_time = time.time() - plan_start_time
palimpzest/query/execution/single_threaded_execution_strategy.py
@@ -1,13 +1,11 @@
  import time
 
  from palimpzest.core.data.dataclasses import OperatorStats, PlanStats
- from palimpzest.core.elements.records import DataRecord
- from palimpzest.core.lib.schemas import SourceRecord
  from palimpzest.query.execution.execution_strategy import ExecutionStrategy
  from palimpzest.query.operators.aggregate import AggregateOp
- from palimpzest.query.operators.datasource import DataSourcePhysicalOp
  from palimpzest.query.operators.filter import FilterOp
  from palimpzest.query.operators.limit import LimitScanOp
+ from palimpzest.query.operators.scan import ScanPhysicalOp
  from palimpzest.query.optimizer.plan import PhysicalPlan
 
 
@@ -46,14 +44,13 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
          output_records = []
          current_scan_idx = self.scan_start_idx
 
-         # get handle to DataSource and pre-compute its size
+         # get handle to scan operator and pre-compute its size
          source_operator = plan.operators[0]
-         assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp"
-         datasource = source_operator.get_datasource()
-         datasource_len = len(datasource)
+         assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
+         datareader_len = len(source_operator.datareader)
 
          # initialize processing queues for each operation
-         processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, DataSourcePhysicalOp)}
+         processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
 
          # execute the plan one operator at a time
          for op_idx, operator in enumerate(plan.operators):
@@ -64,19 +61,12 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
              # initialize output records and record_op_stats for this operator
              records, record_op_stats = [], []
 
-             # invoke datasource operator(s) until we run out of source records or hit the num_samples limit
-             if isinstance(operator, DataSourcePhysicalOp):
+             # invoke scan operator(s) until we run out of source records or hit the num_samples limit
+             if isinstance(operator, ScanPhysicalOp):
                  keep_scanning_source_records = True
                  while keep_scanning_source_records:
-                     # construct input DataRecord for DataSourcePhysicalOp
-                     # NOTE: this DataRecord will be discarded and replaced by the scan_operator;
-                     # it is simply a vessel to inform the scan_operator which record to fetch
-                     candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
-                     candidate.idx = current_scan_idx
-                     candidate.get_item_fn = datasource.get_item
-
-                     # run DataSourcePhysicalOp on record
-                     record_set = operator(candidate)
+                     # run ScanPhysicalOp on current scan index
+                     record_set = operator(current_scan_idx)
                      records.extend(record_set.data_records)
                      record_op_stats.extend(record_set.record_op_stats)
@@ -84,7 +74,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
                      current_scan_idx += 1
 
                      # update whether to keep scanning source records
-                     keep_scanning_source_records = current_scan_idx < datasource_len and len(records) < num_samples
+                     keep_scanning_source_records = current_scan_idx < datareader_len and len(records) < num_samples
 
              # aggregate operators accept all input records at once
              elif isinstance(operator, AggregateOp):
@@ -113,7 +103,8 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
              if not self.nocache:
                  for record in records:
                      if getattr(record, "passed_operator", True):
-                         self.datadir.append_cache(operator.target_cache_id, record)
+                         # self.datadir.append_cache(operator.target_cache_id, record)
+                         pass
 
              # update processing_queues or output_records
              for record in records:
@@ -130,8 +121,9 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
 
          # if caching was allowed, close the cache
          if not self.nocache:
-             for operator in plan.operators:
-                 self.datadir.close_cache(operator.target_cache_id)
+             for _ in plan.operators:
+                 # self.datadir.close_cache(operator.target_cache_id)
+                 pass
 
          # finalize plan stats
          total_plan_time = time.time() - plan_start_time
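
Condensed, the sequential scan loop above reduces to the following pattern. The operator and length arguments are stand-ins (a real ScanPhysicalOp returns a record set rather than a single value):

    # Distilled from the SequentialSingleThreadExecutionStrategy hunks above.
    def scan_source(operator, datareader_len, num_samples, scan_start_idx=0):
        records = []
        current_scan_idx = scan_start_idx
        keep_scanning_source_records = True
        while keep_scanning_source_records:
            records.append(operator(current_scan_idx))  # operator(int) per the new API
            current_scan_idx += 1
            keep_scanning_source_records = (
                current_scan_idx < datareader_len and len(records) < num_samples
            )
        return records

    # toy scan: stops after three samples from a ten-item reader
    print(scan_source(lambda i: f"record-{i}", datareader_len=10, num_samples=3))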
@@ -181,14 +173,13 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
          source_records_scanned = 0
          current_scan_idx = self.scan_start_idx
 
-         # get handle to DataSource and pre-compute its size
+         # get handle to scan operator and pre-compute its size
          source_operator = plan.operators[0]
-         assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp"
-         datasource = source_operator.get_datasource()
-         datasource_len = len(datasource)
+         assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
+         datareader_len = len(source_operator.datareader)
 
          # initialize processing queues for each operation
-         processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, DataSourcePhysicalOp)}
+         processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
 
          # execute the plan until either:
          # 1. all records have been processed, or
@@ -204,18 +195,11 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
              # create empty lists for records and execution stats generated by executing this operator on its next input(s)
              records, record_op_stats = [], []
 
-             # invoke datasource operator(s) until we run out of source records or hit the num_samples limit
-             if isinstance(operator, DataSourcePhysicalOp):
+             # invoke scan operator(s) until we run out of source records or hit the num_samples limit
+             if isinstance(operator, ScanPhysicalOp):
                  if keep_scanning_source_records:
-                     # construct input DataRecord for DataSourcePhysicalOp
-                     # NOTE: this DataRecord will be discarded and replaced by the scan_operator;
-                     # it is simply a vessel to inform the scan_operator which record to fetch
-                     candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
-                     candidate.idx = current_scan_idx
-                     candidate.get_item_fn = datasource.get_item
-
-                     # run DataSourcePhysicalOp on record
-                     record_set = operator(candidate)
+                     # run ScanPhysicalOp on current scan index
+                     record_set = operator(current_scan_idx)
                      records = record_set.data_records
                      record_op_stats = record_set.record_op_stats
@@ -230,8 +214,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
              elif isinstance(operator, AggregateOp):
                  upstream_ops_are_finished = True
                  for upstream_op_idx in range(op_idx):
-                     # datasources do not have processing queues
-                     if isinstance(plan.operators[upstream_op_idx], DataSourcePhysicalOp):
+                     # scan operators do not have processing queues
+                     if isinstance(plan.operators[upstream_op_idx], ScanPhysicalOp):
                          continue
 
                      # check upstream ops which do have a processing queue
@@ -266,7 +250,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
              if not self.nocache:
                  for record in records:
                      if getattr(record, "passed_operator", True):
-                         self.datadir.append_cache(operator.target_cache_id, record)
+                         # self.datadir.append_cache(operator.target_cache_id, record)
+                         pass
 
              # update processing_queues or output_records
              for record in records:
@@ -279,7 +264,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
 
              # update finished_executing based on whether all records have been processed
              still_processing = any([len(queue) > 0 for queue in processing_queues.values()])
-             keep_scanning_source_records = current_scan_idx < datasource_len and source_records_scanned < num_samples
+             keep_scanning_source_records = current_scan_idx < datareader_len and source_records_scanned < num_samples
              finished_executing = not keep_scanning_source_records and not still_processing
 
              # update finished_executing based on limit
@@ -288,8 +273,9 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
 
          # if caching was allowed, close the cache
          if not self.nocache:
-             for operator in plan.operators:
-                 self.datadir.close_cache(operator.target_cache_id)
+             for _ in plan.operators:
+                 # self.datadir.close_cache(operator.target_cache_id)
+                 pass
 
          # finalize plan stats
          total_plan_time = time.time() - plan_start_time