palimpzest 0.5.4__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +7 -9
- palimpzest/constants.py +47 -7
- palimpzest/core/__init__.py +20 -26
- palimpzest/core/data/dataclasses.py +9 -2
- palimpzest/core/data/datareaders.py +497 -0
- palimpzest/core/elements/records.py +29 -37
- palimpzest/core/lib/fields.py +14 -12
- palimpzest/core/lib/schemas.py +80 -94
- palimpzest/policy.py +58 -0
- palimpzest/prompts/__init__.py +22 -0
- palimpzest/prompts/code_synthesis_prompts.py +28 -0
- palimpzest/prompts/convert_prompts.py +87 -0
- palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
- palimpzest/prompts/filter_prompts.py +69 -0
- palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
- palimpzest/prompts/prompt_factory.py +732 -0
- palimpzest/prompts/util_phrases.py +14 -0
- palimpzest/query/execution/execution_strategy.py +0 -3
- palimpzest/query/execution/parallel_execution_strategy.py +12 -25
- palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
- palimpzest/query/generators/generators.py +71 -347
- palimpzest/query/operators/__init__.py +5 -5
- palimpzest/query/operators/aggregate.py +10 -5
- palimpzest/query/operators/code_synthesis_convert.py +4 -48
- palimpzest/query/operators/convert.py +5 -2
- palimpzest/query/operators/critique_and_refine_convert.py +112 -0
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/logical.py +28 -27
- palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
- palimpzest/query/operators/physical.py +32 -20
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/rag_convert.py +6 -3
- palimpzest/query/operators/retrieve.py +13 -31
- palimpzest/query/operators/scan.py +150 -0
- palimpzest/query/optimizer/__init__.py +5 -1
- palimpzest/query/optimizer/cost_model.py +18 -34
- palimpzest/query/optimizer/optimizer.py +40 -25
- palimpzest/query/optimizer/optimizer_strategy.py +26 -0
- palimpzest/query/optimizer/plan.py +2 -2
- palimpzest/query/optimizer/rules.py +118 -27
- palimpzest/query/processor/config.py +12 -1
- palimpzest/query/processor/mab_sentinel_processor.py +125 -112
- palimpzest/query/processor/nosentinel_processor.py +46 -62
- palimpzest/query/processor/query_processor.py +10 -20
- palimpzest/query/processor/query_processor_factory.py +12 -5
- palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
- palimpzest/query/processor/streaming_processor.py +11 -17
- palimpzest/sets.py +170 -94
- palimpzest/tools/pdfparser.py +5 -64
- palimpzest/utils/datareader_helpers.py +61 -0
- palimpzest/utils/field_helpers.py +69 -0
- palimpzest/utils/hash_helpers.py +3 -2
- palimpzest/utils/udfs.py +0 -28
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/METADATA +49 -49
- palimpzest-0.6.1.dist-info/RECORD +87 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/top_level.txt +0 -1
- cli/README.md +0 -156
- cli/__init__.py +0 -0
- cli/cli_main.py +0 -390
- palimpzest/config.py +0 -89
- palimpzest/core/data/datasources.py +0 -369
- palimpzest/datamanager/__init__.py +0 -0
- palimpzest/datamanager/datamanager.py +0 -300
- palimpzest/prompts.py +0 -397
- palimpzest/query/operators/datasource.py +0 -202
- palimpzest-0.5.4.dist-info/RECORD +0 -83
- palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/LICENSE +0 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""This file contains utility phrases which are templated into many of our prompts."""
|
|
2
|
+
|
|
3
|
+
### FORMATTING INSTRUCTIONS ###
|
|
4
|
+
ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON dictionary. The dictionary should only have the specified output fields."
|
|
5
|
+
ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON list of dictionaries. The list may contain one or more dictionaries, and each dictionary should only have the specified output fields."
|
|
6
|
+
|
|
7
|
+
### REASONING INSTRUCTION FOR IMAGE PROMPTS ###
|
|
8
|
+
COT_REASONING_INSTRUCTION = """Let's think step-by-step in order to answer the question.
|
|
9
|
+
|
|
10
|
+
REASONING: """
|
|
11
|
+
|
|
12
|
+
COT_ANSWER_INSTRUCTION = """Let's think step-by-step in order to answer the question.
|
|
13
|
+
|
|
14
|
+
ANSWER: """
|
|
@@ -4,7 +4,6 @@ from enum import Enum
|
|
|
4
4
|
|
|
5
5
|
from palimpzest.core.data.dataclasses import ExecutionStats, PlanStats
|
|
6
6
|
from palimpzest.core.elements.records import DataRecord
|
|
7
|
-
from palimpzest.datamanager.datamanager import DataDirectory
|
|
8
7
|
from palimpzest.query.optimizer.plan import PhysicalPlan
|
|
9
8
|
|
|
10
9
|
|
|
@@ -23,12 +22,10 @@ class ExecutionStrategy(ABC):
|
|
|
23
22
|
"""
|
|
24
23
|
def __init__(self,
|
|
25
24
|
scan_start_idx: int = 0,
|
|
26
|
-
datadir: DataDirectory | None = None,
|
|
27
25
|
max_workers: int | None = None,
|
|
28
26
|
nocache: bool = True,
|
|
29
27
|
verbose: bool = False):
|
|
30
28
|
self.scan_start_idx = scan_start_idx
|
|
31
|
-
self.datadir = datadir
|
|
32
29
|
self.nocache = nocache
|
|
33
30
|
self.verbose = verbose
|
|
34
31
|
self.max_workers = max_workers
|
|
@@ -4,13 +4,11 @@ from concurrent.futures import ThreadPoolExecutor, wait
|
|
|
4
4
|
|
|
5
5
|
from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
|
|
6
6
|
from palimpzest.core.data.dataclasses import OperatorStats, PlanStats
|
|
7
|
-
from palimpzest.core.elements.records import DataRecord
|
|
8
|
-
from palimpzest.core.lib.schemas import SourceRecord
|
|
9
7
|
from palimpzest.query.execution.execution_strategy import ExecutionStrategy
|
|
10
8
|
from palimpzest.query.operators.aggregate import AggregateOp
|
|
11
|
-
from palimpzest.query.operators.datasource import DataSourcePhysicalOp
|
|
12
9
|
from palimpzest.query.operators.limit import LimitScanOp
|
|
13
10
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
11
|
+
from palimpzest.query.operators.scan import ScanPhysicalOp
|
|
14
12
|
from palimpzest.query.optimizer.plan import PhysicalPlan
|
|
15
13
|
|
|
16
14
|
|
|
@@ -72,12 +70,11 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
|
|
|
72
70
|
}
|
|
73
71
|
op_id_to_op_idx = {op.get_op_id(): idx for idx, op in enumerate(plan.operators)}
|
|
74
72
|
|
|
75
|
-
# get handle to
|
|
73
|
+
# get handle to scan operator and pre-compute its op_id and size
|
|
76
74
|
source_operator = plan.operators[0]
|
|
77
|
-
assert isinstance(source_operator,
|
|
75
|
+
assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
|
|
78
76
|
source_op_id = source_operator.get_op_id()
|
|
79
|
-
|
|
80
|
-
datasource_len = len(datasource)
|
|
77
|
+
datareader_len = len(source_operator.datareader)
|
|
81
78
|
|
|
82
79
|
# get limit of final limit operator (if one exists)
|
|
83
80
|
final_limit = plan.operators[-1].limit if isinstance(plan.operators[-1], LimitScanOp) else None
|
|
@@ -87,13 +84,7 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
|
|
|
87
84
|
current_scan_idx = self.scan_start_idx
|
|
88
85
|
with ThreadPoolExecutor(max_workers=plan_workers) as executor:
|
|
89
86
|
# create initial (set of) future(s) to read first source record;
|
|
90
|
-
|
|
91
|
-
# NOTE: this DataRecord will be discarded and replaced by the scan_operator;
|
|
92
|
-
# it is simply a vessel to inform the scan_operator which record to fetch
|
|
93
|
-
candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
|
|
94
|
-
candidate.idx = current_scan_idx
|
|
95
|
-
candidate.get_item_fn = datasource.get_item
|
|
96
|
-
futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, candidate))
|
|
87
|
+
futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, current_scan_idx))
|
|
97
88
|
op_id_to_futures_in_flight[source_op_id] += 1
|
|
98
89
|
current_scan_idx += 1
|
|
99
90
|
|
|
@@ -131,7 +122,8 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
|
|
|
131
122
|
|
|
132
123
|
# add records (which are not filtered) to the cache, if allowed
|
|
133
124
|
if not self.nocache:
|
|
134
|
-
self.datadir.append_cache(operator.target_cache_id, record)
|
|
125
|
+
# self.datadir.append_cache(operator.target_cache_id, record)
|
|
126
|
+
pass
|
|
135
127
|
|
|
136
128
|
# add records to processing queue if there is a next_operator; otherwise add to output_records
|
|
137
129
|
next_operator = op_id_to_next_operator[op_id]
|
|
@@ -145,14 +137,8 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
|
|
|
145
137
|
source_records_scanned += len(record_set)
|
|
146
138
|
|
|
147
139
|
# scan next record if we can still draw records from source
|
|
148
|
-
if source_records_scanned < num_samples and current_scan_idx <
|
|
149
|
-
|
|
150
|
-
# NOTE: this DataRecord will be discarded and replaced by the scan_operator;
|
|
151
|
-
# it is simply a vessel to inform the scan_operator which record to fetch
|
|
152
|
-
candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
|
|
153
|
-
candidate.idx = current_scan_idx
|
|
154
|
-
candidate.get_item_fn = datasource.get_item
|
|
155
|
-
new_futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, candidate))
|
|
140
|
+
if source_records_scanned < num_samples and current_scan_idx < datareader_len:
|
|
141
|
+
new_futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, current_scan_idx))
|
|
156
142
|
op_id_to_futures_in_flight[source_op_id] += 1
|
|
157
143
|
current_scan_idx += 1
|
|
158
144
|
|
|
@@ -217,8 +203,9 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
|
|
|
217
203
|
|
|
218
204
|
# if caching was allowed, close the cache
|
|
219
205
|
if not self.nocache:
|
|
220
|
-
for
|
|
221
|
-
self.datadir.close_cache(operator.target_cache_id)
|
|
206
|
+
for _ in plan.operators:
|
|
207
|
+
# self.datadir.close_cache(operator.target_cache_id)
|
|
208
|
+
pass
|
|
222
209
|
|
|
223
210
|
# finalize plan stats
|
|
224
211
|
total_plan_time = time.time() - plan_start_time
|
|
@@ -1,13 +1,11 @@
|
|
|
1
1
|
import time
|
|
2
2
|
|
|
3
3
|
from palimpzest.core.data.dataclasses import OperatorStats, PlanStats
|
|
4
|
-
from palimpzest.core.elements.records import DataRecord
|
|
5
|
-
from palimpzest.core.lib.schemas import SourceRecord
|
|
6
4
|
from palimpzest.query.execution.execution_strategy import ExecutionStrategy
|
|
7
5
|
from palimpzest.query.operators.aggregate import AggregateOp
|
|
8
|
-
from palimpzest.query.operators.datasource import DataSourcePhysicalOp
|
|
9
6
|
from palimpzest.query.operators.filter import FilterOp
|
|
10
7
|
from palimpzest.query.operators.limit import LimitScanOp
|
|
8
|
+
from palimpzest.query.operators.scan import ScanPhysicalOp
|
|
11
9
|
from palimpzest.query.optimizer.plan import PhysicalPlan
|
|
12
10
|
|
|
13
11
|
|
|
@@ -46,14 +44,13 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
46
44
|
output_records = []
|
|
47
45
|
current_scan_idx = self.scan_start_idx
|
|
48
46
|
|
|
49
|
-
# get handle to
|
|
47
|
+
# get handle to scan operator and pre-compute its size
|
|
50
48
|
source_operator = plan.operators[0]
|
|
51
|
-
assert isinstance(source_operator,
|
|
52
|
-
|
|
53
|
-
datasource_len = len(datasource)
|
|
49
|
+
assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
|
|
50
|
+
datareader_len = len(source_operator.datareader)
|
|
54
51
|
|
|
55
52
|
# initialize processing queues for each operation
|
|
56
|
-
processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op,
|
|
53
|
+
processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
|
|
57
54
|
|
|
58
55
|
# execute the plan one operator at a time
|
|
59
56
|
for op_idx, operator in enumerate(plan.operators):
|
|
@@ -64,19 +61,12 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
64
61
|
# initialize output records and record_op_stats for this operator
|
|
65
62
|
records, record_op_stats = [], []
|
|
66
63
|
|
|
67
|
-
# invoke
|
|
68
|
-
if isinstance(operator,
|
|
64
|
+
# invoke scan operator(s) until we run out of source records or hit the num_samples limit
|
|
65
|
+
if isinstance(operator, ScanPhysicalOp):
|
|
69
66
|
keep_scanning_source_records = True
|
|
70
67
|
while keep_scanning_source_records:
|
|
71
|
-
#
|
|
72
|
-
|
|
73
|
-
# it is simply a vessel to inform the scan_operator which record to fetch
|
|
74
|
-
candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
|
|
75
|
-
candidate.idx = current_scan_idx
|
|
76
|
-
candidate.get_item_fn = datasource.get_item
|
|
77
|
-
|
|
78
|
-
# run DataSourcePhysicalOp on record
|
|
79
|
-
record_set = operator(candidate)
|
|
68
|
+
# run ScanPhysicalOp on current scan index
|
|
69
|
+
record_set = operator(current_scan_idx)
|
|
80
70
|
records.extend(record_set.data_records)
|
|
81
71
|
record_op_stats.extend(record_set.record_op_stats)
|
|
82
72
|
|
|
@@ -84,7 +74,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
84
74
|
current_scan_idx += 1
|
|
85
75
|
|
|
86
76
|
# update whether to keep scanning source records
|
|
87
|
-
keep_scanning_source_records = current_scan_idx <
|
|
77
|
+
keep_scanning_source_records = current_scan_idx < datareader_len and len(records) < num_samples
|
|
88
78
|
|
|
89
79
|
# aggregate operators accept all input records at once
|
|
90
80
|
elif isinstance(operator, AggregateOp):
|
|
@@ -113,7 +103,8 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
113
103
|
if not self.nocache:
|
|
114
104
|
for record in records:
|
|
115
105
|
if getattr(record, "passed_operator", True):
|
|
116
|
-
self.datadir.append_cache(operator.target_cache_id, record)
|
|
106
|
+
# self.datadir.append_cache(operator.target_cache_id, record)
|
|
107
|
+
pass
|
|
117
108
|
|
|
118
109
|
# update processing_queues or output_records
|
|
119
110
|
for record in records:
|
|
@@ -130,8 +121,9 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
130
121
|
|
|
131
122
|
# if caching was allowed, close the cache
|
|
132
123
|
if not self.nocache:
|
|
133
|
-
for
|
|
134
|
-
self.datadir.close_cache(operator.target_cache_id)
|
|
124
|
+
for _ in plan.operators:
|
|
125
|
+
# self.datadir.close_cache(operator.target_cache_id)
|
|
126
|
+
pass
|
|
135
127
|
|
|
136
128
|
# finalize plan stats
|
|
137
129
|
total_plan_time = time.time() - plan_start_time
|
|
@@ -181,14 +173,13 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
181
173
|
source_records_scanned = 0
|
|
182
174
|
current_scan_idx = self.scan_start_idx
|
|
183
175
|
|
|
184
|
-
# get handle to
|
|
176
|
+
# get handle to scan operator and pre-compute its size
|
|
185
177
|
source_operator = plan.operators[0]
|
|
186
|
-
assert isinstance(source_operator,
|
|
187
|
-
|
|
188
|
-
datasource_len = len(datasource)
|
|
178
|
+
assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
|
|
179
|
+
datareader_len = len(source_operator.datareader)
|
|
189
180
|
|
|
190
181
|
# initialize processing queues for each operation
|
|
191
|
-
processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op,
|
|
182
|
+
processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
|
|
192
183
|
|
|
193
184
|
# execute the plan until either:
|
|
194
185
|
# 1. all records have been processed, or
|
|
@@ -204,18 +195,11 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
204
195
|
# create empty lists for records and execution stats generated by executing this operator on its next input(s)
|
|
205
196
|
records, record_op_stats = [], []
|
|
206
197
|
|
|
207
|
-
# invoke
|
|
208
|
-
if isinstance(operator,
|
|
198
|
+
# invoke scan operator(s) until we run out of source records or hit the num_samples limit
|
|
199
|
+
if isinstance(operator, ScanPhysicalOp):
|
|
209
200
|
if keep_scanning_source_records:
|
|
210
|
-
#
|
|
211
|
-
|
|
212
|
-
# it is simply a vessel to inform the scan_operator which record to fetch
|
|
213
|
-
candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
|
|
214
|
-
candidate.idx = current_scan_idx
|
|
215
|
-
candidate.get_item_fn = datasource.get_item
|
|
216
|
-
|
|
217
|
-
# run DataSourcePhysicalOp on record
|
|
218
|
-
record_set = operator(candidate)
|
|
201
|
+
# run ScanPhysicalOp on current scan index
|
|
202
|
+
record_set = operator(current_scan_idx)
|
|
219
203
|
records = record_set.data_records
|
|
220
204
|
record_op_stats = record_set.record_op_stats
|
|
221
205
|
|
|
@@ -230,8 +214,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
230
214
|
elif isinstance(operator, AggregateOp):
|
|
231
215
|
upstream_ops_are_finished = True
|
|
232
216
|
for upstream_op_idx in range(op_idx):
|
|
233
|
-
#
|
|
234
|
-
if isinstance(plan.operators[upstream_op_idx],
|
|
217
|
+
# scan operators do not have processing queues
|
|
218
|
+
if isinstance(plan.operators[upstream_op_idx], ScanPhysicalOp):
|
|
235
219
|
continue
|
|
236
220
|
|
|
237
221
|
# check upstream ops which do have a processing queue
|
|
@@ -266,7 +250,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
266
250
|
if not self.nocache:
|
|
267
251
|
for record in records:
|
|
268
252
|
if getattr(record, "passed_operator", True):
|
|
269
|
-
self.datadir.append_cache(operator.target_cache_id, record)
|
|
253
|
+
# self.datadir.append_cache(operator.target_cache_id, record)
|
|
254
|
+
pass
|
|
270
255
|
|
|
271
256
|
# update processing_queues or output_records
|
|
272
257
|
for record in records:
|
|
@@ -279,7 +264,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
279
264
|
|
|
280
265
|
# update finished_executing based on whether all records have been processed
|
|
281
266
|
still_processing = any([len(queue) > 0 for queue in processing_queues.values()])
|
|
282
|
-
keep_scanning_source_records = current_scan_idx <
|
|
267
|
+
keep_scanning_source_records = current_scan_idx < datareader_len and source_records_scanned < num_samples
|
|
283
268
|
finished_executing = not keep_scanning_source_records and not still_processing
|
|
284
269
|
|
|
285
270
|
# update finished_executing based on limit
|
|
@@ -288,8 +273,9 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
|
|
|
288
273
|
|
|
289
274
|
# if caching was allowed, close the cache
|
|
290
275
|
if not self.nocache:
|
|
291
|
-
for
|
|
292
|
-
self.datadir.close_cache(operator.target_cache_id)
|
|
276
|
+
for _ in plan.operators:
|
|
277
|
+
# self.datadir.close_cache(operator.target_cache_id)
|
|
278
|
+
pass
|
|
293
279
|
|
|
294
280
|
# finalize plan stats
|
|
295
281
|
total_plan_time = time.time() - plan_start_time
|