palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +343 -209
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +639 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +62 -6
- palimpzest/prompts/filter_prompts.py +51 -6
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
- palimpzest/prompts/prompt_factory.py +375 -47
- palimpzest/prompts/split_proposer_prompts.py +1 -1
- palimpzest/prompts/util_phrases.py +5 -0
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +160 -331
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +33 -19
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +26 -16
- palimpzest/query/operators/join.py +403 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +205 -77
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +42 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +32 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
- palimpzest-0.8.1.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.21.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
|
@@ -1,488 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
|
-
from palimpzest.constants import Cardinality, GPT_4o_MODEL_CARD, Model
|
|
6
|
-
from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
|
|
7
|
-
from palimpzest.core.elements.records import DataRecord
|
|
8
|
-
from palimpzest.prompts import ADVICEGEN_PROMPT, CODEGEN_PROMPT, EXAMPLE_PROMPT
|
|
9
|
-
from palimpzest.query.generators.generators import code_ensemble_execution, generator_factory
|
|
10
|
-
from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded
|
|
11
|
-
from palimpzest.utils.sandbox import API
|
|
12
|
-
|
|
13
|
-
# TYPE DEFINITIONS
|
|
14
|
-
FieldName = str
|
|
15
|
-
CodeName = str
|
|
16
|
-
Code = str
|
|
17
|
-
DataRecordDict = dict[str, Any]
|
|
18
|
-
Exemplar = tuple[DataRecordDict, DataRecordDict]
|
|
19
|
-
CodeEnsemble = dict[CodeName, Code]
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class CodeSynthesisConvert(LLMConvert):
|
|
23
|
-
def __init__(
|
|
24
|
-
self,
|
|
25
|
-
exemplar_generation_model: Model = Model.GPT_4o,
|
|
26
|
-
code_synth_model: Model = Model.GPT_4o,
|
|
27
|
-
fallback_model: Model = Model.GPT_4o_MINI,
|
|
28
|
-
*args,
|
|
29
|
-
**kwargs,
|
|
30
|
-
):
|
|
31
|
-
kwargs["model"] = None
|
|
32
|
-
super().__init__(*args, **kwargs)
|
|
33
|
-
|
|
34
|
-
# set models
|
|
35
|
-
self.exemplar_generation_model = exemplar_generation_model
|
|
36
|
-
self.code_synth_model = code_synth_model
|
|
37
|
-
self.fallback_model = fallback_model
|
|
38
|
-
|
|
39
|
-
# initialize parameters
|
|
40
|
-
self.field_to_code_ensemble = None
|
|
41
|
-
self.exemplars = []
|
|
42
|
-
self.code_synthesized = False
|
|
43
|
-
self.code_champion_generator = generator_factory(
|
|
44
|
-
model=self.code_synth_model,
|
|
45
|
-
prompt_strategy=self.prompt_strategy,
|
|
46
|
-
cardinality=Cardinality.ONE_TO_ONE,
|
|
47
|
-
verbose=self.verbose,
|
|
48
|
-
)
|
|
49
|
-
self.field_to_code_ensemble = {}
|
|
50
|
-
|
|
51
|
-
def __str__(self):
|
|
52
|
-
op = super().__str__()
|
|
53
|
-
op += f" Code Synth Strategy: {self.__class__.__name__}\n"
|
|
54
|
-
return op
|
|
55
|
-
|
|
56
|
-
def get_id_params(self):
|
|
57
|
-
id_params = super().get_id_params()
|
|
58
|
-
id_params = {
|
|
59
|
-
"exemplar_generation_model": self.exemplar_generation_model.value,
|
|
60
|
-
"code_synth_model": self.code_synth_model.value,
|
|
61
|
-
"fallback_model": self.fallback_model.value,
|
|
62
|
-
**id_params,
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
return id_params
|
|
66
|
-
|
|
67
|
-
def get_op_params(self):
|
|
68
|
-
op_params = super().get_op_params()
|
|
69
|
-
op_params = {
|
|
70
|
-
"exemplar_generation_model": self.exemplar_generation_model,
|
|
71
|
-
"code_synth_model": self.code_synth_model,
|
|
72
|
-
"fallback_model": self.fallback_model,
|
|
73
|
-
**op_params,
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
return op_params
|
|
77
|
-
|
|
78
|
-
def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
|
|
79
|
-
"""
|
|
80
|
-
Currently we are using GPT-4 to generate code which we can then invoke on subsequent
|
|
81
|
-
inputs to this operator. To reflect this in our naive cost estimates, we assume that
|
|
82
|
-
the time_per_record is low (about the time it takes to execute a cheap python function)
|
|
83
|
-
and that the cost_per_record is also low (we amortize the cost of code generation across
|
|
84
|
-
all records). For our quality estimate, we naively assume some degredation in quality.
|
|
85
|
-
In practice, this naive quality estimate will be overwritten by the CostModel's estimate
|
|
86
|
-
once it executes a few code generated examples.
|
|
87
|
-
"""
|
|
88
|
-
naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
|
|
89
|
-
naive_op_cost_estimates.time_per_record = 1e-5
|
|
90
|
-
naive_op_cost_estimates.time_per_record_lower_bound = 1e-5
|
|
91
|
-
naive_op_cost_estimates.time_per_record_upper_bound = 1e-5
|
|
92
|
-
naive_op_cost_estimates.cost_per_record = 1e-6 # amortize code synth cost across records
|
|
93
|
-
naive_op_cost_estimates.cost_per_record_lower_bound = 1e-6
|
|
94
|
-
naive_op_cost_estimates.cost_per_record_upper_bound = 1e-6
|
|
95
|
-
naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * (GPT_4o_MODEL_CARD["code"] / 100.0) * 0.7
|
|
96
|
-
naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
|
|
97
|
-
naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
|
|
98
|
-
|
|
99
|
-
return naive_op_cost_estimates
|
|
100
|
-
|
|
101
|
-
def _should_synthesize(
|
|
102
|
-
self, exemplars: list[Exemplar], num_exemplars: int = 1, code_regenerate_frequency: int = 200, *args, **kwargs
|
|
103
|
-
) -> bool:
|
|
104
|
-
"""This function determines whether code synthesis should be performed based on the strategy and the number of exemplars available."""
|
|
105
|
-
raise NotImplementedError("This method should be implemented in a subclass")
|
|
106
|
-
|
|
107
|
-
def _synthesize_field_code(
|
|
108
|
-
self,
|
|
109
|
-
candidate: DataRecord,
|
|
110
|
-
api: API,
|
|
111
|
-
output_field_name: str,
|
|
112
|
-
code_ensemble_num: int = 1, # if strategy != SINGLE
|
|
113
|
-
num_exemplars: int = 1, # if strategy != EXAMPLE_ENSEMBLE
|
|
114
|
-
) -> tuple[dict[CodeName, Code], GenerationStats]:
|
|
115
|
-
"""This method is responsible for synthesizing the code on a per-field basis.
|
|
116
|
-
Wrapping different calls to the LLM and returning a set of per-field query statistics.
|
|
117
|
-
The format of the code ensemble dictionary is {code_name: code} where code_name is a string and code is a string representing the code.
|
|
118
|
-
"""
|
|
119
|
-
raise NotImplementedError("This method should be implemented in a subclass")
|
|
120
|
-
|
|
121
|
-
def synthesize_code_ensemble(self, fields_to_generate, candidate: DataRecord, *args, **kwargs):
|
|
122
|
-
"""This function is a wrapper around specific code synthesis methods
|
|
123
|
-
that wraps the synthesized code per-field in a dictionary and returns the stats object.
|
|
124
|
-
"""
|
|
125
|
-
# synthesize the per-field code ensembles
|
|
126
|
-
field_to_code_ensemble = {}
|
|
127
|
-
generation_stats = GenerationStats()
|
|
128
|
-
for field_name in fields_to_generate:
|
|
129
|
-
api = API.from_input_output_schemas(
|
|
130
|
-
input_schema=self.input_schema,
|
|
131
|
-
output_schema=self.output_schema,
|
|
132
|
-
field_name=field_name,
|
|
133
|
-
input_fields=candidate.get_field_names(),
|
|
134
|
-
)
|
|
135
|
-
|
|
136
|
-
# TODO here _synthesize_code should be called with the right parameters per-code-strategy?!
|
|
137
|
-
code_ensemble, code_synth_stats = self._synthesize_field_code(candidate, api, field_name)
|
|
138
|
-
|
|
139
|
-
# update mapping from fields to code ensemble and generation stats
|
|
140
|
-
field_to_code_ensemble[field_name] = code_ensemble
|
|
141
|
-
generation_stats += code_synth_stats
|
|
142
|
-
|
|
143
|
-
if self.verbose:
|
|
144
|
-
for code_name, code in code_ensemble.items():
|
|
145
|
-
print(f"CODE NAME: {code_name}")
|
|
146
|
-
print("-----------------------")
|
|
147
|
-
print(code)
|
|
148
|
-
|
|
149
|
-
# set field_to_code_ensemble and code_synthesized to True
|
|
150
|
-
return field_to_code_ensemble, generation_stats
|
|
151
|
-
|
|
152
|
-
def _bonded_query_fallback(
|
|
153
|
-
self, candidate: DataRecord
|
|
154
|
-
) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
|
|
155
|
-
fields_to_generate = self.get_fields_to_generate(candidate)
|
|
156
|
-
projected_candidate = candidate.copy(include_bytes=False, project_cols=self.depends_on)
|
|
157
|
-
|
|
158
|
-
# execute the bonded convert
|
|
159
|
-
bonded_op = LLMConvertBonded(
|
|
160
|
-
input_schema=self.input_schema,
|
|
161
|
-
output_schema=self.output_schema,
|
|
162
|
-
model=self.exemplar_generation_model,
|
|
163
|
-
prompt_strategy=self.prompt_strategy,
|
|
164
|
-
)
|
|
165
|
-
field_answers, generation_stats = bonded_op.convert(projected_candidate, fields_to_generate)
|
|
166
|
-
assert all([field in field_answers for field in fields_to_generate]), "Not all fields were generated!"
|
|
167
|
-
|
|
168
|
-
# for the vanilla LLMConvert, we simply replace any None values with an empty list
|
|
169
|
-
field_answers = {field: [] if answers is None else answers for field, answers in field_answers.items()}
|
|
170
|
-
|
|
171
|
-
# transform the mapping from fields to answers into a (list of) DataRecord(s)
|
|
172
|
-
drs, _ = self._create_data_records_from_field_answers(field_answers, candidate)
|
|
173
|
-
|
|
174
|
-
# NOTE: this now includes bytes input fields which will show up as: `field_name = "<bytes>"`;
|
|
175
|
-
# keep an eye out for a regression in code synth performance and revert if necessary
|
|
176
|
-
# update operator's set of exemplars
|
|
177
|
-
exemplars = [(projected_candidate.to_dict(include_bytes=False), dr.to_dict(include_bytes=False)) for dr in drs]
|
|
178
|
-
self.exemplars.extend(exemplars)
|
|
179
|
-
|
|
180
|
-
return field_answers, generation_stats
|
|
181
|
-
|
|
182
|
-
def is_image_conversion(self):
|
|
183
|
-
"""Code synthesis is disallowed on image conversions, so this must be False."""
|
|
184
|
-
return False
|
|
185
|
-
|
|
186
|
-
def convert(
|
|
187
|
-
self, candidate: DataRecord, fields: list[str] | None = None
|
|
188
|
-
) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
|
|
189
|
-
# get the dictionary fields for the candidate
|
|
190
|
-
candidate_dict = candidate.to_dict(include_bytes=False, project_cols=self.depends_on)
|
|
191
|
-
|
|
192
|
-
# Check if code was already synthesized, or if we have at least one converted sample
|
|
193
|
-
generation_stats = GenerationStats()
|
|
194
|
-
if self._should_synthesize():
|
|
195
|
-
self.field_to_code_ensemble, total_code_synth_stats = self.synthesize_code_ensemble(fields, candidate)
|
|
196
|
-
self.code_synthesized = True
|
|
197
|
-
generation_stats += total_code_synth_stats
|
|
198
|
-
|
|
199
|
-
# if we have yet to synthesize code (perhaps b/c we are waiting for more exemplars),
|
|
200
|
-
# use the exemplar generation model to perform the convert (and generate high-quality
|
|
201
|
-
# exemplars) using a bonded query
|
|
202
|
-
if not len(self.field_to_code_ensemble):
|
|
203
|
-
return self._bonded_query_fallback(candidate)
|
|
204
|
-
|
|
205
|
-
# if we have synthesized code run it on each field
|
|
206
|
-
field_answers = {}
|
|
207
|
-
for field_name in fields:
|
|
208
|
-
# create api instance for executing python code
|
|
209
|
-
api = API.from_input_output_schemas(
|
|
210
|
-
input_schema=self.input_schema,
|
|
211
|
-
output_schema=self.output_schema,
|
|
212
|
-
field_name=field_name,
|
|
213
|
-
input_fields=candidate_dict.keys(),
|
|
214
|
-
)
|
|
215
|
-
code_ensemble = self.field_to_code_ensemble[field_name]
|
|
216
|
-
|
|
217
|
-
# execute the code ensemble to get the answer
|
|
218
|
-
answer, _, exec_stats = code_ensemble_execution(api, code_ensemble, candidate_dict)
|
|
219
|
-
|
|
220
|
-
# if the answer is not None, update the field_answers
|
|
221
|
-
# NOTE: the answer will not be a list because code synth. is disallowed for one-to-many converts
|
|
222
|
-
if answer is not None:
|
|
223
|
-
generation_stats += exec_stats
|
|
224
|
-
field_answers[field_name] = [answer]
|
|
225
|
-
|
|
226
|
-
else:
|
|
227
|
-
# if there is a failure, run a conventional llm convert query for the field
|
|
228
|
-
if self.verbose:
|
|
229
|
-
print(f"CODEGEN FALLING BACK TO CONVENTIONAL FOR FIELD {field_name}")
|
|
230
|
-
|
|
231
|
-
# execute the conventional llm convert
|
|
232
|
-
convert_op = LLMConvertBonded(
|
|
233
|
-
input_schema=self.input_schema,
|
|
234
|
-
output_schema=self.output_schema,
|
|
235
|
-
model=self.fallback_model,
|
|
236
|
-
prompt_strategy=self.prompt_strategy,
|
|
237
|
-
)
|
|
238
|
-
single_field_answers, single_field_stats = convert_op.convert(candidate, [field_name])
|
|
239
|
-
|
|
240
|
-
# include code execution time in single_field_stats
|
|
241
|
-
single_field_stats.fn_call_duration_secs += exec_stats.fn_call_duration_secs
|
|
242
|
-
|
|
243
|
-
# update generation_stats
|
|
244
|
-
generation_stats += single_field_stats
|
|
245
|
-
|
|
246
|
-
# update field answers
|
|
247
|
-
# NOTE: because code synth. is disallowed for one-to-many queries, we make the first answer a singleton
|
|
248
|
-
field_answers[field_name] = (
|
|
249
|
-
[single_field_answers[field_name][0]]
|
|
250
|
-
if single_field_answers[field_name] is not None and len(single_field_answers[field_name]) > 0
|
|
251
|
-
else []
|
|
252
|
-
)
|
|
253
|
-
|
|
254
|
-
assert all([field in field_answers for field in fields]), "Not all fields were generated!"
|
|
255
|
-
|
|
256
|
-
# for the vanilla LLMConvert, we simply replace any None values with an empty list
|
|
257
|
-
field_answers = {field: [] if answers is None else answers for field, answers in field_answers.items()}
|
|
258
|
-
|
|
259
|
-
return field_answers, generation_stats
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
class CodeSynthesisConvertNone(CodeSynthesisConvert):
|
|
263
|
-
def _should_synthesize(self, *args, **kwargs):
|
|
264
|
-
return False
|
|
265
|
-
|
|
266
|
-
def _synthesize_field_code(self, candidate: DataRecord, api: API, *args, **kwargs):
|
|
267
|
-
code = api.api_def() + " return None\n"
|
|
268
|
-
code_ensemble = {"{api.name}_v0": code}
|
|
269
|
-
return code_ensemble, GenerationStats()
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
class CodeSynthesisConvertSingle(CodeSynthesisConvert):
|
|
273
|
-
def _should_synthesize(self, num_exemplars: int = 1, *args, **kwargs) -> bool:
|
|
274
|
-
"""This function determines whether code synthesis
|
|
275
|
-
should be performed based on the strategy and the number of exemplars available."""
|
|
276
|
-
if len(self.exemplars) < num_exemplars:
|
|
277
|
-
return False
|
|
278
|
-
return not self.code_synthesized
|
|
279
|
-
|
|
280
|
-
def _code_synth_single(
|
|
281
|
-
self,
|
|
282
|
-
candidate: DataRecord,
|
|
283
|
-
api: API,
|
|
284
|
-
output_field_name: str,
|
|
285
|
-
exemplars: list[Exemplar] | None = None,
|
|
286
|
-
advice: str | None = None,
|
|
287
|
-
language="Python",
|
|
288
|
-
):
|
|
289
|
-
if exemplars is None:
|
|
290
|
-
exemplars = []
|
|
291
|
-
|
|
292
|
-
context = {
|
|
293
|
-
"language": language,
|
|
294
|
-
"api": api.args_call(),
|
|
295
|
-
"output": api.output,
|
|
296
|
-
"inputs_desc": "\n".join(
|
|
297
|
-
[f"- {field_name} ({api.input_descs[i]})" for i, field_name in enumerate(api.inputs)]
|
|
298
|
-
),
|
|
299
|
-
"output_desc": api.output_desc,
|
|
300
|
-
"examples_desc": "\n".join(
|
|
301
|
-
[
|
|
302
|
-
EXAMPLE_PROMPT.format(
|
|
303
|
-
idx=f" {i}",
|
|
304
|
-
example_inputs="\n".join(
|
|
305
|
-
[f"- {field_name} = {repr(example[0][field_name])}" for field_name in example[0]]
|
|
306
|
-
),
|
|
307
|
-
example_output=f"{example[1][output_field_name]}",
|
|
308
|
-
)
|
|
309
|
-
for i, example in enumerate(exemplars)
|
|
310
|
-
]
|
|
311
|
-
),
|
|
312
|
-
"advice": f"Hint: {advice}" if advice else "",
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
prompt = CODEGEN_PROMPT.format(**context)
|
|
316
|
-
if self.verbose:
|
|
317
|
-
print("PROMPT")
|
|
318
|
-
print("-------")
|
|
319
|
-
print(f"{prompt}")
|
|
320
|
-
|
|
321
|
-
# set prompt for generator
|
|
322
|
-
gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}
|
|
323
|
-
|
|
324
|
-
# invoke the champion model to generate the code
|
|
325
|
-
pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
|
|
326
|
-
ordered_keys = [f"```{language}", f"```{language.lower()}", "```"]
|
|
327
|
-
code = None
|
|
328
|
-
if not pred:
|
|
329
|
-
return code, stats
|
|
330
|
-
|
|
331
|
-
for key in ordered_keys:
|
|
332
|
-
if key in pred:
|
|
333
|
-
code = pred.split(key)[1].split("```")[0].strip()
|
|
334
|
-
break
|
|
335
|
-
|
|
336
|
-
if self.verbose:
|
|
337
|
-
print("-------")
|
|
338
|
-
print("SYNTHESIZED CODE")
|
|
339
|
-
print("---------------")
|
|
340
|
-
print(f"{code}")
|
|
341
|
-
|
|
342
|
-
return code, stats
|
|
343
|
-
|
|
344
|
-
def _synthesize_field_code(
|
|
345
|
-
self, candidate: DataRecord, api: API, output_field_name: str, num_exemplars: int = 1, *args, **kwargs
|
|
346
|
-
):
|
|
347
|
-
code, generation_stats = self._code_synth_single(
|
|
348
|
-
candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars]
|
|
349
|
-
)
|
|
350
|
-
code_ensemble = {f"{api.name}_v0": code}
|
|
351
|
-
return code_ensemble, generation_stats
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
# NOTE A nicer truly class based approach would re-implement the code_synth_single method with calls to
|
|
355
|
-
# __super__ and then only re-implement the differences instead of having the code in the superclass know
|
|
356
|
-
# about the subclass-specific parameters (i.e., advice).
|
|
357
|
-
class CodeSynthesisConvertExampleEnsemble(CodeSynthesisConvertSingle):
|
|
358
|
-
def _should_synthesize(self, num_exemplars: int = 1, *args, **kwargs) -> bool:
|
|
359
|
-
if len(self.exemplars) < num_exemplars:
|
|
360
|
-
return False
|
|
361
|
-
return not self.code_synthesized
|
|
362
|
-
|
|
363
|
-
def _synthesize_field_code(
|
|
364
|
-
self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, *args, **kwargs
|
|
365
|
-
):
|
|
366
|
-
# creates an ensemble of `code_ensemble_num` synthesized functions; each of
|
|
367
|
-
# which uses a different exemplar (modulo the # of exemplars) for its synthesis
|
|
368
|
-
code_ensemble = {}
|
|
369
|
-
generation_stats = GenerationStats()
|
|
370
|
-
for i in range(code_ensemble_num):
|
|
371
|
-
code_name = f"{api.name}_v{i}"
|
|
372
|
-
exemplar = self.exemplars[i % len(self.exemplars)]
|
|
373
|
-
code, stats = self._code_synth_single(candidate, api, output_field_name, exemplars=[exemplar])
|
|
374
|
-
code_ensemble[code_name] = code
|
|
375
|
-
generation_stats += stats
|
|
376
|
-
|
|
377
|
-
return code_ensemble, generation_stats
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
class CodeSynthesisConvertAdviceEnsemble(CodeSynthesisConvertSingle):
|
|
381
|
-
def _should_synthesize(self, *args, **kwargs):
|
|
382
|
-
return False
|
|
383
|
-
|
|
384
|
-
def _parse_multiple_outputs(self, text, outputs=None):
|
|
385
|
-
if outputs is None:
|
|
386
|
-
outputs = ["Thought", "Action"]
|
|
387
|
-
data = {}
|
|
388
|
-
for key in reversed(outputs):
|
|
389
|
-
if key + ":" in text:
|
|
390
|
-
remain, value = text.rsplit(key + ":", 1)
|
|
391
|
-
data[key.lower()] = value.strip()
|
|
392
|
-
text = remain
|
|
393
|
-
else:
|
|
394
|
-
data[key.lower()] = None
|
|
395
|
-
return data
|
|
396
|
-
|
|
397
|
-
def _synthesize_advice(
|
|
398
|
-
self,
|
|
399
|
-
candidate: DataRecord,
|
|
400
|
-
api: API,
|
|
401
|
-
output_field_name: str,
|
|
402
|
-
exemplars: list[Exemplar] | None = None,
|
|
403
|
-
language="Python",
|
|
404
|
-
n_advices=4,
|
|
405
|
-
limit: int = 3,
|
|
406
|
-
):
|
|
407
|
-
if exemplars is None:
|
|
408
|
-
exemplars = []
|
|
409
|
-
context = {
|
|
410
|
-
"language": language,
|
|
411
|
-
"api": api.args_call(),
|
|
412
|
-
"output": api.output,
|
|
413
|
-
"inputs_desc": "\n".join(
|
|
414
|
-
[f"- {field_name} ({api.input_descs[i]})" for i, field_name in enumerate(api.inputs)]
|
|
415
|
-
),
|
|
416
|
-
"output_desc": api.output_desc,
|
|
417
|
-
"examples_desc": "\n".join(
|
|
418
|
-
[
|
|
419
|
-
EXAMPLE_PROMPT.format(
|
|
420
|
-
idx=f" {i}",
|
|
421
|
-
example_inputs="\n".join(
|
|
422
|
-
[f"- {field_name} = {repr(example[0][field_name])}" for field_name in example[0]]
|
|
423
|
-
),
|
|
424
|
-
example_output=f"{example[1][output_field_name]}",
|
|
425
|
-
)
|
|
426
|
-
for i, example in enumerate(exemplars)
|
|
427
|
-
]
|
|
428
|
-
),
|
|
429
|
-
"n": n_advices,
|
|
430
|
-
}
|
|
431
|
-
prompt = ADVICEGEN_PROMPT.format(**context)
|
|
432
|
-
|
|
433
|
-
# set prompt for generator
|
|
434
|
-
gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}
|
|
435
|
-
|
|
436
|
-
pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
|
|
437
|
-
advs = self._parse_multiple_outputs(pred, outputs=[f"Idea {i}" for i in range(1, limit + 1)])
|
|
438
|
-
|
|
439
|
-
return advs, stats
|
|
440
|
-
|
|
441
|
-
def _synthesize_field_code(
|
|
442
|
-
self,
|
|
443
|
-
candidate: DataRecord,
|
|
444
|
-
api: API,
|
|
445
|
-
output_field_name: str,
|
|
446
|
-
code_ensemble_num: int = 1,
|
|
447
|
-
num_exemplars: int = 1,
|
|
448
|
-
*args,
|
|
449
|
-
**kwargs,
|
|
450
|
-
):
|
|
451
|
-
# a more advanced approach in which advice is first solicited, and then
|
|
452
|
-
# provided as context when synthesizing the code ensemble
|
|
453
|
-
output_stats = {}
|
|
454
|
-
# solicit advice
|
|
455
|
-
advices, adv_stats = self._synthesize_advice(
|
|
456
|
-
candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars], n_advices=code_ensemble_num
|
|
457
|
-
)
|
|
458
|
-
for key, value in adv_stats.items():
|
|
459
|
-
if isinstance(value, dict):
|
|
460
|
-
for k2, v2 in value.items():
|
|
461
|
-
output_stats[k2] = output_stats.get(k2, 0) + v2
|
|
462
|
-
else:
|
|
463
|
-
output_stats[key] += output_stats.get(key, type(value)()) + value
|
|
464
|
-
|
|
465
|
-
code_ensemble = {}
|
|
466
|
-
# synthesize code ensemble
|
|
467
|
-
for i, adv in enumerate(advices):
|
|
468
|
-
code_name = f"{api.name}_v{i}"
|
|
469
|
-
code, stats = self._code_synth_single(
|
|
470
|
-
candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars], advice=adv
|
|
471
|
-
)
|
|
472
|
-
code_ensemble[code_name] = code
|
|
473
|
-
for key in output_stats:
|
|
474
|
-
output_stats[key] += stats[key]
|
|
475
|
-
return code_ensemble, output_stats
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
class CodeSynthesisConvertAdviceEnsembleValidation(CodeSynthesisConvert):
|
|
479
|
-
def _should_synthesize(self, code_regenerate_frequency: int = 200, *args, **kwargs):
|
|
480
|
-
return len(self.exemplars) % code_regenerate_frequency == 0
|
|
481
|
-
|
|
482
|
-
def _synthesize_field_code(
|
|
483
|
-
self, api: API, output_field_name: str, exemplars: list[Exemplar] = None, *args, **kwargs
|
|
484
|
-
):
|
|
485
|
-
# TODO this was not implemented ?
|
|
486
|
-
if exemplars is None:
|
|
487
|
-
exemplars = []
|
|
488
|
-
raise Exception("not implemented yet")
|
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import time
|
|
4
|
-
from typing import Callable
|
|
5
|
-
|
|
6
|
-
from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
|
|
7
|
-
from palimpzest.core.elements.records import DataRecord, DataRecordSet
|
|
8
|
-
from palimpzest.core.lib.fields import Field
|
|
9
|
-
from palimpzest.query.operators.physical import PhysicalOperator
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class MapOp(PhysicalOperator):
|
|
13
|
-
def __init__(self, udf: Callable | None = None, *args, **kwargs):
|
|
14
|
-
super().__init__(*args, **kwargs)
|
|
15
|
-
self.udf = udf
|
|
16
|
-
|
|
17
|
-
def __str__(self):
|
|
18
|
-
op = super().__str__()
|
|
19
|
-
op += f" UDF: {self.udf.__name__}\n"
|
|
20
|
-
return op
|
|
21
|
-
|
|
22
|
-
def get_id_params(self):
|
|
23
|
-
id_params = super().get_id_params()
|
|
24
|
-
id_params = {"udf": self.udf, **id_params}
|
|
25
|
-
|
|
26
|
-
return id_params
|
|
27
|
-
|
|
28
|
-
def get_op_params(self):
|
|
29
|
-
op_params = super().get_op_params()
|
|
30
|
-
op_params = {"udf": self.udf, **op_params}
|
|
31
|
-
|
|
32
|
-
return op_params
|
|
33
|
-
|
|
34
|
-
def _create_record_set(
|
|
35
|
-
self,
|
|
36
|
-
record: DataRecord,
|
|
37
|
-
generation_stats: GenerationStats,
|
|
38
|
-
total_time: float,
|
|
39
|
-
) -> DataRecordSet:
|
|
40
|
-
"""
|
|
41
|
-
Given an input DataRecord and a determination of whether it passed the filter or not,
|
|
42
|
-
construct the resulting RecordSet.
|
|
43
|
-
"""
|
|
44
|
-
# create RecordOpStats object
|
|
45
|
-
record_op_stats = RecordOpStats(
|
|
46
|
-
record_id=record.id,
|
|
47
|
-
record_parent_id=record.parent_id,
|
|
48
|
-
record_source_idx=record.source_idx,
|
|
49
|
-
record_state=record.to_dict(include_bytes=False),
|
|
50
|
-
full_op_id=self.get_full_op_id(),
|
|
51
|
-
logical_op_id=self.logical_op_id,
|
|
52
|
-
op_name=self.op_name(),
|
|
53
|
-
time_per_record=total_time,
|
|
54
|
-
cost_per_record=0.0,
|
|
55
|
-
fn_call_duration_secs=generation_stats.fn_call_duration_secs,
|
|
56
|
-
answer=record.to_dict(include_bytes=False),
|
|
57
|
-
op_details={k: str(v) for k, v in self.get_id_params().items()},
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
return DataRecordSet([record], [record_op_stats])
|
|
61
|
-
|
|
62
|
-
def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
|
|
63
|
-
"""
|
|
64
|
-
Compute naive cost estimates for the Map operation. These estimates assume that the map UDF
|
|
65
|
-
(1) has no cost and (2) has perfect quality.
|
|
66
|
-
"""
|
|
67
|
-
# estimate 1 ms single-threaded execution for udf function
|
|
68
|
-
time_per_record = 0.001
|
|
69
|
-
|
|
70
|
-
# assume filter fn has perfect quality
|
|
71
|
-
return OperatorCostEstimates(
|
|
72
|
-
cardinality=source_op_cost_estimates.cardinality,
|
|
73
|
-
time_per_record=time_per_record,
|
|
74
|
-
cost_per_record=0.0,
|
|
75
|
-
quality=1.0,
|
|
76
|
-
)
|
|
77
|
-
|
|
78
|
-
def map(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
|
|
79
|
-
# apply UDF to input record
|
|
80
|
-
start_time = time.time()
|
|
81
|
-
field_answers = {}
|
|
82
|
-
try:
|
|
83
|
-
# execute the UDF function
|
|
84
|
-
field_answers = self.udf(candidate.to_dict())
|
|
85
|
-
|
|
86
|
-
# answer should be a dictionary
|
|
87
|
-
assert isinstance(field_answers, dict), (
|
|
88
|
-
"UDF must return a dictionary mapping each input field to its value for map operations"
|
|
89
|
-
)
|
|
90
|
-
|
|
91
|
-
if self.verbose:
|
|
92
|
-
print(f"{self.udf.__name__}")
|
|
93
|
-
|
|
94
|
-
except Exception as e:
|
|
95
|
-
print(f"Error invoking user-defined function for map: {e}")
|
|
96
|
-
raise e
|
|
97
|
-
|
|
98
|
-
# create generation stats object containing the time spent executing the UDF function
|
|
99
|
-
generation_stats = GenerationStats(fn_call_duration_secs=time.time() - start_time)
|
|
100
|
-
|
|
101
|
-
return field_answers, generation_stats
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def __call__(self, candidate: DataRecord) -> DataRecordSet:
    """
    Convert an input DataRecord into an output DataRecordSet.

    The output DataRecordSet contains the DataRecord produced by this
    operator's map() method and its corresponding RecordOpStats object.
    Some subclasses may override this __call__ method to implement their
    own custom logic.

    Raises:
        AssertionError: if the UDF output is missing any output-schema field.
    """
    start_time = time.time()

    # execute the map operation over the fields declared by the output schema
    field_answers: dict[str, list]
    fields = dict(self.output_schema.field_map())  # plain copy; the identity comprehension added nothing
    field_answers, generation_stats = self.map(candidate=candidate, fields=fields)
    # generator avoids materializing a throwaway list for all()
    assert all(field in field_answers for field in fields), "Not all fields are present in output of map!"

    # construct DataRecord from field_answers
    dr = DataRecord.from_parent(schema=self.output_schema, parent_record=candidate)
    for field_name, field_value in field_answers.items():
        dr[field_name] = field_value

    # construct and return DataRecordSet (pairs the record with its RecordOpStats)
    record_set = self._create_record_set(
        record=dr,
        generation_stats=generation_stats,
        total_time=time.time() - start_time,
    )

    return record_set
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
|
|
3
|
-
from palimpzest.core.data.dataclasses import ExecutionStats
|
|
4
|
-
from palimpzest.core.elements.records import DataRecordCollection
|
|
5
|
-
from palimpzest.query.processor.query_processor import QueryProcessor
|
|
6
|
-
|
|
7
|
-
logger = logging.getLogger(__name__)
|
|
8
|
-
|
|
9
|
-
class NoSentinelQueryProcessor(QueryProcessor):
    """
    Query processor that uses naive cost estimates to select the best plan.
    """

    # TODO: Consider to support dry_run.
    def execute(self) -> DataRecordCollection:
        """Run the optimizer-selected plan and return its records plus execution stats."""
        logger.info("Executing NoSentinelQueryProcessor")

        # track timing and per-plan statistics for this execution
        stats = ExecutionStats(execution_id=self.execution_id())
        stats.start()

        # pick and run the best plan according to the optimization strategy
        records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)

        # fold the final plan's work into the overall execution stats
        stats.add_plan_stats(plan_stats)
        stats.finish()

        # package the records together with their execution statistics
        collection = DataRecordCollection(records, execution_stats=stats)
        logger.info("Done executing NoSentinelQueryProcessor")

        return collection
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
from enum import Enum
|
|
2
|
-
|
|
3
|
-
from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
|
|
4
|
-
from palimpzest.query.processor.nosentinel_processor import NoSentinelQueryProcessor
|
|
5
|
-
from palimpzest.query.processor.sentinel_processor import SentinelQueryProcessor
|
|
6
|
-
from palimpzest.query.processor.streaming_processor import StreamingQueryProcessor
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class ProcessingStrategyType(Enum):
    """How to generate and optimize query plans"""
    SENTINEL = SentinelQueryProcessor
    NO_SENTINEL = NoSentinelQueryProcessor
    STREAMING = StreamingQueryProcessor

    def valid_execution_strategies(self) -> list[ExecutionStrategyType]:
        """
        Returns a list of valid execution strategies for the given processing strategy.
        """
        # sentinel and no-sentinel processors support every execution strategy
        if self in (ProcessingStrategyType.SENTINEL, ProcessingStrategyType.NO_SENTINEL):
            return [
                ExecutionStrategyType.SEQUENTIAL,
                ExecutionStrategyType.PIPELINED,
                ExecutionStrategyType.PARALLEL,
                ExecutionStrategyType.SEQUENTIAL_PARALLEL,
            ]
        # streaming cannot use the plain sequential strategy
        elif self == ProcessingStrategyType.STREAMING:
            return [
                ExecutionStrategyType.PIPELINED,
                ExecutionStrategyType.PARALLEL,
                ExecutionStrategyType.SEQUENTIAL_PARALLEL,
            ]

    def is_sentinel_strategy(self) -> bool:
        """
        Returns True if the query processor associated with this strategy uses sentinel execution.
        """
        return self == ProcessingStrategyType.SENTINEL
|