palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
- palimpzest-0.7.0.dist-info/RECORD +96 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.3.dist-info/RECORD +0 -87
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
|
@@ -7,7 +7,7 @@ from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstima
|
|
|
7
7
|
from palimpzest.core.elements.records import DataRecord
|
|
8
8
|
from palimpzest.prompts import ADVICEGEN_PROMPT, CODEGEN_PROMPT, EXAMPLE_PROMPT
|
|
9
9
|
from palimpzest.query.generators.generators import code_ensemble_execution, generator_factory
|
|
10
|
-
from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded
|
|
10
|
+
from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded
|
|
11
11
|
from palimpzest.utils.sandbox import API
|
|
12
12
|
|
|
13
13
|
# TYPE DEFINITIONS
|
|
@@ -24,7 +24,7 @@ class CodeSynthesisConvert(LLMConvert):
|
|
|
24
24
|
self,
|
|
25
25
|
exemplar_generation_model: Model = Model.GPT_4o,
|
|
26
26
|
code_synth_model: Model = Model.GPT_4o,
|
|
27
|
-
|
|
27
|
+
fallback_model: Model = Model.GPT_4o_MINI,
|
|
28
28
|
*args,
|
|
29
29
|
**kwargs,
|
|
30
30
|
):
|
|
@@ -34,7 +34,7 @@ class CodeSynthesisConvert(LLMConvert):
|
|
|
34
34
|
# set models
|
|
35
35
|
self.exemplar_generation_model = exemplar_generation_model
|
|
36
36
|
self.code_synth_model = code_synth_model
|
|
37
|
-
self.
|
|
37
|
+
self.fallback_model = fallback_model
|
|
38
38
|
|
|
39
39
|
# initialize parameters
|
|
40
40
|
self.field_to_code_ensemble = None
|
|
@@ -58,7 +58,7 @@ class CodeSynthesisConvert(LLMConvert):
|
|
|
58
58
|
id_params = {
|
|
59
59
|
"exemplar_generation_model": self.exemplar_generation_model.value,
|
|
60
60
|
"code_synth_model": self.code_synth_model.value,
|
|
61
|
-
"
|
|
61
|
+
"fallback_model": self.fallback_model.value,
|
|
62
62
|
**id_params,
|
|
63
63
|
}
|
|
64
64
|
|
|
@@ -69,7 +69,7 @@ class CodeSynthesisConvert(LLMConvert):
|
|
|
69
69
|
op_params = {
|
|
70
70
|
"exemplar_generation_model": self.exemplar_generation_model,
|
|
71
71
|
"code_synth_model": self.code_synth_model,
|
|
72
|
-
"
|
|
72
|
+
"fallback_model": self.fallback_model,
|
|
73
73
|
**op_params,
|
|
74
74
|
}
|
|
75
75
|
|
|
@@ -89,7 +89,7 @@ class CodeSynthesisConvert(LLMConvert):
|
|
|
89
89
|
naive_op_cost_estimates.time_per_record = 1e-5
|
|
90
90
|
naive_op_cost_estimates.time_per_record_lower_bound = 1e-5
|
|
91
91
|
naive_op_cost_estimates.time_per_record_upper_bound = 1e-5
|
|
92
|
-
naive_op_cost_estimates.cost_per_record = 1e-6
|
|
92
|
+
naive_op_cost_estimates.cost_per_record = 1e-6 # amortize code synth cost across records
|
|
93
93
|
naive_op_cost_estimates.cost_per_record_lower_bound = 1e-6
|
|
94
94
|
naive_op_cost_estimates.cost_per_record_upper_bound = 1e-6
|
|
95
95
|
naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * (GPT_4o_MODEL_CARD["code"] / 100.0) * 0.7
|
|
@@ -149,7 +149,9 @@ class CodeSynthesisConvert(LLMConvert):
|
|
|
149
149
|
# set field_to_code_ensemble and code_synthesized to True
|
|
150
150
|
return field_to_code_ensemble, generation_stats
|
|
151
151
|
|
|
152
|
-
def _bonded_query_fallback(
|
|
152
|
+
def _bonded_query_fallback(
|
|
153
|
+
self, candidate: DataRecord
|
|
154
|
+
) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
|
|
153
155
|
fields_to_generate = self.get_fields_to_generate(candidate)
|
|
154
156
|
projected_candidate = candidate.copy(include_bytes=False, project_cols=self.depends_on)
|
|
155
157
|
|
|
@@ -181,7 +183,9 @@ class CodeSynthesisConvert(LLMConvert):
|
|
|
181
183
|
"""Code synthesis is disallowed on image conversions, so this must be False."""
|
|
182
184
|
return False
|
|
183
185
|
|
|
184
|
-
def convert(
|
|
186
|
+
def convert(
|
|
187
|
+
self, candidate: DataRecord, fields: list[str] | None = None
|
|
188
|
+
) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
|
|
185
189
|
# get the dictionary fields for the candidate
|
|
186
190
|
candidate_dict = candidate.to_dict(include_bytes=False, project_cols=self.depends_on)
|
|
187
191
|
|
|
@@ -220,18 +224,18 @@ class CodeSynthesisConvert(LLMConvert):
|
|
|
220
224
|
field_answers[field_name] = [answer]
|
|
221
225
|
|
|
222
226
|
else:
|
|
223
|
-
# if there is a failure, run a conventional query
|
|
227
|
+
# if there is a failure, run a conventional llm convert query for the field
|
|
224
228
|
if self.verbose:
|
|
225
229
|
print(f"CODEGEN FALLING BACK TO CONVENTIONAL FOR FIELD {field_name}")
|
|
226
230
|
|
|
227
|
-
# execute the conventional convert
|
|
228
|
-
|
|
231
|
+
# execute the conventional llm convert
|
|
232
|
+
convert_op = LLMConvertBonded(
|
|
229
233
|
input_schema=self.input_schema,
|
|
230
234
|
output_schema=self.output_schema,
|
|
231
|
-
model=self.
|
|
235
|
+
model=self.fallback_model,
|
|
232
236
|
prompt_strategy=self.prompt_strategy,
|
|
233
237
|
)
|
|
234
|
-
single_field_answers, single_field_stats =
|
|
238
|
+
single_field_answers, single_field_stats = convert_op.convert(candidate, [field_name])
|
|
235
239
|
|
|
236
240
|
# include code execution time in single_field_stats
|
|
237
241
|
single_field_stats.fn_call_duration_secs += exec_stats.fn_call_duration_secs
|
|
@@ -318,7 +322,7 @@ class CodeSynthesisConvertSingle(CodeSynthesisConvert):
|
|
|
318
322
|
gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}
|
|
319
323
|
|
|
320
324
|
# invoke the champion model to generate the code
|
|
321
|
-
pred, _, stats = self.code_champion_generator(candidate, None, **gen_kwargs)
|
|
325
|
+
pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
|
|
322
326
|
ordered_keys = [f"```{language}", f"```{language.lower()}", "```"]
|
|
323
327
|
code = None
|
|
324
328
|
if not pred:
|
|
@@ -337,7 +341,9 @@ class CodeSynthesisConvertSingle(CodeSynthesisConvert):
|
|
|
337
341
|
|
|
338
342
|
return code, stats
|
|
339
343
|
|
|
340
|
-
def _synthesize_field_code(
|
|
344
|
+
def _synthesize_field_code(
|
|
345
|
+
self, candidate: DataRecord, api: API, output_field_name: str, num_exemplars: int = 1, *args, **kwargs
|
|
346
|
+
):
|
|
341
347
|
code, generation_stats = self._code_synth_single(
|
|
342
348
|
candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars]
|
|
343
349
|
)
|
|
@@ -354,7 +360,9 @@ class CodeSynthesisConvertExampleEnsemble(CodeSynthesisConvertSingle):
|
|
|
354
360
|
return False
|
|
355
361
|
return not self.code_synthesized
|
|
356
362
|
|
|
357
|
-
def _synthesize_field_code(
|
|
363
|
+
def _synthesize_field_code(
|
|
364
|
+
self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, *args, **kwargs
|
|
365
|
+
):
|
|
358
366
|
# creates an ensemble of `code_ensemble_num` synthesized functions; each of
|
|
359
367
|
# which uses a different exemplar (modulo the # of exemplars) for its synthesis
|
|
360
368
|
code_ensemble = {}
|
|
@@ -425,13 +433,20 @@ class CodeSynthesisConvertAdviceEnsemble(CodeSynthesisConvertSingle):
|
|
|
425
433
|
# set prompt for generator
|
|
426
434
|
gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}
|
|
427
435
|
|
|
428
|
-
pred, _, stats = self.code_champion_generator(candidate, None, **gen_kwargs)
|
|
436
|
+
pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
|
|
429
437
|
advs = self._parse_multiple_outputs(pred, outputs=[f"Idea {i}" for i in range(1, limit + 1)])
|
|
430
438
|
|
|
431
439
|
return advs, stats
|
|
432
440
|
|
|
433
441
|
def _synthesize_field_code(
|
|
434
|
-
self,
|
|
442
|
+
self,
|
|
443
|
+
candidate: DataRecord,
|
|
444
|
+
api: API,
|
|
445
|
+
output_field_name: str,
|
|
446
|
+
code_ensemble_num: int = 1,
|
|
447
|
+
num_exemplars: int = 1,
|
|
448
|
+
*args,
|
|
449
|
+
**kwargs,
|
|
435
450
|
):
|
|
436
451
|
# a more advanced approach in which advice is first solicited, and then
|
|
437
452
|
# provided as context when synthesizing the code ensemble
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
4
|
from abc import ABC, abstractmethod
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import Callable
|
|
6
6
|
|
|
7
7
|
from palimpzest.constants import (
|
|
8
8
|
MODEL_CARDS,
|
|
@@ -15,13 +15,11 @@ from palimpzest.constants import (
|
|
|
15
15
|
)
|
|
16
16
|
from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
|
|
17
17
|
from palimpzest.core.elements.records import DataRecord, DataRecordSet
|
|
18
|
+
from palimpzest.core.lib.fields import Field
|
|
18
19
|
from palimpzest.query.generators.generators import generator_factory
|
|
19
20
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
20
21
|
from palimpzest.utils.model_helpers import get_vision_models
|
|
21
22
|
|
|
22
|
-
# TYPE DEFINITIONS
|
|
23
|
-
FieldName = str
|
|
24
|
-
|
|
25
23
|
|
|
26
24
|
class ConvertOp(PhysicalOperator, ABC):
|
|
27
25
|
def __init__(
|
|
@@ -49,18 +47,13 @@ class ConvertOp(PhysicalOperator, ABC):
|
|
|
49
47
|
|
|
50
48
|
def get_op_params(self):
|
|
51
49
|
op_params = super().get_op_params()
|
|
52
|
-
op_params = {
|
|
53
|
-
"cardinality": self.cardinality,
|
|
54
|
-
"udf": self.udf,
|
|
55
|
-
"desc": self.desc,
|
|
56
|
-
**op_params
|
|
57
|
-
}
|
|
50
|
+
op_params = {"cardinality": self.cardinality, "udf": self.udf, "desc": self.desc, **op_params}
|
|
58
51
|
|
|
59
52
|
return op_params
|
|
60
53
|
|
|
61
54
|
def _create_data_records_from_field_answers(
|
|
62
55
|
self,
|
|
63
|
-
field_answers: dict[
|
|
56
|
+
field_answers: dict[str, list],
|
|
64
57
|
candidate: DataRecord,
|
|
65
58
|
) -> list[DataRecord]:
|
|
66
59
|
"""
|
|
@@ -94,7 +87,7 @@ class ConvertOp(PhysicalOperator, ABC):
|
|
|
94
87
|
if field not in input_fields:
|
|
95
88
|
value = field_answers[field][idx] if idx < len(field_answers[field]) else None
|
|
96
89
|
setattr(dr, field, value)
|
|
97
|
-
|
|
90
|
+
|
|
98
91
|
# append data record to list of output data records
|
|
99
92
|
drs.append(dr)
|
|
100
93
|
|
|
@@ -103,7 +96,7 @@ class ConvertOp(PhysicalOperator, ABC):
|
|
|
103
96
|
def _create_record_set(
|
|
104
97
|
self,
|
|
105
98
|
records: list[DataRecord],
|
|
106
|
-
|
|
99
|
+
field_names: list[str],
|
|
107
100
|
generation_stats: GenerationStats,
|
|
108
101
|
total_time: float,
|
|
109
102
|
successful_convert: bool,
|
|
@@ -128,15 +121,17 @@ class ConvertOp(PhysicalOperator, ABC):
|
|
|
128
121
|
time_per_record=time_per_record,
|
|
129
122
|
cost_per_record=per_record_stats.cost_per_record,
|
|
130
123
|
model_name=self.get_model_name(),
|
|
131
|
-
answer={field_name: getattr(dr, field_name) for field_name in
|
|
124
|
+
answer={field_name: getattr(dr, field_name) for field_name in field_names},
|
|
132
125
|
input_fields=self.input_schema.field_names(),
|
|
133
|
-
generated_fields=
|
|
126
|
+
generated_fields=field_names,
|
|
134
127
|
total_input_tokens=per_record_stats.total_input_tokens,
|
|
135
128
|
total_output_tokens=per_record_stats.total_output_tokens,
|
|
136
129
|
total_input_cost=per_record_stats.total_input_cost,
|
|
137
130
|
total_output_cost=per_record_stats.total_output_cost,
|
|
138
131
|
llm_call_duration_secs=per_record_stats.llm_call_duration_secs,
|
|
139
132
|
fn_call_duration_secs=per_record_stats.fn_call_duration_secs,
|
|
133
|
+
total_llm_calls=per_record_stats.total_llm_calls,
|
|
134
|
+
total_embedding_llm_calls=per_record_stats.total_embedding_llm_calls,
|
|
140
135
|
failed_convert=(not successful_convert),
|
|
141
136
|
image_operation=self.is_image_conversion(),
|
|
142
137
|
op_details={k: str(v) for k, v in self.get_id_params().items()},
|
|
@@ -153,7 +148,7 @@ class ConvertOp(PhysicalOperator, ABC):
|
|
|
153
148
|
pass
|
|
154
149
|
|
|
155
150
|
@abstractmethod
|
|
156
|
-
def convert(self, candidate: DataRecord, fields:
|
|
151
|
+
def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
|
|
157
152
|
"""
|
|
158
153
|
This abstract method will be implemented by subclasses of ConvertOp to process the input DataRecord
|
|
159
154
|
and generate the value(s) for each of the specified fields. If the convert operator is a one-to-many
|
|
@@ -187,7 +182,8 @@ class ConvertOp(PhysicalOperator, ABC):
|
|
|
187
182
|
|
|
188
183
|
# execute the convert
|
|
189
184
|
field_answers: dict[str, list]
|
|
190
|
-
|
|
185
|
+
fields = {field: field_type for field, field_type in self.output_schema.field_map().items() if field in fields_to_generate}
|
|
186
|
+
field_answers, generation_stats = self.convert(candidate=candidate, fields=fields)
|
|
191
187
|
assert all([field in field_answers for field in fields_to_generate]), "Not all fields were generated!"
|
|
192
188
|
|
|
193
189
|
# replace any None values with an empty list; subclasses may override __call__ to change this behavior
|
|
@@ -199,7 +195,7 @@ class ConvertOp(PhysicalOperator, ABC):
|
|
|
199
195
|
# construct and return DataRecordSet
|
|
200
196
|
record_set = self._create_record_set(
|
|
201
197
|
records=drs,
|
|
202
|
-
|
|
198
|
+
field_names=fields_to_generate,
|
|
203
199
|
generation_stats=generation_stats,
|
|
204
200
|
total_time=time.time() - start_time,
|
|
205
201
|
successful_convert=successful_convert,
|
|
@@ -211,7 +207,7 @@ class ConvertOp(PhysicalOperator, ABC):
|
|
|
211
207
|
class NonLLMConvert(ConvertOp):
|
|
212
208
|
def __str__(self):
|
|
213
209
|
op = super().__str__()
|
|
214
|
-
op += f" UDF: {
|
|
210
|
+
op += f" UDF: {self.udf.__name__}\n"
|
|
215
211
|
return op
|
|
216
212
|
|
|
217
213
|
def is_image_conversion(self) -> bool:
|
|
@@ -239,7 +235,7 @@ class NonLLMConvert(ConvertOp):
|
|
|
239
235
|
quality=1.0,
|
|
240
236
|
)
|
|
241
237
|
|
|
242
|
-
def convert(self, candidate: DataRecord, fields:
|
|
238
|
+
def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
|
|
243
239
|
# apply UDF to input record
|
|
244
240
|
start_time = time.time()
|
|
245
241
|
field_answers = {}
|
|
@@ -249,7 +245,9 @@ class NonLLMConvert(ConvertOp):
|
|
|
249
245
|
|
|
250
246
|
if self.cardinality == Cardinality.ONE_TO_ONE:
|
|
251
247
|
# answer should be a dictionary
|
|
252
|
-
assert isinstance(answer, dict),
|
|
248
|
+
assert isinstance(answer, dict), (
|
|
249
|
+
"UDF must return a dictionary mapping each generated field to its value for one-to-one converts"
|
|
250
|
+
)
|
|
253
251
|
|
|
254
252
|
# wrap each answer in a list
|
|
255
253
|
field_answers = {field_name: [answer[field_name]] for field_name in fields}
|
|
@@ -263,7 +261,7 @@ class NonLLMConvert(ConvertOp):
|
|
|
263
261
|
field_answers[field_name].append(answer_dict.get(field_name, None))
|
|
264
262
|
|
|
265
263
|
if self.verbose:
|
|
266
|
-
print(f"{
|
|
264
|
+
print(f"{self.udf.__name__}:\n{answer}")
|
|
267
265
|
|
|
268
266
|
except Exception as e:
|
|
269
267
|
print(f"Error invoking user-defined function for convert: {e}")
|
|
@@ -279,6 +277,7 @@ class LLMConvert(ConvertOp):
|
|
|
279
277
|
"""
|
|
280
278
|
This is the base class for convert operations which use an LLM to generate the output fields.
|
|
281
279
|
"""
|
|
280
|
+
|
|
282
281
|
def __init__(
|
|
283
282
|
self,
|
|
284
283
|
model: Model,
|
|
@@ -337,9 +336,7 @@ class LLMConvert(ConvertOp):
|
|
|
337
336
|
# get est. of conversion time per record from model card;
|
|
338
337
|
# NOTE: model will only be None for code synthesis, which uses GPT-4o-mini as fallback
|
|
339
338
|
model_name = self.model.value if getattr(self, "model", None) is not None else Model.GPT_4o_MINI.value
|
|
340
|
-
model_conversion_time_per_record =
|
|
341
|
-
MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens
|
|
342
|
-
)
|
|
339
|
+
model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens
|
|
343
340
|
|
|
344
341
|
# get est. of conversion cost (in USD) per record from model card
|
|
345
342
|
model_conversion_usd_per_record = (
|
|
@@ -362,74 +359,9 @@ class LLMConvert(ConvertOp):
|
|
|
362
359
|
)
|
|
363
360
|
|
|
364
361
|
|
|
365
|
-
class LLMConvertConventional(LLMConvert):
|
|
366
|
-
def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
|
|
367
|
-
"""
|
|
368
|
-
Update the cost per record and time per record estimates to account for the additional
|
|
369
|
-
LLM calls we incur by executing one query per-field.
|
|
370
|
-
"""
|
|
371
|
-
# get naive cost estimates from LLMConvert
|
|
372
|
-
naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
|
|
373
|
-
|
|
374
|
-
# re-compute cost per record assuming we use fewer input tokens
|
|
375
|
-
est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS
|
|
376
|
-
est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
|
|
377
|
-
|
|
378
|
-
# increase estimates of the input and output tokens by the number of fields generated
|
|
379
|
-
# NOTE: this may over-estimate the number of fields that need to be generated
|
|
380
|
-
generate_field_names = []
|
|
381
|
-
for field_name in self.output_schema.field_names():
|
|
382
|
-
if field_name not in self.input_schema.field_names():
|
|
383
|
-
generate_field_names.append(field_name)
|
|
384
|
-
|
|
385
|
-
num_fields_to_generate = len(generate_field_names)
|
|
386
|
-
est_num_input_tokens *= num_fields_to_generate
|
|
387
|
-
est_num_output_tokens *= num_fields_to_generate
|
|
388
|
-
|
|
389
|
-
# get est. of conversion time per record from model card;
|
|
390
|
-
model_conversion_time_per_record = (
|
|
391
|
-
MODEL_CARDS[self.model.value]["seconds_per_output_token"] * est_num_output_tokens
|
|
392
|
-
)
|
|
393
|
-
|
|
394
|
-
# get est. of conversion cost (in USD) per record from model card
|
|
395
|
-
model_conversion_usd_per_record = (
|
|
396
|
-
MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
|
|
397
|
-
+ MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
|
|
398
|
-
)
|
|
399
|
-
|
|
400
|
-
# set refined estimate of time and cost per record
|
|
401
|
-
naive_op_cost_estimates.time_per_record = model_conversion_time_per_record
|
|
402
|
-
naive_op_cost_estimates.time_per_record_lower_bound = naive_op_cost_estimates.time_per_record
|
|
403
|
-
naive_op_cost_estimates.time_per_record_upper_bound = naive_op_cost_estimates.time_per_record
|
|
404
|
-
naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
|
|
405
|
-
naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
|
|
406
|
-
naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
|
|
407
|
-
|
|
408
|
-
return naive_op_cost_estimates
|
|
409
|
-
|
|
410
|
-
def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
|
|
411
|
-
# get the set of input fields to use for the convert operation
|
|
412
|
-
input_fields = self.get_input_fields()
|
|
413
|
-
|
|
414
|
-
# construct kwargs for generation
|
|
415
|
-
gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
|
|
416
|
-
|
|
417
|
-
# generate outputs one field at a time
|
|
418
|
-
field_answers, generation_stats_lst = {}, []
|
|
419
|
-
for field in fields:
|
|
420
|
-
single_field_answers, _, single_field_stats = self.generator(candidate, [field], **gen_kwargs)
|
|
421
|
-
field_answers.update(single_field_answers)
|
|
422
|
-
generation_stats_lst.append(single_field_stats)
|
|
423
|
-
|
|
424
|
-
# aggregate generation stats into single object
|
|
425
|
-
generation_stats = sum(generation_stats_lst)
|
|
426
|
-
|
|
427
|
-
return field_answers, generation_stats
|
|
428
|
-
|
|
429
|
-
|
|
430
362
|
class LLMConvertBonded(LLMConvert):
|
|
431
363
|
|
|
432
|
-
def convert(self, candidate: DataRecord, fields:
|
|
364
|
+
def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
|
|
433
365
|
# get the set of input fields to use for the convert operation
|
|
434
366
|
input_fields = self.get_input_fields()
|
|
435
367
|
|
|
@@ -437,13 +369,14 @@ class LLMConvertBonded(LLMConvert):
|
|
|
437
369
|
gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
|
|
438
370
|
|
|
439
371
|
# generate outputs for all fields in a single query
|
|
440
|
-
field_answers, _, generation_stats = self.generator(candidate, fields, **gen_kwargs)
|
|
372
|
+
field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)
|
|
441
373
|
|
|
442
374
|
# if there was an error for any field, execute a conventional query on that field
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
375
|
+
if len(field_answers) > 1:
|
|
376
|
+
for field_name, answers in field_answers.items():
|
|
377
|
+
if answers is None:
|
|
378
|
+
single_field_answers, _, single_field_stats, _ = self.generator(candidate, {field_name: fields[field_name]}, **gen_kwargs)
|
|
379
|
+
field_answers.update(single_field_answers)
|
|
380
|
+
generation_stats += single_field_stats
|
|
448
381
|
|
|
449
382
|
return field_answers, generation_stats
|
|
@@ -93,18 +93,17 @@ class CriticAndRefineConvert(LLMConvert):
|
|
|
93
93
|
# NOTE: when I merge in the `abacus` branch, I will want to update this to reflect the changes I made to reasoning extraction
|
|
94
94
|
# execute the initial model
|
|
95
95
|
original_gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
|
|
96
|
-
field_answers, reasoning, original_gen_stats = self.generator(candidate, fields, **original_gen_kwargs)
|
|
97
|
-
original_output = f"REASONING: {reasoning}\nANSWER:{field_answers}\n"
|
|
98
|
-
original_messages = self.generator.get_messages()
|
|
96
|
+
field_answers, reasoning, original_gen_stats, original_messages = self.generator(candidate, fields, **original_gen_kwargs)
|
|
97
|
+
original_output = f"REASONING: {reasoning}\nANSWER: {field_answers}\n"
|
|
99
98
|
|
|
100
99
|
# execute the critic model
|
|
101
100
|
critic_gen_kwargs = {"original_output": original_output, "original_messages": original_messages, **original_gen_kwargs}
|
|
102
|
-
|
|
103
|
-
critique_output = f"
|
|
101
|
+
_, reasoning, critic_gen_stats, _ = self.critic_generator(candidate, fields, json_output=False, **critic_gen_kwargs)
|
|
102
|
+
critique_output = f"CRITIQUE: {reasoning}\n"
|
|
104
103
|
|
|
105
104
|
# execute the refinement model
|
|
106
105
|
refine_gen_kwargs = {"critique_output": critique_output, **critic_gen_kwargs}
|
|
107
|
-
field_answers, reasoning, refine_gen_stats = self.refine_generator(candidate, fields, **refine_gen_kwargs)
|
|
106
|
+
field_answers, reasoning, refine_gen_stats, _ = self.refine_generator(candidate, fields, **refine_gen_kwargs)
|
|
108
107
|
|
|
109
108
|
# compute the total generation stats
|
|
110
109
|
generation_stats = original_gen_stats + critic_gen_stats + refine_gen_stats
|
|
@@ -15,6 +15,7 @@ from palimpzest.constants import (
|
|
|
15
15
|
from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
|
|
16
16
|
from palimpzest.core.elements.filters import Filter
|
|
17
17
|
from palimpzest.core.elements.records import DataRecord, DataRecordSet
|
|
18
|
+
from palimpzest.core.lib.fields import BooleanField
|
|
18
19
|
from palimpzest.query.generators.generators import generator_factory
|
|
19
20
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
20
21
|
from palimpzest.utils.model_helpers import get_vision_models
|
|
@@ -96,6 +97,8 @@ class FilterOp(PhysicalOperator, ABC):
|
|
|
96
97
|
total_output_cost=generation_stats.total_output_cost,
|
|
97
98
|
llm_call_duration_secs=generation_stats.llm_call_duration_secs,
|
|
98
99
|
fn_call_duration_secs=generation_stats.fn_call_duration_secs,
|
|
100
|
+
total_llm_calls=generation_stats.total_llm_calls,
|
|
101
|
+
total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
|
|
99
102
|
answer=answer,
|
|
100
103
|
passed_operator=passed_operator,
|
|
101
104
|
image_operation=self.is_image_filter(),
|
|
@@ -248,14 +251,8 @@ class LLMFilter(FilterOp):
|
|
|
248
251
|
# construct kwargs for generation
|
|
249
252
|
gen_kwargs = {"project_cols": input_fields, "filter_condition": self.filter_obj.filter_condition}
|
|
250
253
|
|
|
251
|
-
# generate output
|
|
252
|
-
|
|
254
|
+
# generate output; NOTE: BooleanField is used to indicate the output type; thus, the desc is not needed
|
|
255
|
+
fields = {"passed_operator": BooleanField(desc="")}
|
|
256
|
+
field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)
|
|
253
257
|
|
|
254
|
-
|
|
255
|
-
passed_operator = False
|
|
256
|
-
if isinstance(field_answers["passed_operator"], str):
|
|
257
|
-
passed_operator = "true" in field_answers["passed_operator"].lower()
|
|
258
|
-
elif isinstance(field_answers["passed_operator"], bool):
|
|
259
|
-
passed_operator = field_answers["passed_operator"]
|
|
260
|
-
|
|
261
|
-
return {"passed_operator": passed_operator}, generation_stats
|
|
258
|
+
return field_answers, generation_stats
|
|
@@ -24,6 +24,7 @@ class LogicalOperator:
|
|
|
24
24
|
- GroupByAggregate (applies a group by on the Set)
|
|
25
25
|
- Aggregate (applies an aggregation on the Set)
|
|
26
26
|
- RetrieveScan (fetches documents from a provided input for a given query)
|
|
27
|
+
- Map (applies a function to each record in the Set without adding any new columns)
|
|
27
28
|
|
|
28
29
|
Every logical operator must declare the get_logical_id_params() and get_logical_op_params() methods,
|
|
29
30
|
which return dictionaries of parameters that are used to compute the logical op id and to implement
|
|
@@ -41,11 +42,9 @@ class LogicalOperator:
|
|
|
41
42
|
|
|
42
43
|
# compute the fields generated by this logical operator
|
|
43
44
|
input_field_names = self.input_schema.field_names() if self.input_schema is not None else []
|
|
44
|
-
self.generated_fields = sorted(
|
|
45
|
-
field_name
|
|
46
|
-
|
|
47
|
-
if field_name not in input_field_names
|
|
48
|
-
])
|
|
45
|
+
self.generated_fields = sorted(
|
|
46
|
+
[field_name for field_name in self.output_schema.field_names() if field_name not in input_field_names]
|
|
47
|
+
)
|
|
49
48
|
|
|
50
49
|
def __str__(self) -> str:
|
|
51
50
|
raise NotImplementedError("Abstract method")
|
|
@@ -76,7 +75,7 @@ class LogicalOperator:
|
|
|
76
75
|
"""
|
|
77
76
|
Returns a dictionary mapping of logical operator parameters which may be used to
|
|
78
77
|
implement a physical operator associated with this logical operation.
|
|
79
|
-
|
|
78
|
+
|
|
80
79
|
NOTE: Should be overridden by subclasses to include class-specific parameters.
|
|
81
80
|
"""
|
|
82
81
|
return {"input_schema": self.input_schema, "output_schema": self.output_schema}
|
|
@@ -101,6 +100,10 @@ class LogicalOperator:
|
|
|
101
100
|
|
|
102
101
|
return self.logical_op_id
|
|
103
102
|
|
|
103
|
+
def get_generated_fields(self) -> list[str]:
|
|
104
|
+
"""Returns the names of the fields generated by this logical operator."""
|
|
105
|
+
return self.generated_fields
|
|
106
|
+
|
|
104
107
|
def __hash__(self):
|
|
105
108
|
if not self.logical_op_id:
|
|
106
109
|
raise ValueError("logical_op_id not set, unable to hash")
|
|
@@ -278,6 +281,7 @@ class FilteredScan(LogicalOperator):
|
|
|
278
281
|
|
|
279
282
|
return logical_op_params
|
|
280
283
|
|
|
284
|
+
|
|
281
285
|
class GroupByAggregate(LogicalOperator):
|
|
282
286
|
def __init__(
|
|
283
287
|
self,
|
|
@@ -314,6 +318,7 @@ class GroupByAggregate(LogicalOperator):
|
|
|
314
318
|
|
|
315
319
|
return logical_op_params
|
|
316
320
|
|
|
321
|
+
|
|
317
322
|
class LimitScan(LogicalOperator):
|
|
318
323
|
def __init__(self, limit: int, target_cache_id: str | None = None, *args, **kwargs):
|
|
319
324
|
super().__init__(*args, **kwargs)
|
|
@@ -374,7 +379,7 @@ class RetrieveScan(LogicalOperator):
|
|
|
374
379
|
index,
|
|
375
380
|
search_func,
|
|
376
381
|
search_attr,
|
|
377
|
-
|
|
382
|
+
output_attrs,
|
|
378
383
|
k,
|
|
379
384
|
target_cache_id: str = None,
|
|
380
385
|
*args,
|
|
@@ -384,7 +389,7 @@ class RetrieveScan(LogicalOperator):
|
|
|
384
389
|
self.index = index
|
|
385
390
|
self.search_func = search_func
|
|
386
391
|
self.search_attr = search_attr
|
|
387
|
-
self.
|
|
392
|
+
self.output_attrs = output_attrs
|
|
388
393
|
self.k = k
|
|
389
394
|
self.target_cache_id = target_cache_id
|
|
390
395
|
|
|
@@ -398,7 +403,7 @@ class RetrieveScan(LogicalOperator):
|
|
|
398
403
|
logical_id_params = super().get_logical_id_params()
|
|
399
404
|
logical_id_params = {
|
|
400
405
|
"search_attr": self.search_attr,
|
|
401
|
-
"
|
|
406
|
+
"output_attrs": self.output_attrs,
|
|
402
407
|
"k": self.k,
|
|
403
408
|
**logical_id_params,
|
|
404
409
|
}
|
|
@@ -411,10 +416,49 @@ class RetrieveScan(LogicalOperator):
|
|
|
411
416
|
"index": self.index,
|
|
412
417
|
"search_func": self.search_func,
|
|
413
418
|
"search_attr": self.search_attr,
|
|
414
|
-
"
|
|
419
|
+
"output_attrs": self.output_attrs,
|
|
415
420
|
"k": self.k,
|
|
416
421
|
"target_cache_id": self.target_cache_id,
|
|
417
422
|
**logical_op_params,
|
|
418
423
|
}
|
|
419
424
|
|
|
420
425
|
return logical_op_params
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
# TODO: (near-term) maybe we should try to fold this into ConvertScan, and make the internals of PZ
|
|
429
|
+
# amenable to a convert operator (with a UDF) that does not add new columns?
|
|
430
|
+
class MapScan(LogicalOperator):
|
|
431
|
+
"""A MapScan is a logical operator that applies a UDF to each input record without adding new columns."""
|
|
432
|
+
|
|
433
|
+
def __init__(
|
|
434
|
+
self,
|
|
435
|
+
udf: Callable | None = None,
|
|
436
|
+
target_cache_id: str | None = None,
|
|
437
|
+
*args,
|
|
438
|
+
**kwargs,
|
|
439
|
+
):
|
|
440
|
+
super().__init__(*args, **kwargs)
|
|
441
|
+
self.udf = udf
|
|
442
|
+
self.target_cache_id = target_cache_id
|
|
443
|
+
|
|
444
|
+
def __str__(self):
|
|
445
|
+
return f"MapScan({self.output_schema}, {self.udf.__name__})"
|
|
446
|
+
|
|
447
|
+
def get_logical_id_params(self) -> dict:
|
|
448
|
+
logical_id_params = super().get_logical_id_params()
|
|
449
|
+
logical_id_params = {
|
|
450
|
+
"udf": self.udf,
|
|
451
|
+
**logical_id_params,
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
return logical_id_params
|
|
455
|
+
|
|
456
|
+
def get_logical_op_params(self) -> dict:
|
|
457
|
+
logical_op_params = super().get_logical_op_params()
|
|
458
|
+
logical_op_params = {
|
|
459
|
+
"udf": self.udf,
|
|
460
|
+
"target_cache_id": self.target_cache_id,
|
|
461
|
+
**logical_op_params,
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
return logical_op_params
|