palimpzest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.0.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
palimpzest/query/operators/code_synthesis_convert.py

@@ -7,7 +7,7 @@ from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstima
 from palimpzest.core.elements.records import DataRecord
 from palimpzest.prompts import ADVICEGEN_PROMPT, CODEGEN_PROMPT, EXAMPLE_PROMPT
 from palimpzest.query.generators.generators import code_ensemble_execution, generator_factory
-from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded, LLMConvertConventional
+from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded
 from palimpzest.utils.sandbox import API

 # TYPE DEFINITIONS
@@ -24,7 +24,7 @@ class CodeSynthesisConvert(LLMConvert):
         self,
         exemplar_generation_model: Model = Model.GPT_4o,
         code_synth_model: Model = Model.GPT_4o,
-        conventional_fallback_model: Model = Model.GPT_4o_MINI,
+        fallback_model: Model = Model.GPT_4o_MINI,
         *args,
         **kwargs,
     ):
@@ -34,7 +34,7 @@ class CodeSynthesisConvert(LLMConvert):
         # set models
         self.exemplar_generation_model = exemplar_generation_model
         self.code_synth_model = code_synth_model
-        self.conventional_fallback_model = conventional_fallback_model
+        self.fallback_model = fallback_model

         # initialize parameters
         self.field_to_code_ensemble = None
@@ -58,7 +58,7 @@ class CodeSynthesisConvert(LLMConvert):
         id_params = {
             "exemplar_generation_model": self.exemplar_generation_model.value,
             "code_synth_model": self.code_synth_model.value,
-            "conventional_fallback_model": self.conventional_fallback_model.value,
+            "fallback_model": self.fallback_model.value,
             **id_params,
         }

@@ -69,7 +69,7 @@ class CodeSynthesisConvert(LLMConvert):
         op_params = {
             "exemplar_generation_model": self.exemplar_generation_model,
             "code_synth_model": self.code_synth_model,
-            "conventional_fallback_model": self.conventional_fallback_model,
+            "fallback_model": self.fallback_model,
             **op_params,
         }

@@ -89,7 +89,7 @@ class CodeSynthesisConvert(LLMConvert):
         naive_op_cost_estimates.time_per_record = 1e-5
         naive_op_cost_estimates.time_per_record_lower_bound = 1e-5
         naive_op_cost_estimates.time_per_record_upper_bound = 1e-5
-        naive_op_cost_estimates.cost_per_record = 1e-6 # amortize code synth cost across records
+        naive_op_cost_estimates.cost_per_record = 1e-6  # amortize code synth cost across records
         naive_op_cost_estimates.cost_per_record_lower_bound = 1e-6
         naive_op_cost_estimates.cost_per_record_upper_bound = 1e-6
         naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * (GPT_4o_MODEL_CARD["code"] / 100.0) * 0.7
@@ -149,7 +149,9 @@ class CodeSynthesisConvert(LLMConvert):
         # set field_to_code_ensemble and code_synthesized to True
         return field_to_code_ensemble, generation_stats

-    def _bonded_query_fallback(self, candidate: DataRecord) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
+    def _bonded_query_fallback(
+        self, candidate: DataRecord
+    ) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
         fields_to_generate = self.get_fields_to_generate(candidate)
         projected_candidate = candidate.copy(include_bytes=False, project_cols=self.depends_on)

@@ -181,7 +183,9 @@ class CodeSynthesisConvert(LLMConvert):
         """Code synthesis is disallowed on image conversions, so this must be False."""
         return False

-    def convert(self, candidate: DataRecord, fields: list[str] | None = None) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
+    def convert(
+        self, candidate: DataRecord, fields: list[str] | None = None
+    ) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
         # get the dictionary fields for the candidate
         candidate_dict = candidate.to_dict(include_bytes=False, project_cols=self.depends_on)

@@ -220,18 +224,18 @@ class CodeSynthesisConvert(LLMConvert):
                 field_answers[field_name] = [answer]

             else:
-                # if there is a failure, run a conventional query
+                # if there is a failure, run a conventional llm convert query for the field
                 if self.verbose:
                     print(f"CODEGEN FALLING BACK TO CONVENTIONAL FOR FIELD {field_name}")

-                # execute the conventional convert
-                conventional_op = LLMConvertConventional(
+                # execute the conventional llm convert
+                convert_op = LLMConvertBonded(
                     input_schema=self.input_schema,
                     output_schema=self.output_schema,
-                    model=self.conventional_fallback_model,
+                    model=self.fallback_model,
                     prompt_strategy=self.prompt_strategy,
                 )
-                single_field_answers, single_field_stats = conventional_op.convert(candidate, [field_name])
+                single_field_answers, single_field_stats = convert_op.convert(candidate, [field_name])

                 # include code execution time in single_field_stats
                 single_field_stats.fn_call_duration_secs += exec_stats.fn_call_duration_secs
@@ -318,7 +322,7 @@ class CodeSynthesisConvertSingle(CodeSynthesisConvert):
         gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}

         # invoke the champion model to generate the code
-        pred, _, stats = self.code_champion_generator(candidate, None, **gen_kwargs)
+        pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
         ordered_keys = [f"```{language}", f"```{language.lower()}", "```"]
         code = None
         if not pred:
@@ -337,7 +341,9 @@ class CodeSynthesisConvertSingle(CodeSynthesisConvert):

         return code, stats

-    def _synthesize_field_code(self, candidate: DataRecord, api: API, output_field_name: str, num_exemplars: int = 1, *args, **kwargs):
+    def _synthesize_field_code(
+        self, candidate: DataRecord, api: API, output_field_name: str, num_exemplars: int = 1, *args, **kwargs
+    ):
         code, generation_stats = self._code_synth_single(
             candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars]
         )
@@ -354,7 +360,9 @@ class CodeSynthesisConvertExampleEnsemble(CodeSynthesisConvertSingle):
             return False
         return not self.code_synthesized

-    def _synthesize_field_code(self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, *args, **kwargs):
+    def _synthesize_field_code(
+        self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, *args, **kwargs
+    ):
         # creates an ensemble of `code_ensemble_num` synthesized functions; each of
         # which uses a different exemplar (modulo the # of exemplars) for its synthesis
         code_ensemble = {}
@@ -425,13 +433,20 @@ class CodeSynthesisConvertAdviceEnsemble(CodeSynthesisConvertSingle):
         # set prompt for generator
         gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}

-        pred, _, stats = self.code_champion_generator(candidate, None, **gen_kwargs)
+        pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
         advs = self._parse_multiple_outputs(pred, outputs=[f"Idea {i}" for i in range(1, limit + 1)])

         return advs, stats

     def _synthesize_field_code(
-        self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, num_exemplars: int = 1, *args, **kwargs
+        self,
+        candidate: DataRecord,
+        api: API,
+        output_field_name: str,
+        code_ensemble_num: int = 1,
+        num_exemplars: int = 1,
+        *args,
+        **kwargs,
     ):
         # a more advanced approach in which advice is first solicited, and then
         # provided as context when synthesizing the code ensemble
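Note on the hunks above: the codegen fallback now instantiates LLMConvertBonded (LLMConvertConventional is removed in 0.7.0), and generator calls return four values instead of three. A minimal caller-side sketch of the new call shape, using only signatures that appear in this diff; the json_output=False flag is passed wherever raw text (code, ideas) rather than parsed JSON is expected:

    # 0.6.4: pred, _, stats = self.code_champion_generator(candidate, None, **gen_kwargs)
    # 0.7.0: a fourth return value (the message history) is added; ignore it with `_`
    pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)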
palimpzest/query/operators/convert.py

@@ -2,7 +2,7 @@ from __future__ import annotations

 import time
 from abc import ABC, abstractmethod
-from typing import Any, Callable
+from typing import Callable

 from palimpzest.constants import (
     MODEL_CARDS,
@@ -15,13 +15,11 @@ from palimpzest.constants import (
 )
 from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.core.lib.fields import Field
 from palimpzest.query.generators.generators import generator_factory
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.utils.model_helpers import get_vision_models

-# TYPE DEFINITIONS
-FieldName = str
-

 class ConvertOp(PhysicalOperator, ABC):
     def __init__(
@@ -49,18 +47,13 @@ class ConvertOp(PhysicalOperator, ABC):

     def get_op_params(self):
         op_params = super().get_op_params()
-        op_params = {
-            "cardinality": self.cardinality,
-            "udf": self.udf,
-            "desc": self.desc,
-            **op_params
-        }
+        op_params = {"cardinality": self.cardinality, "udf": self.udf, "desc": self.desc, **op_params}

         return op_params

     def _create_data_records_from_field_answers(
         self,
-        field_answers: dict[FieldName, list[Any]],
+        field_answers: dict[str, list],
         candidate: DataRecord,
     ) -> list[DataRecord]:
         """
@@ -94,7 +87,7 @@ class ConvertOp(PhysicalOperator, ABC):
             if field not in input_fields:
                 value = field_answers[field][idx] if idx < len(field_answers[field]) else None
                 setattr(dr, field, value)
-
+
             # append data record to list of output data records
             drs.append(dr)

@@ -103,7 +96,7 @@ class ConvertOp(PhysicalOperator, ABC):
     def _create_record_set(
         self,
         records: list[DataRecord],
-        fields: list[str],
+        field_names: list[str],
         generation_stats: GenerationStats,
         total_time: float,
         successful_convert: bool,
@@ -128,15 +121,17 @@ class ConvertOp(PhysicalOperator, ABC):
                 time_per_record=time_per_record,
                 cost_per_record=per_record_stats.cost_per_record,
                 model_name=self.get_model_name(),
-                answer={field_name: getattr(dr, field_name) for field_name in fields},
+                answer={field_name: getattr(dr, field_name) for field_name in field_names},
                 input_fields=self.input_schema.field_names(),
-                generated_fields=fields,
+                generated_fields=field_names,
                 total_input_tokens=per_record_stats.total_input_tokens,
                 total_output_tokens=per_record_stats.total_output_tokens,
                 total_input_cost=per_record_stats.total_input_cost,
                 total_output_cost=per_record_stats.total_output_cost,
                 llm_call_duration_secs=per_record_stats.llm_call_duration_secs,
                 fn_call_duration_secs=per_record_stats.fn_call_duration_secs,
+                total_llm_calls=per_record_stats.total_llm_calls,
+                total_embedding_llm_calls=per_record_stats.total_embedding_llm_calls,
                 failed_convert=(not successful_convert),
                 image_operation=self.is_image_conversion(),
                 op_details={k: str(v) for k, v in self.get_id_params().items()},
@@ -153,7 +148,7 @@ class ConvertOp(PhysicalOperator, ABC):
         pass

     @abstractmethod
-    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
+    def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
         """
         This abstract method will be implemented by subclasses of ConvertOp to process the input DataRecord
         and generate the value(s) for each of the specified fields. If the convert operator is a one-to-many
@@ -187,7 +182,8 @@ class ConvertOp(PhysicalOperator, ABC):

         # execute the convert
         field_answers: dict[str, list]
-        field_answers, generation_stats = self.convert(candidate=candidate, fields=fields_to_generate)
+        fields = {field: field_type for field, field_type in self.output_schema.field_map().items() if field in fields_to_generate}
+        field_answers, generation_stats = self.convert(candidate=candidate, fields=fields)
         assert all([field in field_answers for field in fields_to_generate]), "Not all fields were generated!"

         # replace any None values with an empty list; subclasses may override __call__ to change this behavior
@@ -199,7 +195,7 @@ class ConvertOp(PhysicalOperator, ABC):
         # construct and return DataRecordSet
         record_set = self._create_record_set(
             records=drs,
-            fields=fields_to_generate,
+            field_names=fields_to_generate,
             generation_stats=generation_stats,
             total_time=time.time() - start_time,
             successful_convert=successful_convert,
@@ -211,7 +207,7 @@ class ConvertOp(PhysicalOperator, ABC):
 class NonLLMConvert(ConvertOp):
     def __str__(self):
         op = super().__str__()
-        op += f" UDF: {str(self.udf)}\n"
+        op += f" UDF: {self.udf.__name__}\n"
         return op

     def is_image_conversion(self) -> bool:
@@ -239,7 +235,7 @@ class NonLLMConvert(ConvertOp):
             quality=1.0,
         )

-    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
+    def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
         # apply UDF to input record
         start_time = time.time()
         field_answers = {}
@@ -249,7 +245,9 @@ class NonLLMConvert(ConvertOp):

             if self.cardinality == Cardinality.ONE_TO_ONE:
                 # answer should be a dictionary
-                assert isinstance(answer, dict), "UDF must return a dictionary mapping each generated field to its value for one-to-one converts"
+                assert isinstance(answer, dict), (
+                    "UDF must return a dictionary mapping each generated field to its value for one-to-one converts"
+                )

                 # wrap each answer in a list
                 field_answers = {field_name: [answer[field_name]] for field_name in fields}
@@ -263,7 +261,7 @@ class NonLLMConvert(ConvertOp):
                         field_answers[field_name].append(answer_dict.get(field_name, None))

             if self.verbose:
-                print(f"{str(self.udf)}:\n{answer}")
+                print(f"{self.udf.__name__}:\n{answer}")

         except Exception as e:
             print(f"Error invoking user-defined function for convert: {e}")
@@ -279,6 +277,7 @@ class LLMConvert(ConvertOp):
     """
     This is the base class for convert operations which use an LLM to generate the output fields.
     """
+
     def __init__(
         self,
         model: Model,
@@ -337,9 +336,7 @@ class LLMConvert(ConvertOp):
         # get est. of conversion time per record from model card;
         # NOTE: model will only be None for code synthesis, which uses GPT-3.5 as fallback
         model_name = self.model.value if getattr(self, "model", None) is not None else Model.GPT_4o_MINI.value
-        model_conversion_time_per_record = (
-            MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens
-        )
+        model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens

         # get est. of conversion cost (in USD) per record from model card
         model_conversion_usd_per_record = (
@@ -362,74 +359,9 @@ class LLMConvert(ConvertOp):
         )


-class LLMConvertConventional(LLMConvert):
-    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
-        """
-        Update the cost per record and time per record estimates to account for the additional
-        LLM calls we incur by executing one query per-field.
-        """
-        # get naive cost estimates from LLMConvert
-        naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
-
-        # re-compute cost per record assuming we use fewer input tokens
-        est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS
-        est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
-
-        # increase estimates of the input and output tokens by the number of fields generated
-        # NOTE: this may over-estimate the number of fields that need to be generated
-        generate_field_names = []
-        for field_name in self.output_schema.field_names():
-            if field_name not in self.input_schema.field_names():
-                generate_field_names.append(field_name)
-
-        num_fields_to_generate = len(generate_field_names)
-        est_num_input_tokens *= num_fields_to_generate
-        est_num_output_tokens *= num_fields_to_generate
-
-        # get est. of conversion time per record from model card;
-        model_conversion_time_per_record = (
-            MODEL_CARDS[self.model.value]["seconds_per_output_token"] * est_num_output_tokens
-        )
-
-        # get est. of conversion cost (in USD) per record from model card
-        model_conversion_usd_per_record = (
-            MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
-            + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
-        )
-
-        # set refined estimate of time and cost per record
-        naive_op_cost_estimates.time_per_record = model_conversion_time_per_record
-        naive_op_cost_estimates.time_per_record_lower_bound = naive_op_cost_estimates.time_per_record
-        naive_op_cost_estimates.time_per_record_upper_bound = naive_op_cost_estimates.time_per_record
-        naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
-        naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
-        naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
-
-        return naive_op_cost_estimates
-
-    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
-        # get the set of input fields to use for the convert operation
-        input_fields = self.get_input_fields()
-
-        # construct kwargs for generation
-        gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
-
-        # generate outputs one field at a time
-        field_answers, generation_stats_lst = {}, []
-        for field in fields:
-            single_field_answers, _, single_field_stats = self.generator(candidate, [field], **gen_kwargs)
-            field_answers.update(single_field_answers)
-            generation_stats_lst.append(single_field_stats)
-
-        # aggregate generation stats into single object
-        generation_stats = sum(generation_stats_lst)
-
-        return field_answers, generation_stats
-
-
 class LLMConvertBonded(LLMConvert):

-    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
+    def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
         # get the set of input fields to use for the convert operation
         input_fields = self.get_input_fields()

@@ -437,13 +369,14 @@ class LLMConvertBonded(LLMConvert):
         gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}

         # generate outputs for all fields in a single query
-        field_answers, _, generation_stats = self.generator(candidate, fields, **gen_kwargs)  # TODO: guarantee negative output from generator is None
+        field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)

         # if there was an error for any field, execute a conventional query on that field
-        for field, answers in field_answers.items():
-            if answers is None:
-                single_field_answers, _, single_field_stats = self.generator(candidate, [field], **gen_kwargs)
-                field_answers.update(single_field_answers)
-                generation_stats += single_field_stats
+        if len(field_answers) > 1:
+            for field_name, answers in field_answers.items():
+                if answers is None:
+                    single_field_answers, _, single_field_stats, _ = self.generator(candidate, {field_name: fields[field_name]}, **gen_kwargs)
+                    field_answers.update(single_field_answers)
+                    generation_stats += single_field_stats

         return field_answers, generation_stats
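Note: the signature change from fields: list[str] to fields: dict[str, Field] is the center of this refactor; __call__ now builds the dict from the output schema before dispatching to convert(). A minimal sketch of the new contract inside a ConvertOp subclass, using only names from this diff (the field names are hypothetical):

    # map each field to generate onto its Field type from the output schema
    fields_to_generate = ["title", "year"]  # hypothetical field names
    fields = {
        name: field_type
        for name, field_type in self.output_schema.field_map().items()
        if name in fields_to_generate
    }
    # generators now return (answers, reasoning, stats, messages)
    field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)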
palimpzest/query/operators/critique_and_refine_convert.py

@@ -93,18 +93,17 @@ class CriticAndRefineConvert(LLMConvert):
         # NOTE: when I merge in the `abacus` branch, I will want to update this to reflect the changes I made to reasoning extraction
         # execute the initial model
         original_gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
-        field_answers, reasoning, original_gen_stats = self.generator(candidate, fields, **original_gen_kwargs)
-        original_output = f"REASONING: {reasoning}\nANSWER:{field_answers}\n"
-        original_messages = self.generator.get_messages()
+        field_answers, reasoning, original_gen_stats, original_messages = self.generator(candidate, fields, **original_gen_kwargs)
+        original_output = f"REASONING: {reasoning}\nANSWER: {field_answers}\n"

         # execute the critic model
         critic_gen_kwargs = {"original_output": original_output, "original_messages": original_messages, **original_gen_kwargs}
-        field_answers, reasoning, critic_gen_stats = self.critic_generator(candidate, fields, **critic_gen_kwargs)
-        critique_output = f"REASONING: {reasoning}\nANSWER:{field_answers}\n"
+        _, reasoning, critic_gen_stats, _ = self.critic_generator(candidate, fields, json_output=False, **critic_gen_kwargs)
+        critique_output = f"CRITIQUE: {reasoning}\n"

         # execute the refinement model
         refine_gen_kwargs = {"critique_output": critique_output, **critic_gen_kwargs}
-        field_answers, reasoning, refine_gen_stats = self.refine_generator(candidate, fields, **refine_gen_kwargs)
+        field_answers, reasoning, refine_gen_stats, _ = self.refine_generator(candidate, fields, **refine_gen_kwargs)

         # compute the total generation stats
         generation_stats = original_gen_stats + critic_gen_stats + refine_gen_stats
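Note: the critic stage no longer re-generates field answers; it runs in free-text mode and only its reasoning is carried forward as the critique. Restated as a sketch (the kwargs dicts are built exactly as in the hunk above):

    answers, reasoning, stats1, messages = self.generator(candidate, fields, **original_gen_kwargs)
    _, critique, stats2, _ = self.critic_generator(candidate, fields, json_output=False, **critic_gen_kwargs)
    answers, _, stats3, _ = self.refine_generator(candidate, fields, **refine_gen_kwargs)
    total_stats = stats1 + stats2 + stats3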
palimpzest/query/operators/filter.py

@@ -15,6 +15,7 @@ from palimpzest.constants import (
 )
 from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
 from palimpzest.core.elements.filters import Filter
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.core.lib.fields import BooleanField
 from palimpzest.query.generators.generators import generator_factory
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.utils.model_helpers import get_vision_models
@@ -96,6 +97,8 @@ class FilterOp(PhysicalOperator, ABC):
                 total_output_cost=generation_stats.total_output_cost,
                 llm_call_duration_secs=generation_stats.llm_call_duration_secs,
                 fn_call_duration_secs=generation_stats.fn_call_duration_secs,
+                total_llm_calls=generation_stats.total_llm_calls,
+                total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
                 answer=answer,
                 passed_operator=passed_operator,
                 image_operation=self.is_image_filter(),
@@ -248,14 +251,8 @@ class LLMFilter(FilterOp):
         # construct kwargs for generation
         gen_kwargs = {"project_cols": input_fields, "filter_condition": self.filter_obj.filter_condition}

-        # generate output
-        field_answers, _, generation_stats = self.generator(candidate, ["passed_operator"], **gen_kwargs)
+        # generate output; NOTE: BooleanField is used to indicate the output type; thus, the desc is not needed
+        fields = {"passed_operator": BooleanField(desc="")}
+        field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)

-        # compute whether the record passed the filter or not
-        passed_operator = False
-        if isinstance(field_answers["passed_operator"], str):
-            passed_operator = "true" in field_answers["passed_operator"].lower()
-        elif isinstance(field_answers["passed_operator"], bool):
-            passed_operator = field_answers["passed_operator"]
-
-        return {"passed_operator": passed_operator}, generation_stats
+        return field_answers, generation_stats
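Note: boolean parsing moves out of LLMFilter and into the typed generator; the field spec itself now carries the expected output type. A minimal sketch of the new contract, using only names from this diff:

    from palimpzest.core.lib.fields import BooleanField

    # the BooleanField tells the generator to coerce the model's answer to a bool,
    # replacing 0.6.4's manual check: passed = "true" in raw_answer.lower()
    fields = {"passed_operator": BooleanField(desc="")}
    field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)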
palimpzest/query/operators/logical.py

@@ -24,6 +24,7 @@ class LogicalOperator:
     - GroupByAggregate (applies a group by on the Set)
     - Aggregate (applies an aggregation on the Set)
     - RetrieveScan (fetches documents from a provided input for a given query)
+    - Map (applies a function to each record in the Set without adding any new columns)

     Every logical operator must declare the get_logical_id_params() and get_logical_op_params() methods,
     which return dictionaries of parameters that are used to compute the logical op id and to implement
@@ -41,11 +42,9 @@ class LogicalOperator:

         # compute the fields generated by this logical operator
         input_field_names = self.input_schema.field_names() if self.input_schema is not None else []
-        self.generated_fields = sorted([
-            field_name
-            for field_name in self.output_schema.field_names()
-            if field_name not in input_field_names
-        ])
+        self.generated_fields = sorted(
+            [field_name for field_name in self.output_schema.field_names() if field_name not in input_field_names]
+        )

     def __str__(self) -> str:
         raise NotImplementedError("Abstract method")
@@ -76,7 +75,7 @@ class LogicalOperator:
         """
         Returns a dictionary mapping of logical operator parameters which may be used to
         implement a physical operator associated with this logical operation.
-
+
         NOTE: Should be overriden by subclasses to include class-specific parameters.
         """
         return {"input_schema": self.input_schema, "output_schema": self.output_schema}
@@ -101,6 +100,10 @@ class LogicalOperator:

         return self.logical_op_id

+    def get_generated_fields(self) -> list[str]:
+        """Returns the names of the fields generated by this logical operator."""
+        return self.generated_fields
+
     def __hash__(self):
         if not self.logical_op_id:
             raise ValueError("logical_op_id not set, unable to hash")
@@ -278,6 +281,7 @@ class FilteredScan(LogicalOperator):

         return logical_op_params

+
 class GroupByAggregate(LogicalOperator):
     def __init__(
         self,
@@ -314,6 +318,7 @@ class GroupByAggregate(LogicalOperator):

         return logical_op_params

+
 class LimitScan(LogicalOperator):
     def __init__(self, limit: int, target_cache_id: str | None = None, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -374,7 +379,7 @@ class RetrieveScan(LogicalOperator):
         index,
         search_func,
         search_attr,
-        output_attr,
+        output_attrs,
         k,
         target_cache_id: str = None,
         *args,
@@ -384,7 +389,7 @@ class RetrieveScan(LogicalOperator):
         self.index = index
         self.search_func = search_func
         self.search_attr = search_attr
-        self.output_attr = output_attr
+        self.output_attrs = output_attrs
         self.k = k
         self.target_cache_id = target_cache_id

@@ -398,7 +403,7 @@ class RetrieveScan(LogicalOperator):
         logical_id_params = super().get_logical_id_params()
         logical_id_params = {
             "search_attr": self.search_attr,
-            "output_attr": self.output_attr,
+            "output_attrs": self.output_attrs,
             "k": self.k,
             **logical_id_params,
         }
@@ -411,10 +416,49 @@ class RetrieveScan(LogicalOperator):
             "index": self.index,
             "search_func": self.search_func,
             "search_attr": self.search_attr,
-            "output_attr": self.output_attr,
+            "output_attrs": self.output_attrs,
             "k": self.k,
             "target_cache_id": self.target_cache_id,
             **logical_op_params,
         }

         return logical_op_params
+
+
+# TODO: (near-term) maybe we should try to fold this into ConvertScan, and make the internals of PZ
+# amenable to a convert operator (with a UDF) that does not add new columns?
+class MapScan(LogicalOperator):
+    """A MapScan is a logical operator that applies a UDF to each input record without adding new columns."""
+
+    def __init__(
+        self,
+        udf: Callable | None = None,
+        target_cache_id: str | None = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.udf = udf
+        self.target_cache_id = target_cache_id
+
+    def __str__(self):
+        return f"MapScan({self.output_schema}, {self.udf.__name__})"
+
+    def get_logical_id_params(self) -> dict:
+        logical_id_params = super().get_logical_id_params()
+        logical_id_params = {
+            "udf": self.udf,
+            **logical_id_params,
+        }
+
+        return logical_id_params
+
+    def get_logical_op_params(self) -> dict:
+        logical_op_params = super().get_logical_op_params()
+        logical_op_params = {
+            "udf": self.udf,
+            "target_cache_id": self.target_cache_id,
+            **logical_op_params,
+        }
+
+        return logical_op_params
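A minimal construction sketch for the new MapScan operator (the schema and UDF are hypothetical placeholders; in normal use logical operators are built by the planner rather than by hand):

    from palimpzest.query.operators.logical import MapScan

    def redact_emails(record):
        # hypothetical in-place transformation; a MapScan adds no new columns
        record.text = record.text.replace("@", " [at] ")
        return record

    # MySchema is a placeholder; input and output schemas match because no fields are added
    map_op = MapScan(udf=redact_emails, input_schema=MySchema, output_schema=MySchema)
    print(map_op)  # MapScan(<MySchema>, redact_emails)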