palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.20.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,488 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any
4
-
5
- from palimpzest.constants import Cardinality, GPT_4o_MODEL_CARD, Model
6
- from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
7
- from palimpzest.core.elements.records import DataRecord
8
- from palimpzest.prompts import ADVICEGEN_PROMPT, CODEGEN_PROMPT, EXAMPLE_PROMPT
9
- from palimpzest.query.generators.generators import code_ensemble_execution, generator_factory
10
- from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded
11
- from palimpzest.utils.sandbox import API
12
-
13
- # TYPE DEFINITIONS
14
- FieldName = str
15
- CodeName = str
16
- Code = str
17
- DataRecordDict = dict[str, Any]
18
- Exemplar = tuple[DataRecordDict, DataRecordDict]
19
- CodeEnsemble = dict[CodeName, Code]
20
-
21
-
22
- class CodeSynthesisConvert(LLMConvert):
23
- def __init__(
24
- self,
25
- exemplar_generation_model: Model = Model.GPT_4o,
26
- code_synth_model: Model = Model.GPT_4o,
27
- fallback_model: Model = Model.GPT_4o_MINI,
28
- *args,
29
- **kwargs,
30
- ):
31
- kwargs["model"] = None
32
- super().__init__(*args, **kwargs)
33
-
34
- # set models
35
- self.exemplar_generation_model = exemplar_generation_model
36
- self.code_synth_model = code_synth_model
37
- self.fallback_model = fallback_model
38
-
39
- # initialize parameters
40
- self.field_to_code_ensemble = None
41
- self.exemplars = []
42
- self.code_synthesized = False
43
- self.code_champion_generator = generator_factory(
44
- model=self.code_synth_model,
45
- prompt_strategy=self.prompt_strategy,
46
- cardinality=Cardinality.ONE_TO_ONE,
47
- verbose=self.verbose,
48
- )
49
- self.field_to_code_ensemble = {}
50
-
51
- def __str__(self):
52
- op = super().__str__()
53
- op += f" Code Synth Strategy: {self.__class__.__name__}\n"
54
- return op
55
-
56
- def get_id_params(self):
57
- id_params = super().get_id_params()
58
- id_params = {
59
- "exemplar_generation_model": self.exemplar_generation_model.value,
60
- "code_synth_model": self.code_synth_model.value,
61
- "fallback_model": self.fallback_model.value,
62
- **id_params,
63
- }
64
-
65
- return id_params
66
-
67
- def get_op_params(self):
68
- op_params = super().get_op_params()
69
- op_params = {
70
- "exemplar_generation_model": self.exemplar_generation_model,
71
- "code_synth_model": self.code_synth_model,
72
- "fallback_model": self.fallback_model,
73
- **op_params,
74
- }
75
-
76
- return op_params
77
-
78
- def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
79
- """
80
- Currently we are using GPT-4 to generate code which we can then invoke on subsequent
81
- inputs to this operator. To reflect this in our naive cost estimates, we assume that
82
- the time_per_record is low (about the time it takes to execute a cheap python function)
83
- and that the cost_per_record is also low (we amortize the cost of code generation across
84
- all records). For our quality estimate, we naively assume some degredation in quality.
85
- In practice, this naive quality estimate will be overwritten by the CostModel's estimate
86
- once it executes a few code generated examples.
87
- """
88
- naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
89
- naive_op_cost_estimates.time_per_record = 1e-5
90
- naive_op_cost_estimates.time_per_record_lower_bound = 1e-5
91
- naive_op_cost_estimates.time_per_record_upper_bound = 1e-5
92
- naive_op_cost_estimates.cost_per_record = 1e-6 # amortize code synth cost across records
93
- naive_op_cost_estimates.cost_per_record_lower_bound = 1e-6
94
- naive_op_cost_estimates.cost_per_record_upper_bound = 1e-6
95
- naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * (GPT_4o_MODEL_CARD["code"] / 100.0) * 0.7
96
- naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
97
- naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
98
-
99
- return naive_op_cost_estimates
100
-
101
- def _should_synthesize(
102
- self, exemplars: list[Exemplar], num_exemplars: int = 1, code_regenerate_frequency: int = 200, *args, **kwargs
103
- ) -> bool:
104
- """This function determines whether code synthesis should be performed based on the strategy and the number of exemplars available."""
105
- raise NotImplementedError("This method should be implemented in a subclass")
106
-
107
- def _synthesize_field_code(
108
- self,
109
- candidate: DataRecord,
110
- api: API,
111
- output_field_name: str,
112
- code_ensemble_num: int = 1, # if strategy != SINGLE
113
- num_exemplars: int = 1, # if strategy != EXAMPLE_ENSEMBLE
114
- ) -> tuple[dict[CodeName, Code], GenerationStats]:
115
- """This method is responsible for synthesizing the code on a per-field basis.
116
- Wrapping different calls to the LLM and returning a set of per-field query statistics.
117
- The format of the code ensemble dictionary is {code_name: code} where code_name is a string and code is a string representing the code.
118
- """
119
- raise NotImplementedError("This method should be implemented in a subclass")
120
-
121
- def synthesize_code_ensemble(self, fields_to_generate, candidate: DataRecord, *args, **kwargs):
122
- """This function is a wrapper around specific code synthesis methods
123
- that wraps the synthesized code per-field in a dictionary and returns the stats object.
124
- """
125
- # synthesize the per-field code ensembles
126
- field_to_code_ensemble = {}
127
- generation_stats = GenerationStats()
128
- for field_name in fields_to_generate:
129
- api = API.from_input_output_schemas(
130
- input_schema=self.input_schema,
131
- output_schema=self.output_schema,
132
- field_name=field_name,
133
- input_fields=candidate.get_field_names(),
134
- )
135
-
136
- # TODO here _synthesize_code should be called with the right parameters per-code-strategy?!
137
- code_ensemble, code_synth_stats = self._synthesize_field_code(candidate, api, field_name)
138
-
139
- # update mapping from fields to code ensemble and generation stats
140
- field_to_code_ensemble[field_name] = code_ensemble
141
- generation_stats += code_synth_stats
142
-
143
- if self.verbose:
144
- for code_name, code in code_ensemble.items():
145
- print(f"CODE NAME: {code_name}")
146
- print("-----------------------")
147
- print(code)
148
-
149
- # set field_to_code_ensemble and code_synthesized to True
150
- return field_to_code_ensemble, generation_stats
151
-
152
- def _bonded_query_fallback(
153
- self, candidate: DataRecord
154
- ) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
155
- fields_to_generate = self.get_fields_to_generate(candidate)
156
- projected_candidate = candidate.copy(include_bytes=False, project_cols=self.depends_on)
157
-
158
- # execute the bonded convert
159
- bonded_op = LLMConvertBonded(
160
- input_schema=self.input_schema,
161
- output_schema=self.output_schema,
162
- model=self.exemplar_generation_model,
163
- prompt_strategy=self.prompt_strategy,
164
- )
165
- field_answers, generation_stats = bonded_op.convert(projected_candidate, fields_to_generate)
166
- assert all([field in field_answers for field in fields_to_generate]), "Not all fields were generated!"
167
-
168
- # for the vanilla LLMConvert, we simply replace any None values with an empty list
169
- field_answers = {field: [] if answers is None else answers for field, answers in field_answers.items()}
170
-
171
- # transform the mapping from fields to answers into a (list of) DataRecord(s)
172
- drs, _ = self._create_data_records_from_field_answers(field_answers, candidate)
173
-
174
- # NOTE: this now includes bytes input fields which will show up as: `field_name = "<bytes>"`;
175
- # keep an eye out for a regression in code synth performance and revert if necessary
176
- # update operator's set of exemplars
177
- exemplars = [(projected_candidate.to_dict(include_bytes=False), dr.to_dict(include_bytes=False)) for dr in drs]
178
- self.exemplars.extend(exemplars)
179
-
180
- return field_answers, generation_stats
181
-
182
- def is_image_conversion(self):
183
- """Code synthesis is disallowed on image conversions, so this must be False."""
184
- return False
185
-
186
- def convert(
187
- self, candidate: DataRecord, fields: list[str] | None = None
188
- ) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
189
- # get the dictionary fields for the candidate
190
- candidate_dict = candidate.to_dict(include_bytes=False, project_cols=self.depends_on)
191
-
192
- # Check if code was already synthesized, or if we have at least one converted sample
193
- generation_stats = GenerationStats()
194
- if self._should_synthesize():
195
- self.field_to_code_ensemble, total_code_synth_stats = self.synthesize_code_ensemble(fields, candidate)
196
- self.code_synthesized = True
197
- generation_stats += total_code_synth_stats
198
-
199
- # if we have yet to synthesize code (perhaps b/c we are waiting for more exemplars),
200
- # use the exemplar generation model to perform the convert (and generate high-quality
201
- # exemplars) using a bonded query
202
- if not len(self.field_to_code_ensemble):
203
- return self._bonded_query_fallback(candidate)
204
-
205
- # if we have synthesized code run it on each field
206
- field_answers = {}
207
- for field_name in fields:
208
- # create api instance for executing python code
209
- api = API.from_input_output_schemas(
210
- input_schema=self.input_schema,
211
- output_schema=self.output_schema,
212
- field_name=field_name,
213
- input_fields=candidate_dict.keys(),
214
- )
215
- code_ensemble = self.field_to_code_ensemble[field_name]
216
-
217
- # execute the code ensemble to get the answer
218
- answer, _, exec_stats = code_ensemble_execution(api, code_ensemble, candidate_dict)
219
-
220
- # if the answer is not None, update the field_answers
221
- # NOTE: the answer will not be a list because code synth. is disallowed for one-to-many converts
222
- if answer is not None:
223
- generation_stats += exec_stats
224
- field_answers[field_name] = [answer]
225
-
226
- else:
227
- # if there is a failure, run a conventional llm convert query for the field
228
- if self.verbose:
229
- print(f"CODEGEN FALLING BACK TO CONVENTIONAL FOR FIELD {field_name}")
230
-
231
- # execute the conventional llm convert
232
- convert_op = LLMConvertBonded(
233
- input_schema=self.input_schema,
234
- output_schema=self.output_schema,
235
- model=self.fallback_model,
236
- prompt_strategy=self.prompt_strategy,
237
- )
238
- single_field_answers, single_field_stats = convert_op.convert(candidate, [field_name])
239
-
240
- # include code execution time in single_field_stats
241
- single_field_stats.fn_call_duration_secs += exec_stats.fn_call_duration_secs
242
-
243
- # update generation_stats
244
- generation_stats += single_field_stats
245
-
246
- # update field answers
247
- # NOTE: because code synth. is disallowed for one-to-many queries, we make the first answer a singleton
248
- field_answers[field_name] = (
249
- [single_field_answers[field_name][0]]
250
- if single_field_answers[field_name] is not None and len(single_field_answers[field_name]) > 0
251
- else []
252
- )
253
-
254
- assert all([field in field_answers for field in fields]), "Not all fields were generated!"
255
-
256
- # for the vanilla LLMConvert, we simply replace any None values with an empty list
257
- field_answers = {field: [] if answers is None else answers for field, answers in field_answers.items()}
258
-
259
- return field_answers, generation_stats
260
-
261
-
262
- class CodeSynthesisConvertNone(CodeSynthesisConvert):
263
- def _should_synthesize(self, *args, **kwargs):
264
- return False
265
-
266
- def _synthesize_field_code(self, candidate: DataRecord, api: API, *args, **kwargs):
267
- code = api.api_def() + " return None\n"
268
- code_ensemble = {"{api.name}_v0": code}
269
- return code_ensemble, GenerationStats()
270
-
271
-
272
- class CodeSynthesisConvertSingle(CodeSynthesisConvert):
273
- def _should_synthesize(self, num_exemplars: int = 1, *args, **kwargs) -> bool:
274
- """This function determines whether code synthesis
275
- should be performed based on the strategy and the number of exemplars available."""
276
- if len(self.exemplars) < num_exemplars:
277
- return False
278
- return not self.code_synthesized
279
-
280
- def _code_synth_single(
281
- self,
282
- candidate: DataRecord,
283
- api: API,
284
- output_field_name: str,
285
- exemplars: list[Exemplar] | None = None,
286
- advice: str | None = None,
287
- language="Python",
288
- ):
289
- if exemplars is None:
290
- exemplars = []
291
-
292
- context = {
293
- "language": language,
294
- "api": api.args_call(),
295
- "output": api.output,
296
- "inputs_desc": "\n".join(
297
- [f"- {field_name} ({api.input_descs[i]})" for i, field_name in enumerate(api.inputs)]
298
- ),
299
- "output_desc": api.output_desc,
300
- "examples_desc": "\n".join(
301
- [
302
- EXAMPLE_PROMPT.format(
303
- idx=f" {i}",
304
- example_inputs="\n".join(
305
- [f"- {field_name} = {repr(example[0][field_name])}" for field_name in example[0]]
306
- ),
307
- example_output=f"{example[1][output_field_name]}",
308
- )
309
- for i, example in enumerate(exemplars)
310
- ]
311
- ),
312
- "advice": f"Hint: {advice}" if advice else "",
313
- }
314
-
315
- prompt = CODEGEN_PROMPT.format(**context)
316
- if self.verbose:
317
- print("PROMPT")
318
- print("-------")
319
- print(f"{prompt}")
320
-
321
- # set prompt for generator
322
- gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}
323
-
324
- # invoke the champion model to generate the code
325
- pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
326
- ordered_keys = [f"```{language}", f"```{language.lower()}", "```"]
327
- code = None
328
- if not pred:
329
- return code, stats
330
-
331
- for key in ordered_keys:
332
- if key in pred:
333
- code = pred.split(key)[1].split("```")[0].strip()
334
- break
335
-
336
- if self.verbose:
337
- print("-------")
338
- print("SYNTHESIZED CODE")
339
- print("---------------")
340
- print(f"{code}")
341
-
342
- return code, stats
343
-
344
- def _synthesize_field_code(
345
- self, candidate: DataRecord, api: API, output_field_name: str, num_exemplars: int = 1, *args, **kwargs
346
- ):
347
- code, generation_stats = self._code_synth_single(
348
- candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars]
349
- )
350
- code_ensemble = {f"{api.name}_v0": code}
351
- return code_ensemble, generation_stats
352
-
353
-
354
- # NOTE A nicer truly class based approach would re-implement the code_synth_single method with calls to
355
- # __super__ and then only re-implement the differences instead of having the code in the superclass know
356
- # about the subclass-specific parameters (i.e., advice).
357
- class CodeSynthesisConvertExampleEnsemble(CodeSynthesisConvertSingle):
358
- def _should_synthesize(self, num_exemplars: int = 1, *args, **kwargs) -> bool:
359
- if len(self.exemplars) < num_exemplars:
360
- return False
361
- return not self.code_synthesized
362
-
363
- def _synthesize_field_code(
364
- self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, *args, **kwargs
365
- ):
366
- # creates an ensemble of `code_ensemble_num` synthesized functions; each of
367
- # which uses a different exemplar (modulo the # of exemplars) for its synthesis
368
- code_ensemble = {}
369
- generation_stats = GenerationStats()
370
- for i in range(code_ensemble_num):
371
- code_name = f"{api.name}_v{i}"
372
- exemplar = self.exemplars[i % len(self.exemplars)]
373
- code, stats = self._code_synth_single(candidate, api, output_field_name, exemplars=[exemplar])
374
- code_ensemble[code_name] = code
375
- generation_stats += stats
376
-
377
- return code_ensemble, generation_stats
378
-
379
-
380
- class CodeSynthesisConvertAdviceEnsemble(CodeSynthesisConvertSingle):
381
- def _should_synthesize(self, *args, **kwargs):
382
- return False
383
-
384
- def _parse_multiple_outputs(self, text, outputs=None):
385
- if outputs is None:
386
- outputs = ["Thought", "Action"]
387
- data = {}
388
- for key in reversed(outputs):
389
- if key + ":" in text:
390
- remain, value = text.rsplit(key + ":", 1)
391
- data[key.lower()] = value.strip()
392
- text = remain
393
- else:
394
- data[key.lower()] = None
395
- return data
396
-
397
- def _synthesize_advice(
398
- self,
399
- candidate: DataRecord,
400
- api: API,
401
- output_field_name: str,
402
- exemplars: list[Exemplar] | None = None,
403
- language="Python",
404
- n_advices=4,
405
- limit: int = 3,
406
- ):
407
- if exemplars is None:
408
- exemplars = []
409
- context = {
410
- "language": language,
411
- "api": api.args_call(),
412
- "output": api.output,
413
- "inputs_desc": "\n".join(
414
- [f"- {field_name} ({api.input_descs[i]})" for i, field_name in enumerate(api.inputs)]
415
- ),
416
- "output_desc": api.output_desc,
417
- "examples_desc": "\n".join(
418
- [
419
- EXAMPLE_PROMPT.format(
420
- idx=f" {i}",
421
- example_inputs="\n".join(
422
- [f"- {field_name} = {repr(example[0][field_name])}" for field_name in example[0]]
423
- ),
424
- example_output=f"{example[1][output_field_name]}",
425
- )
426
- for i, example in enumerate(exemplars)
427
- ]
428
- ),
429
- "n": n_advices,
430
- }
431
- prompt = ADVICEGEN_PROMPT.format(**context)
432
-
433
- # set prompt for generator
434
- gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}
435
-
436
- pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
437
- advs = self._parse_multiple_outputs(pred, outputs=[f"Idea {i}" for i in range(1, limit + 1)])
438
-
439
- return advs, stats
440
-
441
- def _synthesize_field_code(
442
- self,
443
- candidate: DataRecord,
444
- api: API,
445
- output_field_name: str,
446
- code_ensemble_num: int = 1,
447
- num_exemplars: int = 1,
448
- *args,
449
- **kwargs,
450
- ):
451
- # a more advanced approach in which advice is first solicited, and then
452
- # provided as context when synthesizing the code ensemble
453
- output_stats = {}
454
- # solicit advice
455
- advices, adv_stats = self._synthesize_advice(
456
- candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars], n_advices=code_ensemble_num
457
- )
458
- for key, value in adv_stats.items():
459
- if isinstance(value, dict):
460
- for k2, v2 in value.items():
461
- output_stats[k2] = output_stats.get(k2, 0) + v2
462
- else:
463
- output_stats[key] += output_stats.get(key, type(value)()) + value
464
-
465
- code_ensemble = {}
466
- # synthesize code ensemble
467
- for i, adv in enumerate(advices):
468
- code_name = f"{api.name}_v{i}"
469
- code, stats = self._code_synth_single(
470
- candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars], advice=adv
471
- )
472
- code_ensemble[code_name] = code
473
- for key in output_stats:
474
- output_stats[key] += stats[key]
475
- return code_ensemble, output_stats
476
-
477
-
478
- class CodeSynthesisConvertAdviceEnsembleValidation(CodeSynthesisConvert):
479
- def _should_synthesize(self, code_regenerate_frequency: int = 200, *args, **kwargs):
480
- return len(self.exemplars) % code_regenerate_frequency == 0
481
-
482
- def _synthesize_field_code(
483
- self, api: API, output_field_name: str, exemplars: list[Exemplar] = None, *args, **kwargs
484
- ):
485
- # TODO this was not implemented ?
486
- if exemplars is None:
487
- exemplars = []
488
- raise Exception("not implemented yet")
@@ -1,130 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import time
4
- from typing import Callable
5
-
6
- from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
7
- from palimpzest.core.elements.records import DataRecord, DataRecordSet
8
- from palimpzest.core.lib.fields import Field
9
- from palimpzest.query.operators.physical import PhysicalOperator
10
-
11
-
12
- class MapOp(PhysicalOperator):
13
- def __init__(self, udf: Callable | None = None, *args, **kwargs):
14
- super().__init__(*args, **kwargs)
15
- self.udf = udf
16
-
17
- def __str__(self):
18
- op = super().__str__()
19
- op += f" UDF: {self.udf.__name__}\n"
20
- return op
21
-
22
- def get_id_params(self):
23
- id_params = super().get_id_params()
24
- id_params = {"udf": self.udf, **id_params}
25
-
26
- return id_params
27
-
28
- def get_op_params(self):
29
- op_params = super().get_op_params()
30
- op_params = {"udf": self.udf, **op_params}
31
-
32
- return op_params
33
-
34
- def _create_record_set(
35
- self,
36
- record: DataRecord,
37
- generation_stats: GenerationStats,
38
- total_time: float,
39
- ) -> DataRecordSet:
40
- """
41
- Given an input DataRecord and a determination of whether it passed the filter or not,
42
- construct the resulting RecordSet.
43
- """
44
- # create RecordOpStats object
45
- record_op_stats = RecordOpStats(
46
- record_id=record.id,
47
- record_parent_id=record.parent_id,
48
- record_source_idx=record.source_idx,
49
- record_state=record.to_dict(include_bytes=False),
50
- full_op_id=self.get_full_op_id(),
51
- logical_op_id=self.logical_op_id,
52
- op_name=self.op_name(),
53
- time_per_record=total_time,
54
- cost_per_record=0.0,
55
- fn_call_duration_secs=generation_stats.fn_call_duration_secs,
56
- answer=record.to_dict(include_bytes=False),
57
- op_details={k: str(v) for k, v in self.get_id_params().items()},
58
- )
59
-
60
- return DataRecordSet([record], [record_op_stats])
61
-
62
- def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
63
- """
64
- Compute naive cost estimates for the Map operation. These estimates assume that the map UDF
65
- (1) has no cost and (2) has perfect quality.
66
- """
67
- # estimate 1 ms single-threaded execution for udf function
68
- time_per_record = 0.001
69
-
70
- # assume filter fn has perfect quality
71
- return OperatorCostEstimates(
72
- cardinality=source_op_cost_estimates.cardinality,
73
- time_per_record=time_per_record,
74
- cost_per_record=0.0,
75
- quality=1.0,
76
- )
77
-
78
- def map(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
79
- # apply UDF to input record
80
- start_time = time.time()
81
- field_answers = {}
82
- try:
83
- # execute the UDF function
84
- field_answers = self.udf(candidate.to_dict())
85
-
86
- # answer should be a dictionary
87
- assert isinstance(field_answers, dict), (
88
- "UDF must return a dictionary mapping each input field to its value for map operations"
89
- )
90
-
91
- if self.verbose:
92
- print(f"{self.udf.__name__}")
93
-
94
- except Exception as e:
95
- print(f"Error invoking user-defined function for map: {e}")
96
- raise e
97
-
98
- # create generation stats object containing the time spent executing the UDF function
99
- generation_stats = GenerationStats(fn_call_duration_secs=time.time() - start_time)
100
-
101
- return field_answers, generation_stats
102
-
103
-
104
- def __call__(self, candidate: DataRecord) -> DataRecordSet:
105
- """
106
- This method converts an input DataRecord into an output DataRecordSet. The output DataRecordSet contains the
107
- DataRecord(s) output by the operator's convert() method and their corresponding RecordOpStats objects.
108
- Some subclasses may override this __call__method to implement their own custom logic.
109
- """
110
- start_time = time.time()
111
-
112
- # execute the map operation
113
- field_answers: dict[str, list]
114
- fields = {field: field_type for field, field_type in self.output_schema.field_map().items()}
115
- field_answers, generation_stats = self.map(candidate=candidate, fields=fields)
116
- assert all([field in field_answers for field in fields]), "Not all fields are present in output of map!"
117
-
118
- # construct DataRecord from field_answers
119
- dr = DataRecord.from_parent(schema=self.output_schema, parent_record=candidate)
120
- for field_name, field_value in field_answers.items():
121
- dr[field_name] = field_value
122
-
123
- # construct and return DataRecordSet
124
- record_set = self._create_record_set(
125
- record=dr,
126
- generation_stats=generation_stats,
127
- total_time=time.time() - start_time,
128
- )
129
-
130
- return record_set
@@ -1,33 +0,0 @@
1
- import logging
2
-
3
- from palimpzest.core.data.dataclasses import ExecutionStats
4
- from palimpzest.core.elements.records import DataRecordCollection
5
- from palimpzest.query.processor.query_processor import QueryProcessor
6
-
7
- logger = logging.getLogger(__name__)
8
-
9
- class NoSentinelQueryProcessor(QueryProcessor):
10
- """
11
- Query processor that uses naive cost estimates to select the best plan.
12
- """
13
-
14
- # TODO: Consider to support dry_run.
15
- def execute(self) -> DataRecordCollection:
16
- logger.info("Executing NoSentinelQueryProcessor")
17
-
18
- # create execution stats
19
- execution_stats = ExecutionStats(execution_id=self.execution_id())
20
- execution_stats.start()
21
-
22
- # execute plan(s) according to the optimization strategy
23
- records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)
24
-
25
- # update the execution stats to account for the work to execute the final plan
26
- execution_stats.add_plan_stats(plan_stats)
27
- execution_stats.finish()
28
-
29
- # construct and return the DataRecordCollection
30
- result = DataRecordCollection(records, execution_stats=execution_stats)
31
- logger.info("Done executing NoSentinelQueryProcessor")
32
-
33
- return result
@@ -1,28 +0,0 @@
1
- from enum import Enum
2
-
3
- from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
4
- from palimpzest.query.processor.nosentinel_processor import NoSentinelQueryProcessor
5
- from palimpzest.query.processor.sentinel_processor import SentinelQueryProcessor
6
- from palimpzest.query.processor.streaming_processor import StreamingQueryProcessor
7
-
8
-
9
- class ProcessingStrategyType(Enum):
10
- """How to generate and optimize query plans"""
11
- SENTINEL = SentinelQueryProcessor
12
- NO_SENTINEL = NoSentinelQueryProcessor
13
- STREAMING = StreamingQueryProcessor
14
-
15
- def valid_execution_strategies(self) -> list[ExecutionStrategyType]:
16
- """
17
- Returns a list of valid execution strategies for the given processing strategy.
18
- """
19
- if self == ProcessingStrategyType.SENTINEL or self == ProcessingStrategyType.NO_SENTINEL:
20
- return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL, ExecutionStrategyType.SEQUENTIAL_PARALLEL]
21
- elif self == ProcessingStrategyType.STREAMING:
22
- return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL, ExecutionStrategyType.SEQUENTIAL_PARALLEL]
23
-
24
- def is_sentinel_strategy(self) -> bool:
25
- """
26
- Returns True if the query processor associated with this strategy uses sentinel execution.
27
- """
28
- return self == ProcessingStrategyType.SENTINEL