PyPI - palimpzest - Versions diffs - 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

palimpzest/__init__.py +5 -0
palimpzest/constants.py +110 -43
palimpzest/core/__init__.py +0 -78
palimpzest/core/data/dataclasses.py +382 -44
palimpzest/core/elements/filters.py +7 -3
palimpzest/core/elements/index.py +70 -0
palimpzest/core/elements/records.py +33 -11
palimpzest/core/lib/fields.py +1 -0
palimpzest/core/lib/schemas.py +4 -3
palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
palimpzest/prompts/prompt_factory.py +44 -7
palimpzest/prompts/split_merge_prompts.py +56 -0
palimpzest/prompts/split_proposer_prompts.py +55 -0
palimpzest/query/execution/execution_strategy.py +435 -53
palimpzest/query/execution/execution_strategy_type.py +20 -0
palimpzest/query/execution/mab_execution_strategy.py +532 -0
palimpzest/query/execution/parallel_execution_strategy.py +143 -172
palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
palimpzest/query/generators/api_client_factory.py +31 -0
palimpzest/query/generators/generators.py +256 -76
palimpzest/query/operators/__init__.py +1 -2
palimpzest/query/operators/code_synthesis_convert.py +33 -18
palimpzest/query/operators/convert.py +30 -97
palimpzest/query/operators/critique_and_refine_convert.py +5 -6
palimpzest/query/operators/filter.py +7 -10
palimpzest/query/operators/logical.py +54 -10
palimpzest/query/operators/map.py +130 -0
palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
palimpzest/query/operators/physical.py +3 -12
palimpzest/query/operators/rag_convert.py +66 -18
palimpzest/query/operators/retrieve.py +230 -34
palimpzest/query/operators/scan.py +5 -2
palimpzest/query/operators/split_convert.py +169 -0
palimpzest/query/operators/token_reduction_convert.py +8 -14
palimpzest/query/optimizer/__init__.py +4 -16
palimpzest/query/optimizer/cost_model.py +73 -266
palimpzest/query/optimizer/optimizer.py +87 -58
palimpzest/query/optimizer/optimizer_strategy.py +18 -97
palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/primitives.py +5 -3
palimpzest/query/optimizer/rules.py +336 -172
palimpzest/query/optimizer/tasks.py +30 -100
palimpzest/query/processor/config.py +38 -22
palimpzest/query/processor/nosentinel_processor.py +16 -520
palimpzest/query/processor/processing_strategy_type.py +28 -0
palimpzest/query/processor/query_processor.py +38 -206
palimpzest/query/processor/query_processor_factory.py +117 -130
palimpzest/query/processor/sentinel_processor.py +90 -0
palimpzest/query/processor/streaming_processor.py +25 -32
palimpzest/sets.py +88 -41
palimpzest/utils/model_helpers.py +8 -7
palimpzest/utils/progress.py +368 -152
palimpzest/utils/token_reduction_helpers.py +1 -3
{palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
palimpzest-0.7.0.dist-info/RECORD +96 -0
{palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
palimpzest/query/processor/mab_sentinel_processor.py +0 -884
palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
palimpzest/utils/index_helpers.py +0 -6
palimpzest-0.6.3.dist-info/RECORD +0 -87
{palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
{palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0

palimpzest/query/operators/code_synthesis_convert.py CHANGED Viewed

@@ -7,7 +7,7 @@ from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstima
 from palimpzest.core.elements.records import DataRecord
 from palimpzest.prompts import ADVICEGEN_PROMPT, CODEGEN_PROMPT, EXAMPLE_PROMPT
 from palimpzest.query.generators.generators import code_ensemble_execution, generator_factory
-from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded, LLMConvertConventional
+from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded
 from palimpzest.utils.sandbox import API
 # TYPE DEFINITIONS
@@ -24,7 +24,7 @@ class CodeSynthesisConvert(LLMConvert):
         self,
         exemplar_generation_model: Model = Model.GPT_4o,
         code_synth_model: Model = Model.GPT_4o,
-        conventional_fallback_model: Model = Model.GPT_4o_MINI,
+        fallback_model: Model = Model.GPT_4o_MINI,
         *args,
         **kwargs,
     ):
@@ -34,7 +34,7 @@ class CodeSynthesisConvert(LLMConvert):
         # set models
         self.exemplar_generation_model = exemplar_generation_model
         self.code_synth_model = code_synth_model
-        self.conventional_fallback_model = conventional_fallback_model
+        self.fallback_model = fallback_model
         # initialize parameters
         self.field_to_code_ensemble = None
@@ -58,7 +58,7 @@ class CodeSynthesisConvert(LLMConvert):
         id_params = {
             "exemplar_generation_model": self.exemplar_generation_model.value,
             "code_synth_model": self.code_synth_model.value,
-            "conventional_fallback_model": self.conventional_fallback_model.value,
+            "fallback_model": self.fallback_model.value,
             **id_params,
         }
@@ -69,7 +69,7 @@ class CodeSynthesisConvert(LLMConvert):
         op_params = {
             "exemplar_generation_model": self.exemplar_generation_model,
             "code_synth_model": self.code_synth_model,
-            "conventional_fallback_model": self.conventional_fallback_model,
+            "fallback_model": self.fallback_model,
             **op_params,
         }
@@ -89,7 +89,7 @@ class CodeSynthesisConvert(LLMConvert):
         naive_op_cost_estimates.time_per_record = 1e-5
         naive_op_cost_estimates.time_per_record_lower_bound = 1e-5
         naive_op_cost_estimates.time_per_record_upper_bound = 1e-5
-        naive_op_cost_estimates.cost_per_record = 1e-6 # amortize code synth cost across records
+        naive_op_cost_estimates.cost_per_record = 1e-6  # amortize code synth cost across records
         naive_op_cost_estimates.cost_per_record_lower_bound = 1e-6
         naive_op_cost_estimates.cost_per_record_upper_bound = 1e-6
         naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * (GPT_4o_MODEL_CARD["code"] / 100.0) * 0.7
@@ -149,7 +149,9 @@ class CodeSynthesisConvert(LLMConvert):
         # set field_to_code_ensemble and code_synthesized to True
         return field_to_code_ensemble, generation_stats
-    def _bonded_query_fallback(self, candidate: DataRecord) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
+    def _bonded_query_fallback(
+        self, candidate: DataRecord
+    ) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
         fields_to_generate = self.get_fields_to_generate(candidate)
         projected_candidate = candidate.copy(include_bytes=False, project_cols=self.depends_on)
@@ -181,7 +183,9 @@ class CodeSynthesisConvert(LLMConvert):
         """Code synthesis is disallowed on image conversions, so this must be False."""
         return False
-    def convert(self, candidate: DataRecord, fields: list[str] | None = None) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
+    def convert(
+        self, candidate: DataRecord, fields: list[str] | None = None
+    ) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
         # get the dictionary fields for the candidate
         candidate_dict = candidate.to_dict(include_bytes=False, project_cols=self.depends_on)
@@ -220,18 +224,18 @@ class CodeSynthesisConvert(LLMConvert):
                 field_answers[field_name] = [answer]
             else:
-                # if there is a failure, run a conventional query
+                # if there is a failure, run a conventional llm convert query for the field
                 if self.verbose:
                     print(f"CODEGEN FALLING BACK TO CONVENTIONAL FOR FIELD {field_name}")
-                # execute the conventional convert
-                conventional_op = LLMConvertConventional(
+                # execute the conventional llm convert
+                convert_op = LLMConvertBonded(
                     input_schema=self.input_schema,
                     output_schema=self.output_schema,
-                    model=self.conventional_fallback_model,
+                    model=self.fallback_model,
                     prompt_strategy=self.prompt_strategy,
                 )
-                single_field_answers, single_field_stats = conventional_op.convert(candidate, [field_name])
+                single_field_answers, single_field_stats = convert_op.convert(candidate, [field_name])
                 # include code execution time in single_field_stats
                 single_field_stats.fn_call_duration_secs += exec_stats.fn_call_duration_secs
@@ -318,7 +322,7 @@ class CodeSynthesisConvertSingle(CodeSynthesisConvert):
         gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}
         # invoke the champion model to generate the code
-        pred, _, stats = self.code_champion_generator(candidate, None, **gen_kwargs)
+        pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
         ordered_keys = [f"```{language}", f"```{language.lower()}", "```"]
         code = None
         if not pred:
@@ -337,7 +341,9 @@ class CodeSynthesisConvertSingle(CodeSynthesisConvert):
         return code, stats
-    def _synthesize_field_code(self, candidate: DataRecord, api: API, output_field_name: str, num_exemplars: int = 1, *args, **kwargs):
+    def _synthesize_field_code(
+        self, candidate: DataRecord, api: API, output_field_name: str, num_exemplars: int = 1, *args, **kwargs
+    ):
         code, generation_stats = self._code_synth_single(
             candidate, api, output_field_name, exemplars=self.exemplars[:num_exemplars]
         )
@@ -354,7 +360,9 @@ class CodeSynthesisConvertExampleEnsemble(CodeSynthesisConvertSingle):
             return False
         return not self.code_synthesized
-    def _synthesize_field_code(self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, *args, **kwargs):
+    def _synthesize_field_code(
+        self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, *args, **kwargs
+    ):
         # creates an ensemble of `code_ensemble_num` synthesized functions; each of
         # which uses a different exemplar (modulo the # of exemplars) for its synthesis
         code_ensemble = {}
@@ -425,13 +433,20 @@ class CodeSynthesisConvertAdviceEnsemble(CodeSynthesisConvertSingle):
         # set prompt for generator
         gen_kwargs = {"prompt": prompt, "parse_answer": lambda text: text.split("answer:")[-1].split("---")[0].strip()}
-        pred, _, stats = self.code_champion_generator(candidate, None, **gen_kwargs)
+        pred, _, stats, _ = self.code_champion_generator(candidate, None, json_output=False, **gen_kwargs)
         advs = self._parse_multiple_outputs(pred, outputs=[f"Idea {i}" for i in range(1, limit + 1)])
         return advs, stats
     def _synthesize_field_code(
-        self, candidate: DataRecord, api: API, output_field_name: str, code_ensemble_num: int = 1, num_exemplars: int = 1, *args, **kwargs
+        self,
+        candidate: DataRecord,
+        api: API,
+        output_field_name: str,
+        code_ensemble_num: int = 1,
+        num_exemplars: int = 1,
+        *args,
+        **kwargs,
     ):
         # a more advanced approach in which advice is first solicited, and then
         # provided as context when synthesizing the code ensemble

palimpzest/query/operators/convert.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import time
 from abc import ABC, abstractmethod
-from typing import Any, Callable
+from typing import Callable
 from palimpzest.constants import (
     MODEL_CARDS,
@@ -15,13 +15,11 @@ from palimpzest.constants import (
 )
 from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.core.lib.fields import Field
 from palimpzest.query.generators.generators import generator_factory
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.utils.model_helpers import get_vision_models
-# TYPE DEFINITIONS
-FieldName = str
 class ConvertOp(PhysicalOperator, ABC):
     def __init__(
@@ -49,18 +47,13 @@ class ConvertOp(PhysicalOperator, ABC):
     def get_op_params(self):
         op_params = super().get_op_params()
-        op_params = {
-            "cardinality": self.cardinality,
-            "udf": self.udf,
-            "desc": self.desc,
-            **op_params
-        }
+        op_params = {"cardinality": self.cardinality, "udf": self.udf, "desc": self.desc, **op_params}
         return op_params
     def _create_data_records_from_field_answers(
         self,
-        field_answers: dict[FieldName, list[Any]],
+        field_answers: dict[str, list],
         candidate: DataRecord,
     ) -> list[DataRecord]:
         """
@@ -94,7 +87,7 @@ class ConvertOp(PhysicalOperator, ABC):
                 if field not in input_fields:
                     value = field_answers[field][idx] if idx < len(field_answers[field]) else None
                     setattr(dr, field, value)
             # append data record to list of output data records
             drs.append(dr)
@@ -103,7 +96,7 @@ class ConvertOp(PhysicalOperator, ABC):
     def _create_record_set(
         self,
         records: list[DataRecord],
-        fields: list[str],
+        field_names: list[str],
         generation_stats: GenerationStats,
         total_time: float,
         successful_convert: bool,
@@ -128,15 +121,17 @@ class ConvertOp(PhysicalOperator, ABC):
                 time_per_record=time_per_record,
                 cost_per_record=per_record_stats.cost_per_record,
                 model_name=self.get_model_name(),
-                answer={field_name: getattr(dr, field_name) for field_name in fields},
+                answer={field_name: getattr(dr, field_name) for field_name in field_names},
                 input_fields=self.input_schema.field_names(),
-                generated_fields=fields,
+                generated_fields=field_names,
                 total_input_tokens=per_record_stats.total_input_tokens,
                 total_output_tokens=per_record_stats.total_output_tokens,
                 total_input_cost=per_record_stats.total_input_cost,
                 total_output_cost=per_record_stats.total_output_cost,
                 llm_call_duration_secs=per_record_stats.llm_call_duration_secs,
                 fn_call_duration_secs=per_record_stats.fn_call_duration_secs,
+                total_llm_calls=per_record_stats.total_llm_calls,
+                total_embedding_llm_calls=per_record_stats.total_embedding_llm_calls,
                 failed_convert=(not successful_convert),
                 image_operation=self.is_image_conversion(),
                 op_details={k: str(v) for k, v in self.get_id_params().items()},
@@ -153,7 +148,7 @@ class ConvertOp(PhysicalOperator, ABC):
         pass
     @abstractmethod
-    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
+    def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
         """
         This abstract method will be implemented by subclasses of ConvertOp to process the input DataRecord
         and generate the value(s) for each of the specified fields. If the convert operator is a one-to-many
@@ -187,7 +182,8 @@ class ConvertOp(PhysicalOperator, ABC):
         # execute the convert
         field_answers: dict[str, list]
-        field_answers, generation_stats = self.convert(candidate=candidate, fields=fields_to_generate)
+        fields = {field: field_type for field, field_type in self.output_schema.field_map().items() if field in fields_to_generate}
+        field_answers, generation_stats = self.convert(candidate=candidate, fields=fields)
         assert all([field in field_answers for field in fields_to_generate]), "Not all fields were generated!"
         # replace any None values with an empty list; subclasses may override __call__ to change this behavior
@@ -199,7 +195,7 @@ class ConvertOp(PhysicalOperator, ABC):
         # construct and return DataRecordSet
         record_set = self._create_record_set(
             records=drs,
-            fields=fields_to_generate,
+            field_names=fields_to_generate,
             generation_stats=generation_stats,
             total_time=time.time() - start_time,
             successful_convert=successful_convert,
@@ -211,7 +207,7 @@ class ConvertOp(PhysicalOperator, ABC):
 class NonLLMConvert(ConvertOp):
     def __str__(self):
         op = super().__str__()
-        op += f"    UDF: {str(self.udf)}\n"
+        op += f"    UDF: {self.udf.__name__}\n"
         return op
     def is_image_conversion(self) -> bool:
@@ -239,7 +235,7 @@ class NonLLMConvert(ConvertOp):
             quality=1.0,
         )
-    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
+    def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
         # apply UDF to input record
         start_time = time.time()
         field_answers = {}
@@ -249,7 +245,9 @@ class NonLLMConvert(ConvertOp):
             if self.cardinality == Cardinality.ONE_TO_ONE:
                 # answer should be a dictionary
-                assert isinstance(answer, dict), "UDF must return a dictionary mapping each generated field to its value for one-to-one converts"
+                assert isinstance(answer, dict), (
+                    "UDF must return a dictionary mapping each generated field to its value for one-to-one converts"
+                )
                 # wrap each answer in a list
                 field_answers = {field_name: [answer[field_name]] for field_name in fields}
@@ -263,7 +261,7 @@ class NonLLMConvert(ConvertOp):
                         field_answers[field_name].append(answer_dict.get(field_name, None))
             if self.verbose:
-                print(f"{str(self.udf)}:\n{answer}")
+                print(f"{self.udf.__name__}:\n{answer}")
         except Exception as e:
             print(f"Error invoking user-defined function for convert: {e}")
@@ -279,6 +277,7 @@ class LLMConvert(ConvertOp):
     """
     This is the base class for convert operations which use an LLM to generate the output fields.
     """
     def __init__(
         self,
         model: Model,
@@ -337,9 +336,7 @@ class LLMConvert(ConvertOp):
         # get est. of conversion time per record from model card;
         # NOTE: model will only be None for code synthesis, which uses GPT-3.5 as fallback
         model_name = self.model.value if getattr(self, "model", None) is not None else Model.GPT_4o_MINI.value
-        model_conversion_time_per_record = (
-            MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens
-        )
+        model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens
         # get est. of conversion cost (in USD) per record from model card
         model_conversion_usd_per_record = (
@@ -362,74 +359,9 @@ class LLMConvert(ConvertOp):
         )
-class LLMConvertConventional(LLMConvert):
-    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
-        """
-        Update the cost per record and time per record estimates to account for the additional
-        LLM calls we incur by executing one query per-field.
-        """
-        # get naive cost estimates from LLMConvert
-        naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
-        # re-compute cost per record assuming we use fewer input tokens
-        est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS
-        est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
-        # increase estimates of the input and output tokens by the number of fields generated
-        # NOTE: this may over-estimate the number of fields that need to be generated
-        generate_field_names = []
-        for field_name in self.output_schema.field_names():
-            if field_name not in self.input_schema.field_names():
-                generate_field_names.append(field_name)
-        num_fields_to_generate = len(generate_field_names)
-        est_num_input_tokens *= num_fields_to_generate
-        est_num_output_tokens *= num_fields_to_generate
-        # get est. of conversion time per record from model card;
-        model_conversion_time_per_record = (
-            MODEL_CARDS[self.model.value]["seconds_per_output_token"] * est_num_output_tokens
-        )
-        # get est. of conversion cost (in USD) per record from model card
-        model_conversion_usd_per_record = (
-            MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
-            + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
-        )
-        # set refined estimate of time and cost per record
-        naive_op_cost_estimates.time_per_record = model_conversion_time_per_record
-        naive_op_cost_estimates.time_per_record_lower_bound = naive_op_cost_estimates.time_per_record
-        naive_op_cost_estimates.time_per_record_upper_bound = naive_op_cost_estimates.time_per_record
-        naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
-        naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
-        naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
-        return naive_op_cost_estimates
-    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
-        # get the set of input fields to use for the convert operation
-        input_fields = self.get_input_fields()
-        # construct kwargs for generation
-        gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
-        # generate outputs one field at a time
-        field_answers, generation_stats_lst = {}, []
-        for field in fields:
-            single_field_answers, _, single_field_stats = self.generator(candidate, [field], **gen_kwargs)
-            field_answers.update(single_field_answers)
-            generation_stats_lst.append(single_field_stats)
-        # aggregate generation stats into single object
-        generation_stats = sum(generation_stats_lst)
-        return field_answers, generation_stats
 class LLMConvertBonded(LLMConvert):
-    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
+    def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
         # get the set of input fields to use for the convert operation
         input_fields = self.get_input_fields()
@@ -437,13 +369,14 @@ class LLMConvertBonded(LLMConvert):
         gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
         # generate outputs for all fields in a single query
-        field_answers, _, generation_stats = self.generator(candidate, fields, **gen_kwargs) # TODO: guarantee negative output from generator is None
+        field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)
         # if there was an error for any field, execute a conventional query on that field
-        for field, answers in field_answers.items():
-            if answers is None:
-                single_field_answers, _, single_field_stats = self.generator(candidate, [field], **gen_kwargs)
-                field_answers.update(single_field_answers)
-                generation_stats += single_field_stats
+        if len(field_answers) > 1:
+            for field_name, answers in field_answers.items():
+                if answers is None:
+                    single_field_answers, _, single_field_stats, _ = self.generator(candidate, {field_name: fields[field_name]}, **gen_kwargs)
+                    field_answers.update(single_field_answers)
+                    generation_stats += single_field_stats
         return field_answers, generation_stats

palimpzest/query/operators/critique_and_refine_convert.py CHANGED Viewed

@@ -93,18 +93,17 @@ class CriticAndRefineConvert(LLMConvert):
         # NOTE: when I merge in the `abacus` branch, I will want to update this to reflect the changes I made to reasoning extraction
         # execute the initial model
         original_gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
-        field_answers, reasoning, original_gen_stats = self.generator(candidate, fields, **original_gen_kwargs)
-        original_output = f"REASONING: {reasoning}\nANSWER:{field_answers}\n"
-        original_messages = self.generator.get_messages()
+        field_answers, reasoning, original_gen_stats, original_messages = self.generator(candidate, fields, **original_gen_kwargs)
+        original_output = f"REASONING: {reasoning}\nANSWER: {field_answers}\n"
         # execute the critic model
         critic_gen_kwargs = {"original_output": original_output, "original_messages": original_messages, **original_gen_kwargs}
-        field_answers, reasoning, critic_gen_stats = self.critic_generator(candidate, fields, **critic_gen_kwargs)
-        critique_output = f"REASONING: {reasoning}\nANSWER:{field_answers}\n"
+        _, reasoning, critic_gen_stats, _ = self.critic_generator(candidate, fields, json_output=False, **critic_gen_kwargs)
+        critique_output = f"CRITIQUE: {reasoning}\n"
         # execute the refinement model
         refine_gen_kwargs = {"critique_output": critique_output, **critic_gen_kwargs}
-        field_answers, reasoning, refine_gen_stats = self.refine_generator(candidate, fields, **refine_gen_kwargs)
+        field_answers, reasoning, refine_gen_stats, _ = self.refine_generator(candidate, fields, **refine_gen_kwargs)
         # compute the total generation stats
         generation_stats = original_gen_stats + critic_gen_stats + refine_gen_stats

palimpzest/query/operators/filter.py CHANGED Viewed

@@ -15,6 +15,7 @@ from palimpzest.constants import (
 from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
 from palimpzest.core.elements.filters import Filter
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.core.lib.fields import BooleanField
 from palimpzest.query.generators.generators import generator_factory
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.utils.model_helpers import get_vision_models
@@ -96,6 +97,8 @@ class FilterOp(PhysicalOperator, ABC):
             total_output_cost=generation_stats.total_output_cost,
             llm_call_duration_secs=generation_stats.llm_call_duration_secs,
             fn_call_duration_secs=generation_stats.fn_call_duration_secs,
+            total_llm_calls=generation_stats.total_llm_calls,
+            total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
             answer=answer,
             passed_operator=passed_operator,
             image_operation=self.is_image_filter(),
@@ -248,14 +251,8 @@ class LLMFilter(FilterOp):
         # construct kwargs for generation
         gen_kwargs = {"project_cols": input_fields, "filter_condition": self.filter_obj.filter_condition}
-        # generate output
-        field_answers, _, generation_stats = self.generator(candidate, ["passed_operator"], **gen_kwargs)
+        # generate output; NOTE: BooleanField is used to indicate the output type; thus, the desc is not needed
+        fields = {"passed_operator": BooleanField(desc="")}
+        field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)
-        # compute whether the record passed the filter or not
-        passed_operator = False
-        if isinstance(field_answers["passed_operator"], str):
-            passed_operator = "true" in field_answers["passed_operator"].lower()
-        elif isinstance(field_answers["passed_operator"], bool):
-            passed_operator = field_answers["passed_operator"]
-        return {"passed_operator": passed_operator}, generation_stats
+        return field_answers, generation_stats

palimpzest/query/operators/logical.py CHANGED Viewed

@@ -24,6 +24,7 @@ class LogicalOperator:
     - GroupByAggregate (applies a group by on the Set)
     - Aggregate (applies an aggregation on the Set)
     - RetrieveScan (fetches documents from a provided input for a given query)
+    - Map (applies a function to each record in the Set without adding any new columns)
     Every logical operator must declare the get_logical_id_params() and get_logical_op_params() methods,
     which return dictionaries of parameters that are used to compute the logical op id and to implement
@@ -41,11 +42,9 @@ class LogicalOperator:
         # compute the fields generated by this logical operator
         input_field_names = self.input_schema.field_names() if self.input_schema is not None else []
-        self.generated_fields = sorted([
-            field_name
-            for field_name in self.output_schema.field_names()
-            if field_name not in input_field_names
-        ])
+        self.generated_fields = sorted(
+            [field_name for field_name in self.output_schema.field_names() if field_name not in input_field_names]
+        )
     def __str__(self) -> str:
         raise NotImplementedError("Abstract method")
@@ -76,7 +75,7 @@ class LogicalOperator:
         """
         Returns a dictionary mapping of logical operator parameters which may be used to
         implement a physical operator associated with this logical operation.
         NOTE: Should be overriden by subclasses to include class-specific parameters.
         """
         return {"input_schema": self.input_schema, "output_schema": self.output_schema}
@@ -101,6 +100,10 @@ class LogicalOperator:
         return self.logical_op_id
+    def get_generated_fields(self) -> list[str]:
+        """Returns the names of the fields generated by this logical operator."""
+        return self.generated_fields
     def __hash__(self):
         if not self.logical_op_id:
             raise ValueError("logical_op_id not set, unable to hash")
@@ -278,6 +281,7 @@ class FilteredScan(LogicalOperator):
         return logical_op_params
 class GroupByAggregate(LogicalOperator):
     def __init__(
         self,
@@ -314,6 +318,7 @@ class GroupByAggregate(LogicalOperator):
         return logical_op_params
 class LimitScan(LogicalOperator):
     def __init__(self, limit: int, target_cache_id: str | None = None, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -374,7 +379,7 @@ class RetrieveScan(LogicalOperator):
         index,
         search_func,
         search_attr,
-        output_attr,
+        output_attrs,
         k,
         target_cache_id: str = None,
         *args,
@@ -384,7 +389,7 @@ class RetrieveScan(LogicalOperator):
         self.index = index
         self.search_func = search_func
         self.search_attr = search_attr
-        self.output_attr = output_attr
+        self.output_attrs = output_attrs
         self.k = k
         self.target_cache_id = target_cache_id
@@ -398,7 +403,7 @@ class RetrieveScan(LogicalOperator):
         logical_id_params = super().get_logical_id_params()
         logical_id_params = {
             "search_attr": self.search_attr,
-            "output_attr": self.output_attr,
+            "output_attrs": self.output_attrs,
             "k": self.k,
             **logical_id_params,
         }
@@ -411,10 +416,49 @@ class RetrieveScan(LogicalOperator):
             "index": self.index,
             "search_func": self.search_func,
             "search_attr": self.search_attr,
-            "output_attr": self.output_attr,
+            "output_attrs": self.output_attrs,
             "k": self.k,
             "target_cache_id": self.target_cache_id,
             **logical_op_params,
         }
         return logical_op_params
+# TODO: (near-term) maybe we should try to fold this into ConvertScan, and make the internals of PZ
+#       amenable to a convert operator (with a UDF) that does not add new columns?
+class MapScan(LogicalOperator):
+    """A MapScan is a logical operator that applies a UDF to each input record without adding new columns."""
+    def __init__(
+        self,
+        udf: Callable | None = None,
+        target_cache_id: str | None = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.udf = udf
+        self.target_cache_id = target_cache_id
+    def __str__(self):
+        return f"MapScan({self.output_schema}, {self.udf.__name__})"
+    def get_logical_id_params(self) -> dict:
+        logical_id_params = super().get_logical_id_params()
+        logical_id_params = {
+            "udf": self.udf,
+            **logical_id_params,
+        }
+        return logical_id_params
+    def get_logical_op_params(self) -> dict:
+        logical_op_params = super().get_logical_op_params()
+        logical_op_params = {
+            "udf": self.udf,
+            "target_cache_id": self.target_cache_id,
+            **logical_op_params,
+        }
+        return logical_op_params

palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl