PyPI - palimpzest - Versions diffs - 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl - Mend

palimpzest 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

palimpzest/constants.py +38 -62
palimpzest/core/data/iter_dataset.py +5 -5
palimpzest/core/elements/groupbysig.py +1 -1
palimpzest/core/elements/records.py +91 -109
palimpzest/core/lib/schemas.py +23 -0
palimpzest/core/models.py +3 -3
palimpzest/prompts/__init__.py +2 -6
palimpzest/prompts/convert_prompts.py +10 -66
palimpzest/prompts/critique_and_refine_prompts.py +66 -0
palimpzest/prompts/filter_prompts.py +8 -46
palimpzest/prompts/join_prompts.py +12 -75
palimpzest/prompts/{moa_aggregator_convert_prompts.py → moa_aggregator_prompts.py} +51 -2
palimpzest/prompts/moa_proposer_prompts.py +87 -0
palimpzest/prompts/prompt_factory.py +351 -479
palimpzest/prompts/split_merge_prompts.py +51 -2
palimpzest/prompts/split_proposer_prompts.py +48 -16
palimpzest/prompts/utils.py +109 -0
palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
palimpzest/query/execution/execution_strategy.py +4 -4
palimpzest/query/execution/mab_execution_strategy.py +1 -2
palimpzest/query/execution/parallel_execution_strategy.py +3 -3
palimpzest/query/execution/single_threaded_execution_strategy.py +8 -8
palimpzest/query/generators/generators.py +31 -17
palimpzest/query/operators/__init__.py +15 -2
palimpzest/query/operators/aggregate.py +21 -19
palimpzest/query/operators/compute.py +6 -8
palimpzest/query/operators/convert.py +12 -37
palimpzest/query/operators/critique_and_refine.py +194 -0
palimpzest/query/operators/distinct.py +7 -7
palimpzest/query/operators/filter.py +13 -25
palimpzest/query/operators/join.py +321 -192
palimpzest/query/operators/limit.py +4 -4
palimpzest/query/operators/mixture_of_agents.py +246 -0
palimpzest/query/operators/physical.py +25 -2
palimpzest/query/operators/project.py +4 -4
palimpzest/query/operators/{rag_convert.py → rag.py} +202 -5
palimpzest/query/operators/retrieve.py +10 -9
palimpzest/query/operators/scan.py +9 -10
palimpzest/query/operators/search.py +18 -24
palimpzest/query/operators/split.py +321 -0
palimpzest/query/optimizer/__init__.py +12 -8
palimpzest/query/optimizer/optimizer.py +12 -10
palimpzest/query/optimizer/rules.py +201 -108
palimpzest/query/optimizer/tasks.py +18 -6
palimpzest/validator/validator.py +7 -9
{palimpzest-0.8.2.dist-info → palimpzest-0.8.4.dist-info}/METADATA +3 -8
palimpzest-0.8.4.dist-info/RECORD +95 -0
palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -216
palimpzest/prompts/moa_proposer_convert_prompts.py +0 -75
palimpzest/prompts/util_phrases.py +0 -19
palimpzest/query/operators/critique_and_refine_convert.py +0 -113
palimpzest/query/operators/mixture_of_agents_convert.py +0 -140
palimpzest/query/operators/split_convert.py +0 -170
palimpzest-0.8.2.dist-info/RECORD +0 -95
{palimpzest-0.8.2.dist-info → palimpzest-0.8.4.dist-info}/WHEEL +0 -0
{palimpzest-0.8.2.dist-info → palimpzest-0.8.4.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.8.2.dist-info → palimpzest-0.8.4.dist-info}/top_level.txt +0 -0

palimpzest/query/operators/aggregate.py CHANGED Viewed

@@ -113,18 +113,20 @@ class ApplyGroupByOp(AggregateOp):
         group_by_fields = self.group_by_sig.group_by_fields
         agg_fields = self.group_by_sig.get_agg_field_names()
         for g in agg_state:
-            dr = DataRecord.from_agg_parents(
-                schema=self.group_by_sig.output_schema(),
-                parent_records=candidates,
-            )
+            # build up data item
+            data_item = {}
             for i in range(0, len(g)):
                 k = g[i]
-                setattr(dr, group_by_fields[i], k)
+                data_item[group_by_fields[i]] = k
             vals = agg_state[g]
             for i in range(0, len(vals)):
                 v = ApplyGroupByOp.agg_final(self.group_by_sig.agg_funcs[i], vals[i])
-                setattr(dr, agg_fields[i], v)
+                data_item[agg_fields[i]] = v
+            # create new DataRecord
+            schema = self.group_by_sig.output_schema()
+            data_item = schema(**data_item)
+            dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)
             drs.append(dr)
         # create RecordOpStats objects
@@ -132,9 +134,9 @@ class ApplyGroupByOp(AggregateOp):
         record_op_stats_lst = []
         for dr in drs:
             record_op_stats = RecordOpStats(
-                record_id=dr.id,
-                record_parent_ids=dr.parent_ids,
-                record_source_indices=dr.source_indices,
+                record_id=dr._id,
+                record_parent_ids=dr._parent_ids,
+                record_source_indices=dr._source_indices,
                 record_state=dr.to_dict(include_bytes=False),
                 full_op_id=self.get_full_op_id(),
                 logical_op_id=self.logical_op_id,
@@ -197,7 +199,6 @@ class AverageAggregateOp(AggregateOp):
         # NOTE: right now we perform a check in the constructor which enforces that the input_schema
         #       has a single field which is numeric in nature; in the future we may want to have a
         #       cleaner way of computing the value (rather than `float(list(candidate...))` below)
-        dr = DataRecord.from_agg_parents(schema=Average, parent_records=candidates)
         summation, total = 0, 0
         for candidate in candidates:
             try:
@@ -205,13 +206,14 @@ class AverageAggregateOp(AggregateOp):
                 total += 1
             except Exception:
                 pass
-        dr.average = summation / total
+        data_item = Average(average=summation / total)
+        dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)
         # create RecordOpStats object
         record_op_stats = RecordOpStats(
-            record_id=dr.id,
-            record_parent_ids=dr.parent_ids,
-            record_source_indices=dr.source_indices,
+            record_id=dr._id,
+            record_parent_ids=dr._parent_ids,
+            record_source_indices=dr._source_indices,
             record_state=dr.to_dict(include_bytes=False),
             full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
@@ -260,14 +262,14 @@ class CountAggregateOp(AggregateOp):
         start_time = time.time()
         # create new DataRecord
-        dr = DataRecord.from_agg_parents(schema=Count, parent_records=candidates)
-        dr.count = len(candidates)
+        data_item = Count(count=len(candidates))
+        dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)
         # create RecordOpStats object
         record_op_stats = RecordOpStats(
-            record_id=dr.id,
-            record_parent_ids=dr.parent_ids,
-            record_source_indices=dr.source_indices,
+            record_id=dr._id,
+            record_parent_ids=dr._parent_ids,
+            record_source_indices=dr._source_indices,
             record_state=dr.to_dict(include_bytes=False),
             full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,

palimpzest/query/operators/compute.py CHANGED Viewed

@@ -93,17 +93,15 @@ class SmolAgentsCompute(PhysicalOperator):
         Given an input DataRecord and a determination of whether it passed the filter or not,
         construct the resulting RecordSet.
         """
-        # create new DataRecord and set passed_operator attribute
-        dr = DataRecord.from_parent(self.output_schema, parent_record=candidate)
-        for field in self.output_schema.model_fields:
-            if field in answer:
-                dr[field] = answer[field]
+        # create new DataRecord
+        data_item = {field: answer[field] for field in self.output_schema.model_fields if field in answer}
+        dr = DataRecord.from_parent(self.output_schema, data_item, parent_record=candidate)
         # create RecordOpStats object
         record_op_stats = RecordOpStats(
-            record_id=dr.id,
-            record_parent_ids=dr.parent_ids,
-            record_source_indices=dr.source_indices,
+            record_id=dr._id,
+            record_parent_ids=dr._parent_ids,
+            record_source_indices=dr._source_indices,
             record_state=dr.to_dict(include_bytes=False),
             full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,

palimpzest/query/operators/convert.py CHANGED Viewed

@@ -74,25 +74,14 @@ class ConvertOp(PhysicalOperator, ABC):
         drs = []
         for idx in range(max(n_records, 1)):
-            # initialize record with the correct output schema, parent record, and cardinality idx
-            dr = DataRecord.from_parent(self.output_schema, parent_record=candidate, cardinality_idx=idx)
-            # copy all fields from the input record
-            # NOTE: this means that records processed by PZ converts will inherit all pre-computed fields
-            #       in an incremental fashion; this is a design choice which may be revisited in the future
-            for field in candidate.get_field_names():
-                setattr(dr, field, getattr(candidate, field))
-            # get input field names and output field names
-            input_fields = list(self.input_schema.model_fields)
-            output_fields = list(self.output_schema.model_fields)
             # parse newly generated fields from the field_answers dictionary for this field; if the list
             # of generated values is shorter than the number of records, we fill in with None
-            for field in output_fields:
-                if field not in input_fields:
-                    value = field_answers[field][idx] if idx < len(field_answers[field]) else None
-                    setattr(dr, field, value)
+            data_item = {}
+            for field in self.generated_fields:
+                data_item[field] = field_answers[field][idx] if idx < len(field_answers[field]) else None
+            # initialize record with the correct output schema, data_item, parent record, and cardinality idx
+            dr = DataRecord.from_parent(self.output_schema, data_item, parent_record=candidate, cardinality_idx=idx)
             # append data record to list of output data records
             drs.append(dr)
@@ -117,9 +106,9 @@ class ConvertOp(PhysicalOperator, ABC):
         # create the RecordOpStats objects for each output record
         record_op_stats_lst = [
             RecordOpStats(
-                record_id=dr.id,
-                record_parent_ids=dr.parent_ids,
-                record_source_indices=dr.source_indices,
+                record_id=dr._id,
+                record_parent_ids=dr._parent_ids,
+                record_source_indices=dr._source_indices,
                 record_state=dr.to_dict(include_bytes=False),
                 full_op_id=self.get_full_op_id(),
                 logical_op_id=self.logical_op_id,
@@ -127,7 +116,7 @@ class ConvertOp(PhysicalOperator, ABC):
                 time_per_record=time_per_record,
                 cost_per_record=per_record_stats.cost_per_record,
                 model_name=self.get_model_name(),
-                answer={field_name: getattr(dr, field_name) for field_name in field_names},
+                answer={field_name: getattr(dr, field_name, None) for field_name in field_names},
                 input_fields=list(self.input_schema.model_fields),
                 generated_fields=field_names,
                 total_input_tokens=per_record_stats.total_input_tokens,
@@ -139,7 +128,6 @@ class ConvertOp(PhysicalOperator, ABC):
                 total_llm_calls=per_record_stats.total_llm_calls,
                 total_embedding_llm_calls=per_record_stats.total_embedding_llm_calls,
                 failed_convert=(not successful_convert),
-                image_operation=self.is_image_conversion(),
                 op_details={k: str(v) for k, v in self.get_id_params().items()},
             )
             for dr in records
@@ -148,11 +136,6 @@ class ConvertOp(PhysicalOperator, ABC):
         # create and return the DataRecordSet
         return DataRecordSet(records, record_op_stats_lst)
-    @abstractmethod
-    def is_image_conversion(self) -> bool:
-        """Return True if the convert operation processes an image, False otherwise."""
-        pass
     @abstractmethod
     def convert(self, candidate: DataRecord, fields: dict[str, FieldInfo]) -> tuple[dict[str, list], GenerationStats]:
         """
@@ -216,11 +199,6 @@ class NonLLMConvert(ConvertOp):
         op += f"    UDF: {self.udf.__name__}\n"
         return op
-    def is_image_conversion(self) -> bool:
-        # NOTE: even if the UDF is processing an image, we do not consider this an image conversion
-        # (the output of this function will be used by the CostModel in a way which does not apply to UDFs)
-        return False
     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
         """
         Compute naive cost estimates for the NonLLMConvert operation. These estimates assume
@@ -287,7 +265,7 @@ class LLMConvert(ConvertOp):
     def __init__(
         self,
         model: Model,
-        prompt_strategy: PromptStrategy = PromptStrategy.COT_QA,
+        prompt_strategy: PromptStrategy = PromptStrategy.MAP,
         reasoning_effort: str | None = None,
         *args,
         **kwargs,
@@ -330,9 +308,6 @@ class LLMConvert(ConvertOp):
     def get_model_name(self):
         return None if self.model is None else self.model.value
-    def is_image_conversion(self) -> bool:
-        return self.prompt_strategy.is_image_prompt()
     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
         """
         Compute naive cost estimates for the LLMConvert operation. Implicitly, these estimates
@@ -350,7 +325,7 @@ class LLMConvert(ConvertOp):
         # get est. of conversion cost (in USD) per record from model card
         usd_per_input_token = MODEL_CARDS[model_name].get("usd_per_input_token")
-        if getattr(self, "prompt_strategy", None) is not None and self.prompt_strategy.is_audio_prompt():
+        if getattr(self, "prompt_strategy", None) is not None and self.is_audio_op():
             usd_per_input_token = MODEL_CARDS[model_name]["usd_per_audio_input_token"]
         model_conversion_usd_per_record = (

palimpzest/query/operators/critique_and_refine.py ADDED Viewed

@@ -0,0 +1,194 @@
+from __future__ import annotations
+from typing import Any
+from pydantic.fields import FieldInfo
+from palimpzest.constants import MODEL_CARDS, Cardinality, Model, PromptStrategy
+from palimpzest.core.elements.records import DataRecord
+from palimpzest.core.models import GenerationStats, OperatorCostEstimates
+from palimpzest.query.generators.generators import Generator
+from palimpzest.query.operators.convert import LLMConvert
+from palimpzest.query.operators.filter import LLMFilter
+# TYPE DEFINITIONS
+FieldName = str
+class CritiqueAndRefineConvert(LLMConvert):
+    def __init__(
+        self,
+        critic_model: Model,
+        refine_model: Model,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.critic_model = critic_model
+        self.refine_model = refine_model
+        # create generators
+        self.critic_generator = Generator(self.critic_model, PromptStrategy.MAP_CRITIC, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
+        self.refine_generator = Generator(self.refine_model, PromptStrategy.MAP_REFINE, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
+    def __str__(self):
+        op = super().__str__()
+        op += f"    Critic Model: {self.critic_model}\n"
+        op += f"    Refine Model: {self.refine_model}\n"
+        return op
+    def get_id_params(self):
+        id_params = super().get_id_params()
+        id_params = {
+            "critic_model": self.critic_model.value,
+            "refine_model": self.refine_model.value,
+            **id_params,
+        }
+        return id_params
+    def get_op_params(self):
+        op_params = super().get_op_params()
+        op_params = {
+            "critic_model": self.critic_model,
+            "refine_model": self.refine_model,
+            **op_params,
+        }
+        return op_params
+    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
+        """
+        Currently, we are invoking `self.model`, then critiquing its output with `self.critic_model`, and
+        finally refining the output with `self.refine_model`. Thus, we roughly expect to incur the cost
+        and time of three LLMConverts. In practice, this naive quality estimate will be overwritten by the
+        CostModel's estimate once it executes a few instances of the operator.
+        """
+        # get naive cost estimates for first LLM call and multiply by 3 for now;
+        # of course we should sum individual estimates for each model, but this is a rough estimate
+        # and in practice we will need to revamp our naive cost estimates in the near future
+        naive_op_cost_estimates = 3 * super().naive_cost_estimates(source_op_cost_estimates)
+        # for naive setting, estimate quality as quality of refine model
+        model_quality = MODEL_CARDS[self.refine_model.value]["overall"] / 100.0
+        naive_op_cost_estimates.quality = model_quality
+        naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
+        naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
+        return naive_op_cost_estimates
+    def convert(self, candidate: DataRecord, fields: dict[str, FieldInfo]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
+        # get input fields
+        input_fields = self.get_input_fields()
+        # NOTE: when I merge in the `abacus` branch, I will want to update this to reflect the changes I made to reasoning extraction
+        # execute the initial model
+        original_gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
+        field_answers, reasoning, original_gen_stats, original_messages = self.generator(candidate, fields, **original_gen_kwargs)
+        original_output = f"REASONING: {reasoning}\nANSWER: {field_answers}\n"
+        # execute the critic model
+        critic_gen_kwargs = {"original_output": original_output, "original_messages": original_messages, **original_gen_kwargs}
+        _, reasoning, critic_gen_stats, _ = self.critic_generator(candidate, fields, json_output=False, **critic_gen_kwargs)
+        critique_output = f"CRITIQUE: {reasoning}\n"
+        # execute the refinement model
+        refine_gen_kwargs = {"critique_output": critique_output, **critic_gen_kwargs}
+        field_answers, reasoning, refine_gen_stats, _ = self.refine_generator(candidate, fields, **refine_gen_kwargs)
+        # compute the total generation stats
+        generation_stats = original_gen_stats + critic_gen_stats + refine_gen_stats
+        return field_answers, generation_stats
+class CritiqueAndRefineFilter(LLMFilter):
+    def __init__(
+        self,
+        critic_model: Model,
+        refine_model: Model,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.critic_model = critic_model
+        self.refine_model = refine_model
+        # create generators
+        self.critic_generator = Generator(self.critic_model, PromptStrategy.FILTER_CRITIC, self.reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
+        self.refine_generator = Generator(self.refine_model, PromptStrategy.FILTER_REFINE, self.reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
+    def __str__(self):
+        op = super().__str__()
+        op += f"    Critic Model: {self.critic_model}\n"
+        op += f"    Refine Model: {self.refine_model}\n"
+        return op
+    def get_id_params(self):
+        id_params = super().get_id_params()
+        id_params = {
+            "critic_model": self.critic_model.value,
+            "refine_model": self.refine_model.value,
+            **id_params,
+        }
+        return id_params
+    def get_op_params(self):
+        op_params = super().get_op_params()
+        op_params = {
+            "critic_model": self.critic_model,
+            "refine_model": self.refine_model,
+            **op_params,
+        }
+        return op_params
+    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
+        """
+        Currently, we are invoking `self.model`, then critiquing its output with `self.critic_model`, and
+        finally refining the output with `self.refine_model`. Thus, we roughly expect to incur the cost
+        and time of three LLMFilters. In practice, this naive quality estimate will be overwritten by the
+        CostModel's estimate once it executes a few instances of the operator.
+        """
+        # get naive cost estimates for first LLM call and multiply by 3 for now;
+        # of course we should sum individual estimates for each model, but this is a rough estimate
+        # and in practice we will need to revamp our naive cost estimates in the near future
+        naive_op_cost_estimates = 3 * super().naive_cost_estimates(source_op_cost_estimates)
+        # for naive setting, estimate quality as quality of refine model
+        model_quality = MODEL_CARDS[self.refine_model.value]["overall"] / 100.0
+        naive_op_cost_estimates.quality = model_quality
+        naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
+        naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
+        return naive_op_cost_estimates
+    def filter(self, candidate: DataRecord) -> tuple[dict[str, bool], GenerationStats]:
+        # get input fields
+        input_fields = self.get_input_fields()
+        # construct output fields
+        fields = {"passed_operator": FieldInfo(annotation=bool, description="Whether the record passed the filter operation")}
+        # NOTE: when I merge in the `abacus` branch, I will want to update this to reflect the changes I made to reasoning extraction
+        # execute the initial model
+        original_gen_kwargs = {"project_cols": input_fields, "filter_condition": self.filter_obj.filter_condition}
+        field_answers, reasoning, original_gen_stats, original_messages = self.generator(candidate, fields, **original_gen_kwargs)
+        original_output = f"REASONING: {reasoning}\nANSWER: {str(field_answers['passed_operator']).upper()}\n"
+        # execute the critic model
+        critic_gen_kwargs = {"original_output": original_output, "original_messages": original_messages, **original_gen_kwargs}
+        _, reasoning, critic_gen_stats, _ = self.critic_generator(candidate, fields, json_output=False, **critic_gen_kwargs)
+        critique_output = f"CRITIQUE: {reasoning}\n"
+        # execute the refinement model
+        refine_gen_kwargs = {"critique_output": critique_output, **critic_gen_kwargs}
+        field_answers, reasoning, refine_gen_stats, _ = self.refine_generator(candidate, fields, **refine_gen_kwargs)
+        # compute the total generation stats
+        generation_stats = original_gen_stats + critic_gen_stats + refine_gen_stats
+        return field_answers, generation_stats

palimpzest/query/operators/distinct.py CHANGED Viewed

@@ -35,27 +35,27 @@ class DistinctOp(PhysicalOperator):
     def __call__(self, candidate: DataRecord) -> DataRecordSet:
         # create new DataRecord
-        dr = DataRecord.from_parent(schema=candidate.schema, parent_record=candidate)
+        dr = DataRecord.from_parent(schema=candidate.schema, data_item={}, parent_record=candidate)
         # output record only if it has not been seen before
         record_str = dr.to_json_str(project_cols=self.distinct_cols, bytes_to_str=True, sorted=True)
         record_hash = f"{hash(record_str)}"
-        dr.passed_operator = record_hash not in self._distinct_seen
-        if dr.passed_operator:
+        dr._passed_operator = record_hash not in self._distinct_seen
+        if dr._passed_operator:
             self._distinct_seen.add(record_hash)
         # create RecordOpStats object
         record_op_stats = RecordOpStats(
-            record_id=dr.id,
-            record_parent_ids=dr.parent_ids,
-            record_source_indices=dr.source_indices,
+            record_id=dr._id,
+            record_parent_ids=dr._parent_ids,
+            record_source_indices=dr._source_indices,
             record_state=dr.to_dict(include_bytes=False),
             full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=0.0,
             cost_per_record=0.0,
-            passed_operator=dr.passed_operator,
+            passed_operator=dr._passed_operator,
             op_details={k: str(v) for k, v in self.get_id_params().items()},
         )

palimpzest/query/operators/filter.py CHANGED Viewed

@@ -41,11 +41,6 @@ class FilterOp(PhysicalOperator, ABC):
         op_params = super().get_op_params()
         return {"filter": self.filter_obj, "desc": self.desc, **op_params}
-    @abstractmethod
-    def is_image_filter(self) -> bool:
-        """Return True if the filter operation processes an image, False otherwise."""
-        pass
     @abstractmethod
     def filter(self, candidate: DataRecord) -> tuple[dict[str, bool], GenerationStats]:
         """
@@ -76,14 +71,14 @@ class FilterOp(PhysicalOperator, ABC):
         construct the resulting RecordSet.
         """
         # create new DataRecord and set passed_operator attribute
-        dr = DataRecord.from_parent(candidate.schema, parent_record=candidate)
-        dr.passed_operator = passed_operator
+        dr = DataRecord.from_parent(schema=candidate.schema, data_item={}, parent_record=candidate)
+        dr._passed_operator = passed_operator
         # create RecordOpStats object
         record_op_stats = RecordOpStats(
-            record_id=dr.id,
-            record_parent_ids=dr.parent_ids,
-            record_source_indices=dr.source_indices,
+            record_id=dr._id,
+            record_parent_ids=dr._parent_ids,
+            record_source_indices=dr._source_indices,
             record_state=dr.to_dict(include_bytes=False),
             full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
@@ -102,7 +97,6 @@ class FilterOp(PhysicalOperator, ABC):
             total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
             answer=answer,
             passed_operator=passed_operator,
-            image_operation=self.is_image_filter(),
             op_details={k: str(v) for k, v in self.get_id_params().items()},
         )
@@ -127,10 +121,6 @@ class FilterOp(PhysicalOperator, ABC):
 class NonLLMFilter(FilterOp):
-    def is_image_filter(self) -> bool:
-        # NOTE: even if the UDF is processing an image, we do not consider this an image filter
-        # (the output of this function will be used by the CostModel in a way which does not apply to UDFs)
-        return False
     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates):
         # estimate output cardinality using a constant assumption of the filter selectivity
@@ -174,7 +164,7 @@ class LLMFilter(FilterOp):
     def __init__(
         self,
         model: Model,
-        prompt_strategy: PromptStrategy = PromptStrategy.COT_BOOL,
+        prompt_strategy: PromptStrategy = PromptStrategy.FILTER,
         reasoning_effort: str | None = None,
         *args,
         **kwargs,
@@ -183,13 +173,14 @@ class LLMFilter(FilterOp):
         self.model = model
         self.prompt_strategy = prompt_strategy
         self.reasoning_effort = reasoning_effort
-        self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
+        if model is not None:
+            self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
     def get_id_params(self):
         id_params = super().get_id_params()
         id_params = {
-            "model": self.model.value,
-            "prompt_strategy": self.prompt_strategy.value,
+            "model": None if self.model is None else self.model.value,
+            "prompt_strategy": None if self.prompt_strategy is None else self.prompt_strategy.value,
             "reasoning_effort": self.reasoning_effort,
             **id_params,
         }
@@ -208,15 +199,12 @@ class LLMFilter(FilterOp):
         return op_params
     def get_model_name(self):
-        return self.model.value
-    def is_image_filter(self) -> bool:
-        return self.prompt_strategy is PromptStrategy.COT_BOOL_IMAGE
+        return None if self.model is None else self.model.value
     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates):
         # estimate number of input tokens from source
         est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS
-        if self.is_image_filter():
+        if self.is_image_op():
             est_num_input_tokens = 765 / 10  # 1024x1024 image is 765 tokens
         # NOTE: the output often generates an entire reasoning sentence, thus the true value may be higher
@@ -232,7 +220,7 @@ class LLMFilter(FilterOp):
         # get est. of conversion cost (in USD) per record from model card
         usd_per_input_token = (
             MODEL_CARDS[self.model.value]["usd_per_audio_input_token"]
-            if self.prompt_strategy.is_audio_prompt()
+            if self.is_audio_op()
             else MODEL_CARDS[self.model.value]["usd_per_input_token"]
         )
         model_conversion_usd_per_record = (

palimpzest 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl

palimpzest 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl