palimpzest 0.8.2__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. palimpzest/constants.py +38 -62
  2. palimpzest/core/data/iter_dataset.py +5 -5
  3. palimpzest/core/elements/groupbysig.py +1 -1
  4. palimpzest/core/elements/records.py +91 -109
  5. palimpzest/core/lib/schemas.py +23 -0
  6. palimpzest/core/models.py +3 -3
  7. palimpzest/prompts/__init__.py +2 -6
  8. palimpzest/prompts/convert_prompts.py +10 -66
  9. palimpzest/prompts/critique_and_refine_prompts.py +66 -0
  10. palimpzest/prompts/filter_prompts.py +8 -46
  11. palimpzest/prompts/join_prompts.py +12 -75
  12. palimpzest/prompts/{moa_aggregator_convert_prompts.py → moa_aggregator_prompts.py} +51 -2
  13. palimpzest/prompts/moa_proposer_prompts.py +87 -0
  14. palimpzest/prompts/prompt_factory.py +351 -479
  15. palimpzest/prompts/split_merge_prompts.py +51 -2
  16. palimpzest/prompts/split_proposer_prompts.py +48 -16
  17. palimpzest/prompts/utils.py +109 -0
  18. palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  19. palimpzest/query/execution/execution_strategy.py +4 -4
  20. palimpzest/query/execution/mab_execution_strategy.py +1 -2
  21. palimpzest/query/execution/parallel_execution_strategy.py +3 -3
  22. palimpzest/query/execution/single_threaded_execution_strategy.py +8 -8
  23. palimpzest/query/generators/generators.py +31 -17
  24. palimpzest/query/operators/__init__.py +15 -2
  25. palimpzest/query/operators/aggregate.py +21 -19
  26. palimpzest/query/operators/compute.py +6 -8
  27. palimpzest/query/operators/convert.py +12 -37
  28. palimpzest/query/operators/critique_and_refine.py +194 -0
  29. palimpzest/query/operators/distinct.py +7 -7
  30. palimpzest/query/operators/filter.py +13 -25
  31. palimpzest/query/operators/join.py +321 -192
  32. palimpzest/query/operators/limit.py +4 -4
  33. palimpzest/query/operators/mixture_of_agents.py +246 -0
  34. palimpzest/query/operators/physical.py +25 -2
  35. palimpzest/query/operators/project.py +4 -4
  36. palimpzest/query/operators/{rag_convert.py → rag.py} +202 -5
  37. palimpzest/query/operators/retrieve.py +10 -9
  38. palimpzest/query/operators/scan.py +9 -10
  39. palimpzest/query/operators/search.py +18 -24
  40. palimpzest/query/operators/split.py +321 -0
  41. palimpzest/query/optimizer/__init__.py +12 -8
  42. palimpzest/query/optimizer/optimizer.py +12 -10
  43. palimpzest/query/optimizer/rules.py +201 -108
  44. palimpzest/query/optimizer/tasks.py +18 -6
  45. palimpzest/validator/validator.py +7 -9
  46. {palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/METADATA +3 -8
  47. palimpzest-0.8.3.dist-info/RECORD +95 -0
  48. palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -216
  49. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -75
  50. palimpzest/prompts/util_phrases.py +0 -19
  51. palimpzest/query/operators/critique_and_refine_convert.py +0 -113
  52. palimpzest/query/operators/mixture_of_agents_convert.py +0 -140
  53. palimpzest/query/operators/split_convert.py +0 -170
  54. palimpzest-0.8.2.dist-info/RECORD +0 -95
  55. {palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/WHEEL +0 -0
  56. {palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/licenses/LICENSE +0 -0
  57. {palimpzest-0.8.2.dist-info → palimpzest-0.8.3.dist-info}/top_level.txt +0 -0
@@ -36,13 +36,13 @@ class LimitScanOp(PhysicalOperator):
36
36
  # NOTE: execution layer ensures that no more than self.limit
37
37
  # records are returned to the user by this operator.
38
38
  # create new DataRecord
39
- dr = DataRecord.from_parent(schema=candidate.schema, parent_record=candidate)
39
+ dr = DataRecord.from_parent(schema=candidate.schema, data_item={}, parent_record=candidate)
40
40
 
41
41
  # create RecordOpStats object
42
42
  record_op_stats = RecordOpStats(
43
- record_id=dr.id,
44
- record_parent_ids=dr.parent_ids,
45
- record_source_indices=dr.source_indices,
43
+ record_id=dr._id,
44
+ record_parent_ids=dr._parent_ids,
45
+ record_source_indices=dr._source_indices,
46
46
  record_state=dr.to_dict(include_bytes=False),
47
47
  full_op_id=self.get_full_op_id(),
48
48
  logical_op_id=self.logical_op_id,
@@ -0,0 +1,246 @@
1
+ from __future__ import annotations
2
+
3
+ from pydantic.fields import FieldInfo
4
+
5
+ from palimpzest.constants import MODEL_CARDS, Cardinality, Model, PromptStrategy
6
+ from palimpzest.core.elements.records import DataRecord
7
+ from palimpzest.core.models import GenerationStats, OperatorCostEstimates
8
+ from palimpzest.query.generators.generators import Generator
9
+ from palimpzest.query.operators.convert import LLMConvert
10
+ from palimpzest.query.operators.filter import LLMFilter
11
+
12
+ # TYPE DEFINITIONS
13
+ FieldName = str
14
+
15
+
16
+ class MixtureOfAgentsConvert(LLMConvert):
17
+
18
+ def __init__(
19
+ self,
20
+ proposer_models: list[Model],
21
+ temperatures: list[float],
22
+ aggregator_model: Model,
23
+ *args,
24
+ **kwargs,
25
+ ):
26
+ kwargs["model"] = None
27
+ kwargs["prompt_strategy"] = None
28
+ super().__init__(*args, **kwargs)
29
+ sorted_proposers, sorted_temps = zip(*[(m, t) for m, t in sorted(zip(proposer_models, temperatures), key=lambda pair: pair[0])])
30
+ self.proposer_models = list(sorted_proposers)
31
+ self.temperatures = list(sorted_temps)
32
+ self.aggregator_model = aggregator_model
33
+
34
+ # create generators
35
+ self.proposer_generators = [
36
+ Generator(model, PromptStrategy.MAP_MOA_PROPOSER, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
37
+ for model in proposer_models
38
+ ]
39
+ self.aggregator_generator = Generator(aggregator_model, PromptStrategy.MAP_MOA_AGG, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
40
+
41
+ def __str__(self):
42
+ op = super().__str__()
43
+ op += f" Proposer Models: {self.proposer_models}\n"
44
+ op += f" Temperatures: {self.temperatures}\n"
45
+ op += f" Aggregator Model: {self.aggregator_model}\n"
46
+ return op
47
+
48
+ def get_id_params(self):
49
+ id_params = super().get_id_params()
50
+ id_params = {
51
+ "proposer_models": [model.value for model in self.proposer_models],
52
+ "temperatures": self.temperatures,
53
+ "aggregator_model": self.aggregator_model.value,
54
+ **id_params,
55
+ }
56
+
57
+ return id_params
58
+
59
+ def get_op_params(self):
60
+ op_params = super().get_op_params()
61
+ op_params = {
62
+ "proposer_models": self.proposer_models,
63
+ "temperatures": self.temperatures,
64
+ "aggregator_model": self.aggregator_model,
65
+ **op_params,
66
+ }
67
+
68
+ return op_params
69
+
70
+ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
71
+ """
72
+ Currently, we are using multiple proposer models with different temperatures to synthesize
73
+ answers, which are then aggregated and summarized by a single aggregator model. Thus, we
74
+ roughly expect to incur the cost and time of an LLMConvert * (len(proposer_models) + 1).
75
+ In practice, this naive quality estimate will be overwritten by the CostModel's estimate
76
+ once it executes a few instances of the operator.
77
+ """
78
+ # temporarily set self.model so that super().naive_cost_estimates(...) can compute an estimate
79
+ self.model = self.proposer_models[0]
80
+
81
+ # get naive cost estimates for single LLM call and scale it by number of LLMs used in MoA
82
+ naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
83
+ naive_op_cost_estimates.time_per_record *= (len(self.proposer_models) + 1)
84
+ naive_op_cost_estimates.time_per_record_lower_bound = naive_op_cost_estimates.time_per_record
85
+ naive_op_cost_estimates.time_per_record_upper_bound = naive_op_cost_estimates.time_per_record
86
+ naive_op_cost_estimates.cost_per_record *= (len(self.proposer_models) + 1)
87
+ naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
88
+ naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
89
+
90
+ # for naive setting, estimate quality as mean of all model qualities
91
+ model_qualities = [
92
+ MODEL_CARDS[model.value]["overall"] / 100.0
93
+ for model in self.proposer_models + [self.aggregator_model]
94
+ ]
95
+ naive_op_cost_estimates.quality = sum(model_qualities)/(len(self.proposer_models) + 1)
96
+ naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
97
+ naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
98
+
99
+ # reset self.model to be None
100
+ self.model = None
101
+
102
+ return naive_op_cost_estimates
103
+
104
+ def convert(self, candidate: DataRecord, fields: dict[str, FieldInfo]) -> tuple[dict[str, list], GenerationStats]:
105
+ # get input fields
106
+ input_fields = self.get_input_fields()
107
+
108
+ # execute generator models in sequence
109
+ proposer_model_final_answers, proposer_model_generation_stats = [], []
110
+ for proposer_generator, temperature in zip(self.proposer_generators, self.temperatures):
111
+ gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema, "temperature": temperature}
112
+ _, reasoning, generation_stats, _ = proposer_generator(candidate, fields, json_output=False, **gen_kwargs)
113
+ proposer_text = f"REASONING: {reasoning}\n"
114
+ proposer_model_final_answers.append(proposer_text)
115
+ proposer_model_generation_stats.append(generation_stats)
116
+
117
+ # call the aggregator
118
+ gen_kwargs = {
119
+ "project_cols": input_fields,
120
+ "output_schema": self.output_schema,
121
+ "model_responses": proposer_model_final_answers,
122
+ }
123
+ field_answers, _, aggregator_gen_stats, _ = self.aggregator_generator(candidate, fields, **gen_kwargs)
124
+
125
+ # compute the total generation stats
126
+ generation_stats = sum(proposer_model_generation_stats) + aggregator_gen_stats
127
+
128
+ return field_answers, generation_stats
129
+
130
+
131
+ class MixtureOfAgentsFilter(LLMFilter):
132
+
133
+ def __init__(
134
+ self,
135
+ proposer_models: list[Model],
136
+ temperatures: list[float],
137
+ aggregator_model: Model,
138
+ *args,
139
+ **kwargs,
140
+ ):
141
+ kwargs["model"] = None
142
+ kwargs["prompt_strategy"] = None
143
+ super().__init__(*args, **kwargs)
144
+ sorted_proposers, sorted_temps = zip(*[(m, t) for m, t in sorted(zip(proposer_models, temperatures), key=lambda pair: pair[0])])
145
+ self.proposer_models = list(sorted_proposers)
146
+ self.temperatures = list(sorted_temps)
147
+ self.aggregator_model = aggregator_model
148
+
149
+ # create generators
150
+ self.proposer_generators = [
151
+ Generator(model, PromptStrategy.FILTER_MOA_PROPOSER, self.reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
152
+ for model in proposer_models
153
+ ]
154
+ self.aggregator_generator = Generator(aggregator_model, PromptStrategy.FILTER_MOA_AGG, self.reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
155
+
156
+ def __str__(self):
157
+ op = super().__str__()
158
+ op += f" Proposer Models: {self.proposer_models}\n"
159
+ op += f" Temperatures: {self.temperatures}\n"
160
+ op += f" Aggregator Model: {self.aggregator_model}\n"
161
+ return op
162
+
163
+ def get_id_params(self):
164
+ id_params = super().get_id_params()
165
+ id_params = {
166
+ "proposer_models": [model.value for model in self.proposer_models],
167
+ "temperatures": self.temperatures,
168
+ "aggregator_model": self.aggregator_model.value,
169
+ **id_params,
170
+ }
171
+
172
+ return id_params
173
+
174
+ def get_op_params(self):
175
+ op_params = super().get_op_params()
176
+ op_params = {
177
+ "proposer_models": self.proposer_models,
178
+ "temperatures": self.temperatures,
179
+ "aggregator_model": self.aggregator_model,
180
+ **op_params,
181
+ }
182
+
183
+ return op_params
184
+
185
+ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
186
+ """
187
+ Currently, we are using multiple proposer models with different temperatures to synthesize
188
+ answers, which are then aggregated and summarized by a single aggregator model. Thus, we
189
+ roughly expect to incur the cost and time of an LLMFilter * (len(proposer_models) + 1).
190
+ In practice, this naive quality estimate will be overwritten by the CostModel's estimate
191
+ once it executes a few instances of the operator.
192
+ """
193
+ # temporarily set self.model so that super().naive_cost_estimates(...) can compute an estimate
194
+ self.model = self.proposer_models[0]
195
+
196
+ # get naive cost estimates for single LLM call and scale it by number of LLMs used in MoA
197
+ naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
198
+ naive_op_cost_estimates.time_per_record *= (len(self.proposer_models) + 1)
199
+ naive_op_cost_estimates.time_per_record_lower_bound = naive_op_cost_estimates.time_per_record
200
+ naive_op_cost_estimates.time_per_record_upper_bound = naive_op_cost_estimates.time_per_record
201
+ naive_op_cost_estimates.cost_per_record *= (len(self.proposer_models) + 1)
202
+ naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
203
+ naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
204
+
205
+ # for naive setting, estimate quality as mean of all model qualities
206
+ model_qualities = [
207
+ MODEL_CARDS[model.value]["overall"] / 100.0
208
+ for model in self.proposer_models + [self.aggregator_model]
209
+ ]
210
+ naive_op_cost_estimates.quality = sum(model_qualities)/(len(self.proposer_models) + 1)
211
+ naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
212
+ naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
213
+
214
+ # reset self.model to be None
215
+ self.model = None
216
+
217
+ return naive_op_cost_estimates
218
+
219
+ def filter(self, candidate: DataRecord) -> tuple[dict[str, bool], GenerationStats]:
220
+ # get input fields
221
+ input_fields = self.get_input_fields()
222
+
223
+ # construct output fields
224
+ fields = {"passed_operator": FieldInfo(annotation=bool, description="Whether the record passed the filter operation")}
225
+
226
+ # execute generator models in sequence
227
+ proposer_model_final_answers, proposer_model_generation_stats = [], []
228
+ for proposer_generator, temperature in zip(self.proposer_generators, self.temperatures):
229
+ gen_kwargs = {"project_cols": input_fields, "filter_condition": self.filter_obj.filter_condition, "temperature": temperature}
230
+ _, reasoning, generation_stats, _ = proposer_generator(candidate, fields, json_output=False, **gen_kwargs)
231
+ proposer_text = f"REASONING: {reasoning}\n"
232
+ proposer_model_final_answers.append(proposer_text)
233
+ proposer_model_generation_stats.append(generation_stats)
234
+
235
+ # call the aggregator
236
+ gen_kwargs = {
237
+ "project_cols": input_fields,
238
+ "filter_condition": self.filter_obj.filter_condition,
239
+ "model_responses": proposer_model_final_answers,
240
+ }
241
+ field_answers, _, aggregator_gen_stats, _ = self.aggregator_generator(candidate, fields, **gen_kwargs)
242
+
243
+ # compute the total generation stats
244
+ generation_stats = sum(proposer_model_generation_stats) + aggregator_gen_stats
245
+
246
+ return field_answers, generation_stats
@@ -4,7 +4,9 @@ import json
4
4
 
5
5
  from pydantic import BaseModel
6
6
 
7
+ from palimpzest.constants import Modality
7
8
  from palimpzest.core.elements.records import DataRecord, DataRecordSet
9
+ from palimpzest.core.lib.schemas import AUDIO_FIELD_TYPES, IMAGE_FIELD_TYPES
8
10
  from palimpzest.core.models import OperatorCostEstimates
9
11
  from palimpzest.utils.hash_helpers import hash_for_id
10
12
 
@@ -18,8 +20,8 @@ class PhysicalOperator:
18
20
 
19
21
  def __init__(
20
22
  self,
21
- output_schema: BaseModel,
22
- input_schema: BaseModel | None = None,
23
+ output_schema: type[BaseModel],
24
+ input_schema: type[BaseModel] | None = None,
23
25
  depends_on: list[str] | None = None,
24
26
  logical_op_id: str | None = None,
25
27
  unique_logical_op_id: str | None = None,
@@ -39,6 +41,19 @@ class PhysicalOperator:
39
41
  self.verbose = verbose
40
42
  self.op_id = None
41
43
 
44
+ # compute the input modalities (if any) for this physical operator
45
+ self.input_modalities = None
46
+ if self.input_schema is not None:
47
+ self.input_modalities = set()
48
+ for field in self.input_schema.model_fields.values():
49
+ field_type = field.annotation
50
+ if field_type in IMAGE_FIELD_TYPES:
51
+ self.input_modalities.add(Modality.IMAGE)
52
+ elif field_type in AUDIO_FIELD_TYPES:
53
+ self.input_modalities.add(Modality.AUDIO)
54
+ else:
55
+ self.input_modalities.add(Modality.TEXT)
56
+
42
57
  # compute the fields generated by this physical operator
43
58
  input_field_names = list(self.input_schema.model_fields) if self.input_schema is not None else []
44
59
  self.generated_fields = sorted([
@@ -139,6 +154,14 @@ class PhysicalOperator:
139
154
  def get_full_op_id(self):
140
155
  return f"{self.get_logical_op_id()}-{self.get_op_id()}"
141
156
 
157
+ def is_image_op(self) -> bool:
158
+ """Returns True if this physical operator is designed to handle image data."""
159
+ return self.input_modalities is not None and Modality.IMAGE in self.input_modalities
160
+
161
+ def is_audio_op(self) -> bool:
162
+ """Returns True if this physical operator is designed to handle audio data."""
163
+ return self.input_modalities is not None and Modality.AUDIO in self.input_modalities
164
+
142
165
  def __hash__(self):
143
166
  return int(self.op_id, 16) # NOTE: should we use self.get_full_op_id() instead?
144
167
 
@@ -34,13 +34,13 @@ class ProjectOp(PhysicalOperator):
34
34
 
35
35
  def __call__(self, candidate: DataRecord) -> DataRecordSet:
36
36
  # create new DataRecord with projection applied
37
- dr = DataRecord.from_parent(schema=candidate.schema, parent_record=candidate, project_cols=self.project_cols)
37
+ dr = DataRecord.from_parent(schema=candidate.schema, data_item={}, parent_record=candidate, project_cols=self.project_cols)
38
38
 
39
39
  # create RecordOpStats object
40
40
  record_op_stats = RecordOpStats(
41
- record_id=dr.id,
42
- record_parent_ids=dr.parent_ids,
43
- record_source_indices=dr.source_indices,
41
+ record_id=dr._id,
42
+ record_parent_ids=dr._parent_ids,
43
+ record_source_indices=dr._source_indices,
44
44
  record_state=dr.to_dict(include_bytes=False),
45
45
  full_op_id=self.get_full_op_id(),
46
46
  logical_op_id=self.logical_op_id,
@@ -15,6 +15,7 @@ from palimpzest.constants import (
15
15
  from palimpzest.core.elements.records import DataRecord
16
16
  from palimpzest.core.models import GenerationStats, OperatorCostEstimates
17
17
  from palimpzest.query.operators.convert import LLMConvert
18
+ from palimpzest.query.operators.filter import LLMFilter
18
19
 
19
20
 
20
21
  class RAGConvert(LLMConvert):
@@ -26,7 +27,7 @@ class RAGConvert(LLMConvert):
26
27
  self.num_chunks_per_field = num_chunks_per_field
27
28
  self.chunk_size = chunk_size
28
29
 
29
- # crude adjustment factor for naive estimation in no-sentinel setting
30
+ # crude adjustment factor for naive estimation in unoptimized setting
30
31
  self.naive_quality_adjustment = 0.6
31
32
 
32
33
  def __str__(self):
@@ -74,10 +75,6 @@ class RAGConvert(LLMConvert):
74
75
 
75
76
  return naive_op_cost_estimates
76
77
 
77
- def is_image_conversion(self) -> bool:
78
- """RAGConvert is currently disallowed on image conversions, so this must be False."""
79
- return False
80
-
81
78
  def chunk_text(self, text: str, chunk_size: int) -> list[str]:
82
79
  """
83
80
  Given a text string, chunk it into substrings of length chunk_size.
@@ -228,3 +225,203 @@ class RAGConvert(LLMConvert):
228
225
  generation_stats += single_field_stats
229
226
 
230
227
  return field_answers, generation_stats
228
+
229
+
230
+ class RAGFilter(LLMFilter):
231
+ def __init__(self, num_chunks_per_field: int, chunk_size: int = 1000, *args, **kwargs):
232
+ super().__init__(*args, **kwargs)
233
+ # NOTE: in the future, we should abstract the embedding model to allow for different models
234
+ self.client = None
235
+ self.embedding_model = Model.TEXT_EMBEDDING_3_SMALL
236
+ self.num_chunks_per_field = num_chunks_per_field
237
+ self.chunk_size = chunk_size
238
+
239
+ # crude adjustment factor for naive estimation in no-sentinel setting
240
+ self.naive_quality_adjustment = 0.6
241
+
242
+ def __str__(self):
243
+ op = super().__str__()
244
+ op += f" Number of Chunks: {str(self.num_chunks_per_field)}\n"
245
+ op += f" Chunk Size: {str(self.chunk_size)}\n"
246
+ return op
247
+
248
+ def get_id_params(self):
249
+ id_params = super().get_id_params()
250
+ id_params = {"num_chunks_per_field": self.num_chunks_per_field, "chunk_size": self.chunk_size, **id_params}
251
+
252
+ return id_params
253
+
254
+ def get_op_params(self):
255
+ op_params = super().get_op_params()
256
+ return {"num_chunks_per_field": self.num_chunks_per_field, "chunk_size": self.chunk_size, **op_params}
257
+
258
+ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
259
+ """
260
+ Update the cost per record and quality estimates produced by LLMFilter's naive estimates.
261
+ We adjust the cost per record to account for the reduced number of input tokens following
262
+ the retrieval of relevant chunks, and we make a crude estimate of the quality degradation
263
+ that results from using a downsized input (although this may in fact improve quality in
264
+ some cases).
265
+ """
266
+ # get naive cost estimates from LLMFilter
267
+ naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
268
+
269
+ # re-compute cost per record assuming we use fewer input tokens; naively assume a single input field
270
+ est_num_input_tokens = self.num_chunks_per_field * self.chunk_size
271
+ est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
272
+ model_conversion_usd_per_record = (
273
+ MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
274
+ + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
275
+ )
276
+
277
+ # set refined estimate of cost per record
278
+ naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
279
+ naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
280
+ naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
281
+ naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * self.naive_quality_adjustment
282
+ naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
283
+ naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
284
+
285
+ return naive_op_cost_estimates
286
+
287
+ def chunk_text(self, text: str, chunk_size: int) -> list[str]:
288
+ """
289
+ Given a text string, chunk it into substrings of length chunk_size.
290
+ """
291
+ chunks = []
292
+ idx = 0
293
+ while idx + chunk_size < len(text):
294
+ chunks.append(text[idx : idx + chunk_size])
295
+ idx += chunk_size
296
+
297
+ if idx < len(text):
298
+ chunks.append(text[idx:])
299
+
300
+ return chunks
301
+
302
+ def compute_embedding(self, text: str) -> tuple[list[float], GenerationStats]:
303
+ """
304
+ Compute the embedding for a text string. Return the embedding and the GenerationStats object
305
+ that captures the cost of the operation.
306
+ """
307
+ # get the embedding model name
308
+ model_name = self.embedding_model.value
309
+
310
+ # compute the embedding
311
+ start_time = time.time()
312
+ response = self.client.embeddings.create(input=text, model=model_name)
313
+ total_time = time.time() - start_time
314
+
315
+ # extract the embedding
316
+ embedding = response.data[0].embedding
317
+
318
+ # compute the generation stats object
319
+ model_card = MODEL_CARDS[model_name]
320
+ total_input_tokens = response.usage.total_tokens
321
+ total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
322
+ embed_stats = GenerationStats(
323
+ model_name=model_name, # NOTE: this should be overwritten by generation model in filter()
324
+ total_input_tokens=total_input_tokens,
325
+ total_output_tokens=0.0,
326
+ total_input_cost=total_input_cost,
327
+ total_output_cost=0.0,
328
+ cost_per_record=total_input_cost,
329
+ llm_call_duration_secs=total_time,
330
+ total_llm_calls=1,
331
+ total_embedding_llm_calls=1,
332
+ )
333
+
334
+ return embedding, embed_stats
335
+
336
+ def compute_similarity(self, query_embedding: list[float], chunk_embedding: list[float]) -> float:
337
+ """
338
+ Compute the similarity between the query and chunk embeddings.
339
+ """
340
+ return dot(query_embedding, chunk_embedding) / (norm(query_embedding) * norm(chunk_embedding))
341
+
342
+ def get_chunked_candidate(self, candidate: DataRecord, input_fields: list[str]) -> tuple[DataRecord, GenerationStats]:
343
+ """
344
+ For each text field, chunk the content and compute the chunk embeddings. Then select the top-k chunks
345
+ for each field. If a field is smaller than the chunk size, simply include the full field.
346
+ """
347
+ # initialize stats for embedding costs
348
+ embed_stats = GenerationStats()
349
+
350
+ # compute embedding for filter condition
351
+ query_embedding, query_embed_stats = self.compute_embedding(self.filter_obj.filter_condition)
352
+
353
+ # add cost of embedding the query to embed_stats
354
+ embed_stats += query_embed_stats
355
+
356
+ # for each input field, chunk its content and compute the (per-chunk) embeddings
357
+ for field_name in input_fields:
358
+ field = candidate.get_field_type(field_name)
359
+
360
+ # skip this field if it is not a string or a list of strings
361
+ is_string_field = field.annotation in [str, str | None]
362
+ is_list_string_field = field.annotation in [list[str], list[str] | None]
363
+ if not (is_string_field or is_list_string_field):
364
+ continue
365
+
366
+ # if this is a list of strings, join the strings
367
+ if is_list_string_field:
368
+ candidate[field_name] = "[" + ", ".join(candidate[field_name]) + "]"
369
+
370
+ # skip this field if it is a string field and its length is less than the chunk size
371
+ if len(candidate[field_name]) < self.chunk_size:
372
+ continue
373
+
374
+ # chunk the content
375
+ chunks = self.chunk_text(candidate[field_name], self.chunk_size)
376
+
377
+ # compute embeddings for each chunk
378
+ chunk_embeddings, chunk_embed_stats_lst = zip(*[self.compute_embedding(chunk) for chunk in chunks])
379
+
380
+ # add cost of embedding each chunk to embed_stats
381
+ for chunk_embed_stats in chunk_embed_stats_lst:
382
+ embed_stats += chunk_embed_stats
383
+
384
+ # select the top-k chunks
385
+ sorted_chunks = sorted(
386
+ zip(range(len(chunks)), chunks, chunk_embeddings),
387
+ key=lambda tup: self.compute_similarity(query_embedding, tup[2]),
388
+ reverse=True,
389
+ )
390
+ top_k_chunks = [(chunk_idx, chunk) for chunk_idx, chunk, _ in sorted_chunks[:self.num_chunks_per_field]]
391
+
392
+ # sort the top-k chunks by their original index in the content, and join them with ellipses
393
+ top_k_chunks = [chunk for _, chunk in sorted(top_k_chunks, key=lambda tup: tup[0])]
394
+ candidate[field_name] = "...".join(top_k_chunks)
395
+
396
+ return candidate, embed_stats
397
+
398
+ def filter(self, candidate: DataRecord) -> tuple[dict[str, bool], GenerationStats]:
399
+ # set client
400
+ self.client = OpenAI() if self.client is None else self.client
401
+
402
+ # get the set of input fields to use for the filter operation
403
+ input_fields = self.get_input_fields()
404
+
405
+ # construct output fields
406
+ fields = {"passed_operator": FieldInfo(annotation=bool, description="Whether the record passed the filter operation")}
407
+
408
+ # lookup most relevant chunks for each field using embedding search
409
+ candidate_copy = candidate.copy()
410
+ candidate_copy, embed_stats = self.get_chunked_candidate(candidate_copy, input_fields)
411
+
412
+ # construct kwargs for generation
413
+ gen_kwargs = {"project_cols": input_fields, "filter_condition": self.filter_obj.filter_condition}
414
+
415
+ # generate outputs for all fields in a single query
416
+ field_answers, _, generation_stats, _ = self.generator(candidate_copy, fields, **gen_kwargs)
417
+
418
+ # NOTE: summing embedding stats with generation stats is messy because it will lead to misleading
419
+ # measurements of total_input_tokens and total_output_tokens. We should fix this in the future.
420
+ # The good news: as long as we compute the cost_per_record of each GenerationStats object correctly,
421
+ # then the total cost of the operation will be correct (which will roll-up to correctly computing
422
+ # the total cost of the operator, plan, and execution).
423
+ #
424
+ # combine stats from embedding with stats for generation
425
+ generation_stats += embed_stats
426
+
427
+ return field_answers, generation_stats
@@ -145,11 +145,11 @@ class RetrieveOp(PhysicalOperator):
145
145
  Given an input DataRecord and the top_k_results, construct the resulting RecordSet.
146
146
  """
147
147
  # create output DataRecord and set the output attribute
148
- output_dr, answer = DataRecord.from_parent(self.output_schema, parent_record=candidate), {}
149
- for output_field_name in self.output_field_names:
150
- top_k_attr_results = None if top_k_results is None else top_k_results[output_field_name]
151
- setattr(output_dr, output_field_name, top_k_attr_results)
152
- answer[output_field_name] = top_k_attr_results
148
+ data_item = {
149
+ output_field_name: None if top_k_results is None else top_k_results[output_field_name]
150
+ for output_field_name in self.output_field_names
151
+ }
152
+ output_dr = DataRecord.from_parent(self.output_schema, data_item, parent_record=candidate)
153
153
 
154
154
  # get the record_state and generated fields
155
155
  record_state = output_dr.to_dict(include_bytes=False)
@@ -159,16 +159,17 @@ class RetrieveOp(PhysicalOperator):
159
159
 
160
160
  # construct the RecordOpStats object
161
161
  record_op_stats = RecordOpStats(
162
- record_id=output_dr.id,
163
- record_parent_ids=output_dr.parent_ids,
164
- record_source_indices=output_dr.source_indices,
162
+ record_id=output_dr._id,
163
+ record_parent_ids=output_dr._parent_ids,
164
+ record_source_indices=output_dr._source_indices,
165
165
  record_state=record_state,
166
166
  full_op_id=self.get_full_op_id(),
167
167
  logical_op_id=self.logical_op_id,
168
168
  op_name=self.op_name(),
169
169
  time_per_record=total_time,
170
170
  cost_per_record=generation_stats.cost_per_record,
171
- answer=answer,
171
+ total_embedding_cost=generation_stats.cost_per_record,
172
+ answer=data_item,
172
173
  input_fields=list(self.input_schema.model_fields),
173
174
  generated_fields=generated_fields,
174
175
  fn_call_duration_secs=total_time - generation_stats.llm_call_duration_secs,
@@ -71,15 +71,14 @@ class ScanPhysicalOp(PhysicalOperator, ABC):
71
71
  assert all([field in item for field in output_field_names]), f"Some fields in Dataset schema not present in item!\n - Dataset fields: {output_field_names}\n - Item fields: {list(item.keys())}"
72
72
 
73
73
  # construct a DataRecord from the item
74
- dr = DataRecord(self.output_schema, source_indices=[f"{self.datasource.id}-{idx}"])
75
- for field in output_field_names:
76
- setattr(dr, field, item[field])
74
+ data_item = self.output_schema(**{field: item[field] for field in output_field_names})
75
+ dr = DataRecord(data_item, source_indices=[f"{self.datasource.id}-{idx}"])
77
76
 
78
77
  # create RecordOpStats objects
79
78
  record_op_stats = RecordOpStats(
80
- record_id=dr.id,
81
- record_parent_ids=dr.parent_ids,
82
- record_source_indices=dr.source_indices,
79
+ record_id=dr._id,
80
+ record_parent_ids=dr._parent_ids,
81
+ record_source_indices=dr._source_indices,
83
82
  record_state=dr.to_dict(include_bytes=False),
84
83
  full_op_id=self.get_full_op_id(),
85
84
  logical_op_id=self.logical_op_id,
@@ -170,15 +169,15 @@ class ContextScanOp(PhysicalOperator):
170
169
  """
171
170
  # construct a DataRecord from the context
172
171
  start_time = time.time()
173
- dr = DataRecord(self.output_schema, source_indices=[f"{self.context.id}-{0}"])
172
+ dr = DataRecord(self.output_schema(), source_indices=[f"{self.context.id}-{0}"])
174
173
  dr.context = self.context
175
174
  end_time = time.time()
176
175
 
177
176
  # create RecordOpStats objects
178
177
  record_op_stats = RecordOpStats(
179
- record_id=dr.id,
180
- record_parent_ids=dr.parent_ids,
181
- record_source_indices=dr.source_indices,
178
+ record_id=dr._id,
179
+ record_parent_ids=dr._parent_ids,
180
+ record_source_indices=dr._source_indices,
182
181
  record_state=dr.to_dict(include_bytes=False),
183
182
  full_op_id=self.get_full_op_id(),
184
183
  logical_op_id=self.logical_op_id,