palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.1.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,130 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from typing import Callable
5
+
6
+ from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
7
+ from palimpzest.core.elements.records import DataRecord, DataRecordSet
8
+ from palimpzest.core.lib.fields import Field
9
+ from palimpzest.query.operators.physical import PhysicalOperator
10
+
11
+
12
class MapOp(PhysicalOperator):
    """
    Physical operator which applies a user-defined function (UDF) to each input record.

    The UDF receives the input record as a dictionary and must return a dictionary
    mapping every field in the operator's output schema to its computed value.
    """

    def __init__(self, udf: Callable | None = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # the user-defined function applied to each record; may be None until configured
        self.udf = udf

    def __str__(self):
        op = super().__str__()
        # use getattr so __str__ does not raise AttributeError when no UDF is set
        udf_name = getattr(self.udf, "__name__", str(self.udf))
        op += f" UDF: {udf_name}\n"
        return op

    def get_id_params(self):
        id_params = super().get_id_params()
        id_params = {"udf": self.udf, **id_params}

        return id_params

    def get_op_params(self):
        op_params = super().get_op_params()
        op_params = {"udf": self.udf, **op_params}

        return op_params

    def _create_record_set(
        self,
        record: DataRecord,
        generation_stats: GenerationStats,
        total_time: float,
    ) -> DataRecordSet:
        """
        Given the output DataRecord produced by the map UDF (plus its generation stats
        and total execution time), construct the resulting DataRecordSet containing the
        record and its RecordOpStats.
        """
        # create RecordOpStats object
        record_op_stats = RecordOpStats(
            record_id=record.id,
            record_parent_id=record.parent_id,
            record_source_idx=record.source_idx,
            record_state=record.to_dict(include_bytes=False),
            op_id=self.get_op_id(),
            logical_op_id=self.logical_op_id,
            op_name=self.op_name(),
            time_per_record=total_time,
            cost_per_record=0.0,
            fn_call_duration_secs=generation_stats.fn_call_duration_secs,
            answer=record.to_dict(include_bytes=False),
            op_details={k: str(v) for k, v in self.get_id_params().items()},
        )

        return DataRecordSet([record], [record_op_stats])

    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
        """
        Compute naive cost estimates for the Map operation. These estimates assume that the map UDF
        (1) has no cost and (2) has perfect quality.
        """
        # estimate 1 ms single-threaded execution for udf function
        time_per_record = 0.001

        # assume map fn has perfect quality
        return OperatorCostEstimates(
            cardinality=source_op_cost_estimates.cardinality,
            time_per_record=time_per_record,
            cost_per_record=0.0,
            quality=1.0,
        )

    def map(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
        """
        Apply the UDF to the candidate record and return its field answers along with a
        GenerationStats capturing the UDF's wall-clock duration.

        NOTE: `fields` is accepted for signature parity with convert-style operators;
        the UDF itself determines which fields are produced.
        """
        # apply UDF to input record
        start_time = time.time()
        field_answers = {}
        try:
            # execute the UDF function
            field_answers = self.udf(candidate.to_dict())

            # answer should be a dictionary
            assert isinstance(field_answers, dict), (
                "UDF must return a dictionary mapping each input field to its value for map operations"
            )

            if self.verbose:
                print(f"{self.udf.__name__}")

        except Exception as e:
            print(f"Error invoking user-defined function for map: {e}")
            # bare raise preserves the original traceback
            raise

        # create generation stats object containing the time spent executing the UDF function
        generation_stats = GenerationStats(fn_call_duration_secs=time.time() - start_time)

        return field_answers, generation_stats

    def __call__(self, candidate: DataRecord) -> DataRecordSet:
        """
        This method converts an input DataRecord into an output DataRecordSet. The output DataRecordSet contains the
        DataRecord(s) output by the operator's map() method and their corresponding RecordOpStats objects.
        Some subclasses may override this __call__ method to implement their own custom logic.
        """
        start_time = time.time()

        # execute the map operation
        field_answers: dict[str, list]
        fields = dict(self.output_schema.field_map())
        field_answers, generation_stats = self.map(candidate=candidate, fields=fields)
        assert all(field in field_answers for field in fields), "Not all fields are present in output of map!"

        # construct DataRecord from field_answers
        dr = DataRecord.from_parent(schema=self.output_schema, parent_record=candidate)
        for field_name, field_value in field_answers.items():
            dr[field_name] = field_value

        # construct and return DataRecordSet
        record_set = self._create_record_set(
            record=dr,
            generation_stats=generation_stats,
            total_time=time.time() - start_time,
        )

        return record_set
@@ -1,10 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any
4
-
5
3
  from palimpzest.constants import MODEL_CARDS, Model, PromptStrategy
6
4
  from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
7
5
  from palimpzest.core.elements.records import DataRecord
6
+ from palimpzest.core.lib.fields import Field
8
7
  from palimpzest.query.generators.generators import generator_factory
9
8
  from palimpzest.query.operators.convert import LLMConvert
10
9
 
@@ -112,7 +111,7 @@ class MixtureOfAgentsConvert(LLMConvert):
112
111
 
113
112
  return naive_op_cost_estimates
114
113
 
115
- def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
114
+ def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
116
115
  # get input fields
117
116
  input_fields = self.get_input_fields()
118
117
 
@@ -120,8 +119,9 @@ class MixtureOfAgentsConvert(LLMConvert):
120
119
  proposer_model_final_answers, proposer_model_generation_stats = [], []
121
120
  for proposer_generator, temperature in zip(self.proposer_generators, self.temperatures):
122
121
  gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema, "temperature": temperature}
123
- field_answers, reasoning, generation_stats = proposer_generator(candidate, fields, **gen_kwargs)
124
- proposer_model_final_answers.append(f"REASONING: {reasoning}\nANSWER:{field_answers}\n")
122
+ _, reasoning, generation_stats, _ = proposer_generator(candidate, fields, json_output=False, **gen_kwargs)
123
+ proposer_text = f"REASONING:{reasoning}\n"
124
+ proposer_model_final_answers.append(proposer_text)
125
125
  proposer_model_generation_stats.append(generation_stats)
126
126
 
127
127
  # call the aggregator
@@ -130,7 +130,7 @@ class MixtureOfAgentsConvert(LLMConvert):
130
130
  "output_schema": self.output_schema,
131
131
  "model_responses": proposer_model_final_answers,
132
132
  }
133
- field_answers, _, aggregator_gen_stats = self.aggregator_generator(candidate, fields, **gen_kwargs)
133
+ field_answers, _, aggregator_gen_stats, _ = self.aggregator_generator(candidate, fields, **gen_kwargs)
134
134
 
135
135
  # compute the total generation stats
136
136
  generation_stats = sum(proposer_model_generation_stats) + aggregator_gen_stats
@@ -125,6 +125,9 @@ class PhysicalOperator:
125
125
  self.op_id = hash_for_id(hash_str)
126
126
 
127
127
  return self.op_id
128
+
129
+ def get_logical_op_id(self) -> str | None:
130
+ return self.logical_op_id
128
131
 
129
132
  def __hash__(self):
130
133
  return int(self.op_id, 16)
@@ -187,15 +190,3 @@ class PhysicalOperator:
187
190
 
188
191
  def __call__(self, candidate: DataRecord) -> DataRecordSet:
189
192
  raise NotImplementedError("Calling __call__ from abstract method")
190
-
191
- @staticmethod
192
- def execute_op_wrapper(operator: PhysicalOperator, op_input: DataRecord | list[DataRecord] | int) -> tuple[DataRecordSet, PhysicalOperator]:
193
- """
194
- Wrapper function around operator execution which also and returns the operator.
195
- This is useful in the parallel setting(s) where operators are executed by a worker pool,
196
- and it is convenient to return the op_id along with the computation result.
197
- """
198
- record_set = operator(op_input)
199
-
200
- return record_set, operator, op_input
201
-
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any
3
+ import time
4
4
 
5
5
  from numpy import dot
6
6
  from numpy.linalg import norm
@@ -9,11 +9,12 @@ from openai import OpenAI
9
9
  from palimpzest.constants import (
10
10
  MODEL_CARDS,
11
11
  NAIVE_EST_NUM_OUTPUT_TOKENS,
12
+ Model,
12
13
  )
13
14
  from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
14
15
  from palimpzest.core.elements.records import DataRecord
15
- from palimpzest.core.lib.fields import StringField
16
- from palimpzest.query.operators.convert import FieldName, LLMConvert
16
+ from palimpzest.core.lib.fields import Field, StringField
17
+ from palimpzest.query.operators.convert import LLMConvert
17
18
 
18
19
 
19
20
  class RAGConvert(LLMConvert):
@@ -21,7 +22,7 @@ class RAGConvert(LLMConvert):
21
22
  super().__init__(*args, **kwargs)
22
23
  # NOTE: in the future, we should abstract the embedding model to allow for different models
23
24
  self.client = None
24
- self.embedding_model = "text-embedding-3-small"
25
+ self.embedding_model = Model.TEXT_EMBEDDING_3_SMALL
25
26
  self.num_chunks_per_field = num_chunks_per_field
26
27
  self.chunk_size = chunk_size
27
28
 
@@ -93,13 +94,39 @@ class RAGConvert(LLMConvert):
93
94
 
94
95
  return chunks
95
96
 
96
- def compute_embedding(self, text: str) -> list[float]:
97
+ def compute_embedding(self, text: str) -> tuple[list[float], GenerationStats]:
97
98
  """
98
- Compute the embedding for a text string.
99
+ Compute the embedding for a text string. Return the embedding and the GenerationStats object
100
+ that captures the cost of the operation.
99
101
  """
100
- response = self.client.embeddings.create(input=text, model=self.embedding_model)
102
+ # get the embedding model name
103
+ model_name = self.embedding_model.value
104
+
105
+ # compute the embedding
106
+ start_time = time.time()
107
+ response = self.client.embeddings.create(input=text, model=model_name)
108
+ total_time = time.time() - start_time
109
+
110
+ # extract the embedding
111
+ embedding = response.data[0].embedding
112
+
113
+ # compute the generation stats object
114
+ model_card = MODEL_CARDS[model_name]
115
+ total_input_tokens = response.usage.total_tokens
116
+ total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
117
+ embed_stats = GenerationStats(
118
+ model_name=model_name, # NOTE: this should be overwritten by generation model in convert()
119
+ total_input_tokens=total_input_tokens,
120
+ total_output_tokens=0.0,
121
+ total_input_cost=total_input_cost,
122
+ total_output_cost=0.0,
123
+ cost_per_record=total_input_cost,
124
+ llm_call_duration_secs=total_time,
125
+ total_llm_calls=1,
126
+ total_embedding_llm_calls=1,
127
+ )
101
128
 
102
- return response.data[0].embedding
129
+ return embedding, embed_stats
103
130
 
104
131
  def compute_similarity(self, query_embedding: list[float], chunk_embedding: list[float]) -> float:
105
132
  """
@@ -107,18 +134,25 @@ class RAGConvert(LLMConvert):
107
134
  """
108
135
  return dot(query_embedding, chunk_embedding) / (norm(query_embedding) * norm(chunk_embedding))
109
136
 
110
- def get_chunked_candidate(self, candidate: DataRecord, input_fields: list[str], output_fields: list[str]) -> DataRecord:
137
+ def get_chunked_candidate(self, candidate: DataRecord, input_fields: list[str], output_fields: list[str]) -> tuple[DataRecord, GenerationStats]:
111
138
  """
112
139
  For each text field, chunk the content and compute the chunk embeddings. Then select the top-k chunks
113
140
  for each field. If a field is smaller than the chunk size, simply include the full field.
114
141
  """
142
+ # initialize stats for embedding costs
143
+ embed_stats = GenerationStats()
144
+
115
145
  # compute embedding for output fields
116
146
  output_fields_desc = ""
117
147
  field_desc_map = self.output_schema.field_desc_map()
118
148
  for field_name in output_fields:
119
149
  output_fields_desc += f"- {field_name}: {field_desc_map[field_name]}\n"
120
- query_embedding = self.compute_embedding(output_fields_desc)
150
+ query_embedding, query_embed_stats = self.compute_embedding(output_fields_desc)
151
+
152
+ # add cost of embedding the query to embed_stats
153
+ embed_stats += query_embed_stats
121
154
 
155
+ # for each input field, chunk its content and compute the (per-chunk) embeddings
122
156
  for field_name in input_fields:
123
157
  field = candidate.get_field_type(field_name)
124
158
 
@@ -133,14 +167,18 @@ class RAGConvert(LLMConvert):
133
167
  candidate[field_name] = "[" + ", ".join(candidate[field_name]) + "]"
134
168
 
135
169
  # skip this field if it is a string field and its length is less than the chunk size
136
- if isinstance(field, str) and len(candidate[field_name]) < self.chunk_size:
170
+ if len(candidate[field_name]) < self.chunk_size:
137
171
  continue
138
172
 
139
173
  # chunk the content
140
174
  chunks = self.chunk_text(candidate[field_name], self.chunk_size)
141
175
 
142
176
  # compute embeddings for each chunk
143
- chunk_embeddings = [self.compute_embedding(chunk) for chunk in chunks]
177
+ chunk_embeddings, chunk_embed_stats_lst = zip(*[self.compute_embedding(chunk) for chunk in chunks])
178
+
179
+ # add cost of embedding each chunk to embed_stats
180
+ for chunk_embed_stats in chunk_embed_stats_lst:
181
+ embed_stats += chunk_embed_stats
144
182
 
145
183
  # select the top-k chunks
146
184
  sorted_chunks = sorted(
@@ -154,29 +192,39 @@ class RAGConvert(LLMConvert):
154
192
  top_k_chunks = [chunk for _, chunk in sorted(top_k_chunks, key=lambda tup: tup[0])]
155
193
  candidate[field_name] = "...".join(top_k_chunks)
156
194
 
157
- return candidate
195
+ return candidate, embed_stats
158
196
 
159
- def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
197
+ def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
160
198
  # set client
161
199
  self.client = OpenAI() if self.client is None else self.client
162
200
 
163
201
  # get the set of input fields to use for the convert operation
164
202
  input_fields = self.get_input_fields()
203
+ output_fields = list(fields.keys())
165
204
 
166
205
  # lookup most relevant chunks for each field using embedding search
167
206
  candidate_copy = candidate.copy()
168
- candidate_copy = self.get_chunked_candidate(candidate_copy, input_fields)
207
+ candidate_copy, embed_stats = self.get_chunked_candidate(candidate_copy, input_fields, output_fields)
169
208
 
170
209
  # construct kwargs for generation
171
210
  gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
172
211
 
173
212
  # generate outputs for all fields in a single query
174
- field_answers, _, generation_stats = self.generator(candidate_copy, fields, **gen_kwargs)
213
+ field_answers, _, generation_stats, _ = self.generator(candidate_copy, fields, **gen_kwargs)
214
+
215
+ # NOTE: summing embedding stats with generation stats is messy because it will lead to misleading
216
+ # measurements of total_input_tokens and total_output_tokens. We should fix this in the future.
217
+ # The good news: as long as we compute the cost_per_record of each GenerationStats object correctly,
218
+ # then the total cost of the operation will be correct (which will roll-up to correctly computing
219
+ # the total cost of the operator, plan, and execution).
220
+ #
221
+ # combine stats from embedding with stats for generation
222
+ generation_stats += embed_stats
175
223
 
176
224
  # if there was an error for any field, execute a conventional query on that field
177
- for field, answers in field_answers.items():
225
+ for field_name, answers in field_answers.items():
178
226
  if answers is None:
179
- single_field_answers, _, single_field_stats = self.generator(candidate_copy, [field], **gen_kwargs)
227
+ single_field_answers, _, single_field_stats, _ = self.generator(candidate_copy, {field_name: fields[field_name]}, **gen_kwargs)
180
228
  field_answers.update(single_field_answers)
181
229
  generation_stats += single_field_stats
182
230