palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
- palimpzest-0.7.0.dist-info/RECORD +96 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.3.dist-info/RECORD +0 -87
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
**palimpzest/query/operators/retrieve.py**

```diff
@@ -2,31 +2,73 @@ from __future__ import annotations
 
 import os
 import time
+from typing import Callable
 
-from
+from chromadb.api.models.Collection import Collection
+from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
+from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction
+from openai import OpenAI
+from ragatouille.RAGPretrainedModel import RAGPretrainedModel
+from sentence_transformers import SentenceTransformer
+
+from palimpzest.constants import MODEL_CARDS, Model
+from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.core.lib.schemas import Schema
 from palimpzest.query.operators.physical import PhysicalOperator
 
 
 class RetrieveOp(PhysicalOperator):
-    def __init__(
+    def __init__(
+        self,
+        index: Collection | RAGPretrainedModel,
+        search_attr: str,
+        output_attrs: list[dict] | type[Schema],
+        search_func: Callable | None,
+        k: int,
+        *args,
+        **kwargs,
+    ) -> None:
+        """
+        Initialize the RetrieveOp object.
+
+        Args:
+            index (Collection | RAGPretrainedModel): The PZ index to use for retrieval.
+            search_attr (str): The attribute to search on.
+            output_attrs (list[dict]): The output fields containing the results of the search.
+            search_func (Callable | None): The function to use for searching the index. If None, the default search function will be used.
+            k (int): The number of top results to retrieve.
+        """
         super().__init__(*args, **kwargs)
+
+        # extract the field names from the output_attrs
+        if isinstance(output_attrs, Schema):
+            self.output_field_names = output_attrs.field_names()
+        elif isinstance(output_attrs, list):
+            self.output_field_names = [attr["name"] for attr in output_attrs]
+        else:
+            raise ValueError("`output_attrs` must be a list of dicts or a Schema object.")
+
+        if len(self.output_field_names) != 1 and search_func is None:
+            raise ValueError("If `search_func` is None, `output_attrs` must have a single field.")
+
         self.index = index
-        self.search_func = search_func
         self.search_attr = search_attr
-        self.
+        self.output_attrs = output_attrs
+        self.search_func = search_func if search_func is not None else self.default_search_func
        self.k = k
 
     def __str__(self):
         op = super().__str__()
-        op += f" Retrieve: {
+        op += f" Retrieve: {self.index.__class__.__name__} with top {self.k}\n"
         return op
 
     def get_id_params(self):
         id_params = super().get_id_params()
         id_params = {
+            "index": self.index.__class__.__name__,
             "search_attr": self.search_attr,
-            "
+            "output_attrs": self.output_attrs,
             "k": self.k,
             **id_params,
         }
```
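The constructor now accepts either a `Schema` or a list of field dicts for `output_attrs` and normalizes both to a flat list of field names, rejecting anything else. A minimal sketch of that normalization, using a hypothetical `FakeSchema` stand-in for palimpzest's real `Schema` (which also exposes `field_names()`):

```python
# Sketch of the output_attrs normalization added to RetrieveOp.__init__.
# FakeSchema is a made-up stand-in for palimpzest.core.lib.schemas.Schema.
class FakeSchema:
    @classmethod
    def field_names(cls) -> list[str]:
        return ["snippet"]


def extract_output_field_names(output_attrs) -> list[str]:
    # Schema path: the schema knows its own field names
    if hasattr(output_attrs, "field_names"):
        return output_attrs.field_names()
    # list-of-dicts path: each dict carries the field name under "name"
    elif isinstance(output_attrs, list):
        return [attr["name"] for attr in output_attrs]
    raise ValueError("`output_attrs` must be a list of dicts or a Schema object.")


assert extract_output_field_names(FakeSchema) == ["snippet"]
assert extract_output_field_names([{"name": "snippet", "desc": "top matches"}]) == ["snippet"]
```

The constructor additionally requires exactly one output field whenever `search_func` is None, because the built-in `default_search_func` writes its results to a single field.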
```diff
@@ -39,7 +81,7 @@ class RetrieveOp(PhysicalOperator):
             "index": self.index,
             "search_func": self.search_func,
             "search_attr": self.search_attr,
-            "
+            "output_attrs": self.output_attrs,
             "k": self.k,
             **op_params,
         }
@@ -53,37 +95,86 @@ class RetrieveOp(PhysicalOperator):
         """
         return OperatorCostEstimates(
             cardinality=source_op_cost_estimates.cardinality,
-            time_per_record=0.
-            cost_per_record=0.
+            time_per_record=0.01 * self.k,  # estimate 10 ms execution lookup per output
+            cost_per_record=0.001 * self.k,  # estimate small marginal cost of lookups
             quality=1.0,
         )
 
-    def
-
+    def default_search_func(self, index: Collection | RAGPretrainedModel, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
+        """
+        Default search function for the Retrieve operation. This function uses the index to
+        retrieve the top-k results for the given query. The query will be a (possibly singleton)
+        list of strings or a list of lists of floats (i.e., embeddings). The function will return
+        the top-k results per-query in (descending) sorted order. If the input is a singleton list,
+        then the output will be a list of strings. If the input is a list of lists, then the output
+        will be a list of lists of strings.
 
-
+        Args:
+            index (PZIndex): The index to use for retrieval.
+            query (list[str] | list[list[float]]): The query (or queries) to search for.
+            k (int): The maximum number of results the retrieve operator will return.
 
-
-
-
-
-
-
-
-
+        Returns:
+            list[str] | list[list[str]]: The top results in (descending) sorted order per query.
+        """
+        # check if the input is a singleton list or a list of lists
+        is_singleton_list = len(query) == 1
+
+        if isinstance(index, Collection):
+            # if the index is a chromadb collection, use the query method
+            results = index.query(query, n_results=k)
+
+            # the results["documents"] will be a list[list[str]]; if the input is a singleton list,
+            # then we output the list of strings (i.e., the first element of the list), otherwise
+            # we output the list of lists
+            final_results = results["documents"][0] if is_singleton_list else results["documents"]
+
+            # NOTE: self.output_field_names must be a singleton for default_search_func to be used
+            return {self.output_field_names[0]: final_results}
 
-
-
+        elif isinstance(index, RAGPretrainedModel):
+            # if the index is a rag model, use the rag model to get the top k results
+            results = index.search(query, k=k)
+
+            # the results will be a list[dict] if the input is a singleton list; however,
+            # it will be a list[list[dict]] if the input is a list of lists
+            final_results = []
+            if is_singleton_list:
+                final_results = [result["content"] for result in results]
+            else:
+                for query_results in results:
+                    final_results.append([result["content"] for result in query_results])
+
+            # NOTE: self.output_field_names must be a singleton for default_search_func to be used
+            return {self.output_field_names[0]: final_results}
+
+        else:
+            raise ValueError("Unsupported index type. Must be either a Collection or RAGPretrainedModel.")
+
+    def _create_record_set(
+        self,
+        candidate: DataRecord,
+        top_k_results: dict[str, list[str] | list[list[str]]] | None,
+        generation_stats: GenerationStats,
+        total_time: float,
+    ) -> DataRecordSet:
+        """
+        Given an input DataRecord and the top_k_results, construct the resulting RecordSet.
+        """
+        # create output DataRecord and set the output attribute
+        output_dr, answer = DataRecord.from_parent(self.output_schema, parent_record=candidate), {}
+        for output_field_name in self.output_field_names:
+            top_k_attr_results = None if top_k_results is None else top_k_results[output_field_name]
+            setattr(output_dr, output_field_name, top_k_attr_results)
+            answer[output_field_name] = top_k_attr_results
 
-
-        answer = {self.output_attr: top_k_results}
+        # get the record_state and generated fields
         record_state = output_dr.to_dict(include_bytes=False)
 
-        # NOTE:
-
-        # return the full field name (as opposed to the short field name))
-        generated_fields = self.get_fields_to_generate(candidate)
+        # NOTE: this should be equivalent to self.get_fields_to_generate()
+        generated_fields = self.output_field_names
 
+        # construct the RecordOpStats object
         record_op_stats = RecordOpStats(
             record_id=output_dr.id,
             record_parent_id=output_dr.parent_id,
```
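`default_search_func` has to reconcile two result shapes: chromadb's `Collection.query` always returns batched results, where `results["documents"][i]` is the top-k list for query *i*, while a single-query caller expects a flat list. A small sketch of that shape handling, using a stubbed result rather than a live index:

```python
# Stubbed chromadb-style result: two queries, k=2 documents each.
batched_documents = [["doc-a", "doc-b"], ["doc-c", "doc-d"]]


def flatten_documents(documents: list[list[str]], is_singleton_list: bool) -> list:
    # one query in -> flat list of documents out; a batch in -> list of lists
    return documents[0] if is_singleton_list else documents


assert flatten_documents(batched_documents, is_singleton_list=False) == [["doc-a", "doc-b"], ["doc-c", "doc-d"]]
assert flatten_documents(batched_documents[:1], is_singleton_list=True) == ["doc-a", "doc-b"]
```

The RAGatouille branch applies the same rule in reverse: `RAGPretrainedModel.search` returns a flat `list[dict]` for one query and a `list[list[dict]]` for a batch, so only the batched case needs the inner loop over per-query results.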
```diff
@@ -92,19 +183,124 @@ class RetrieveOp(PhysicalOperator):
             op_id=self.get_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
-            time_per_record=
-            cost_per_record=
+            time_per_record=total_time,
+            cost_per_record=generation_stats.cost_per_record,
             answer=answer,
             input_fields=self.input_schema.field_names(),
             generated_fields=generated_fields,
-            fn_call_duration_secs=
+            fn_call_duration_secs=total_time - generation_stats.llm_call_duration_secs,
+            llm_call_duration_secs=generation_stats.llm_call_duration_secs,
+            total_llm_calls=generation_stats.total_llm_calls,
+            total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
             op_details={k: str(v) for k, v in self.get_id_params().items()},
         )
 
         drs = [output_dr]
         record_op_stats_lst = [record_op_stats]
 
-        # construct record set
-
+        # construct and return the record set
+        return DataRecordSet(drs, record_op_stats_lst)
 
-
+
+    def __call__(self, candidate: DataRecord) -> DataRecordSet:
+        start_time = time.time()
+
+        # check that query is a string or list of strings, otherwise return output with self.output_field_names set to None
+        query = getattr(candidate, self.search_attr)
+        query_is_str = isinstance(query, str)
+        query_is_list_of_str = isinstance(query, list) and all(isinstance(q, str) for q in query)
+        if not query_is_str and not query_is_list_of_str:
+            return self._create_record_set(
+                candidate=candidate,
+                top_k_results=None,
+                generation_stats=GenerationStats(),
+                total_time=time.time() - start_time,
+            )
+
+        # if query is a string, convert it to a list of strings
+        if query_is_str:
+            query = [query]
+
+        # compute input/query embedding(s) if the index is a chromadb collection
+        inputs, gen_stats = None, GenerationStats()
+        if isinstance(self.index, Collection):
+            uses_openai_embedding_fcn = isinstance(self.index._embedding_function, OpenAIEmbeddingFunction)
+            uses_sentence_transformer_embedding_fcn = isinstance(self.index._embedding_function, SentenceTransformerEmbeddingFunction)
+            error_msg = "ChromaDB index must use OpenAI or SentenceTransformer embedding function; see: https://docs.trychroma.com/integrations/embedding-models/openai"
+            assert uses_openai_embedding_fcn or uses_sentence_transformer_embedding_fcn, error_msg
+
+            model_name = self.index._embedding_function._model_name if uses_openai_embedding_fcn else "clip-ViT-B-32"
+            err_msg = f"For Chromadb, we currently only support `text-embedding-3-small` and `clip-ViT-B-32`; your index uses: {model_name}"
+            assert model_name in [Model.TEXT_EMBEDDING_3_SMALL.value, Model.CLIP_VIT_B_32.value], err_msg
+
+            # compute embeddings
+            try:
+                embed_start_time = time.time()
+                total_input_tokens = 0.0
+                if uses_openai_embedding_fcn:
+                    client = OpenAI()
+                    response = client.embeddings.create(input=query, model=model_name)
+                    total_input_tokens = response.usage.total_tokens
+                    inputs = [item.embedding for item in response.data]
+
+                elif uses_sentence_transformer_embedding_fcn:
+                    model = SentenceTransformer(model_name)
+                    inputs = model.encode(query)
+
+                embed_total_time = time.time() - embed_start_time
+
+                # compute cost of embedding(s)
+                model_card = MODEL_CARDS[model_name]
+                total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
+                gen_stats = GenerationStats(
+                    model_name=model_name,
+                    total_input_tokens=total_input_tokens,
+                    total_output_tokens=0.0,
+                    total_input_cost=total_input_cost,
+                    total_output_cost=0.0,
+                    cost_per_record=total_input_cost,
+                    llm_call_duration_secs=embed_total_time,
+                    total_llm_calls=1,
+                    total_embedding_llm_calls=len(query),
+                )
+            except Exception:
+                query = None
+
+        # in the default case, pass string inputs rather than embeddings
+        if inputs is None:
+            inputs = query
+
+        try:
+            assert inputs is not None, "Error: inputs is None (likely because embedding generation failed)"
+            top_results = self.search_func(self.index, inputs, self.k)
+
+        except Exception:
+            top_results = ["error-in-retrieve"]
+            os.makedirs("retrieve-errors", exist_ok=True)
+            ts = time.time()
+            with open(f"retrieve-errors/error-{ts}.txt", "w") as f:
+                f.write(str(query))
+
+        # TODO: the user is always right! let's drop this post-processing in the future
+        # filter top_results for the top_k_results
+        top_k_results = {output_field_name: [] for output_field_name in self.output_field_names}
+        for output_field_name in self.output_field_names:
+            if output_field_name in top_results:
+                if all([isinstance(result, list) for result in top_results[output_field_name]]):
+                    for result in top_results[output_field_name]:
+                        top_k_results[output_field_name].append(result[:self.k])
+                else:
+                    top_k_results[output_field_name] = top_results[output_field_name][:self.k]
+            else:
+                top_k_results[output_field_name] = []
+
+        if self.verbose:
+            print(f"Top {self.k} results: {top_k_results}")
+
+        # construct and return the record set
+        return self._create_record_set(
+            candidate=candidate,
+            top_k_results=top_k_results,
+            generation_stats=gen_stats,
+            total_time=time.time() - start_time,
+        )
```
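The post-processing at the end of `__call__` defensively clips whatever a (possibly user-supplied) `search_func` returns down to `k` results per query, handling both flat and batched shapes and tolerating missing fields. A self-contained sketch of that logic (the function name `clip_to_k` is ours, not the library's):

```python
def clip_to_k(top_results: dict[str, list], field_names: list[str], k: int) -> dict[str, list]:
    """Clip each output field's results to at most k entries per query."""
    top_k_results: dict[str, list] = {name: [] for name in field_names}
    for name in field_names:
        results = top_results.get(name, [])  # missing field -> empty results
        if all(isinstance(r, list) for r in results):
            # batched shape: clip each query's result list separately
            top_k_results[name] = [r[:k] for r in results]
        else:
            # single-query shape: clip the flat list
            top_k_results[name] = results[:k]
    return top_k_results


assert clip_to_k({"snippet": ["a", "b", "c"]}, ["snippet"], k=2) == {"snippet": ["a", "b"]}
assert clip_to_k({"snippet": [["a", "b", "c"], ["d"]]}, ["snippet"], k=2) == {"snippet": [["a", "b"], ["d"]]}
```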
**palimpzest/query/operators/scan.py**

```diff
@@ -69,14 +69,17 @@ class ScanPhysicalOp(PhysicalOperator, ABC):
         item = self.datareader[idx]
         end_time = time.time()
 
+        # TODO: remove once validation data is refactored
+        item_field_dict = item.get("fields", item)
+
         # check that item covers fields in output schema
         output_field_names = self.output_schema.field_names()
-        assert all([field in
+        assert all([field in item_field_dict for field in output_field_names]), f"Some fields in DataReader schema not present in item!\n - DataReader fields: {output_field_names}\n - Item fields: {list(item.keys())}"
 
         # construct a DataRecord from the item
         dr = DataRecord(self.output_schema, source_idx=idx)
         for field in output_field_names:
-            setattr(dr, field,
+            setattr(dr, field, item_field_dict[field])
 
         # create RecordOpStats objects
         record_op_stats = RecordOpStats(
```
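The scan change is a small backward-compatibility shim: items from a `DataReader` may now arrive either as a flat dict of fields or wrapped under a `"fields"` key (the shape the TODO says will disappear once validation data is refactored). The one-liner `item.get("fields", item)` covers both, as this sketch shows; the `"labels"` key is a made-up example of extra wrapped content:

```python
flat_item = {"title": "a", "text": "b"}
wrapped_item = {"fields": {"title": "a", "text": "b"}, "labels": {"relevant": True}}

for item in (flat_item, wrapped_item):
    # unwrap the "fields" dict if present, otherwise treat the item itself as the fields
    item_field_dict = item.get("fields", item)
    assert item_field_dict == {"title": "a", "text": "b"}
```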
**palimpzest/query/operators/split_convert.py** (new file)

```diff
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+import math
+
+from palimpzest.constants import (
+    MODEL_CARDS,
+    NAIVE_EST_NUM_INPUT_TOKENS,
+    NAIVE_EST_NUM_OUTPUT_TOKENS,
+    PromptStrategy,
+)
+from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
+from palimpzest.core.elements.records import DataRecord
+from palimpzest.core.lib.fields import Field, StringField
+from palimpzest.query.generators.generators import generator_factory
+from palimpzest.query.operators.convert import LLMConvert
+
+
+class SplitConvert(LLMConvert):
+    def __init__(self, num_chunks: int = 2, min_size_to_chunk: int = 1000, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.num_chunks = num_chunks
+        self.min_size_to_chunk = min_size_to_chunk
+        self.split_generator = generator_factory(self.model, PromptStrategy.SPLIT_PROPOSER, self.cardinality, self.verbose)
+        self.split_merge_generator = generator_factory(self.model, PromptStrategy.SPLIT_MERGER, self.cardinality, self.verbose)
+
+        # crude adjustment factor for naive estimation in no-sentinel setting
+        self.naive_quality_adjustment = 0.6
+
+    def __str__(self):
+        op = super().__str__()
+        op += f" Chunk Size: {str(self.num_chunks)}\n"
+        op += f" Min Size to Chunk: {str(self.min_size_to_chunk)}\n"
+        return op
+
+    def get_id_params(self):
+        id_params = super().get_id_params()
+        id_params = {"num_chunks": self.num_chunks, "min_size_to_chunk": self.min_size_to_chunk, **id_params}
+
+        return id_params
+
+    def get_op_params(self):
+        op_params = super().get_op_params()
+        return {"num_chunks": self.num_chunks, "min_size_to_chunk": self.min_size_to_chunk, **op_params}
+
+    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
+        """
+        Update the cost per record and quality estimates produced by LLMConvert's naive estimates.
+        We adjust the cost per record to account for the reduced number of input tokens following
+        the retrieval of relevant chunks, and we make a crude estimate of the quality degradation
+        that results from using a downsized input (although this may in fact improve quality in
+        some cases).
+        """
+        # get naive cost estimates from LLMConvert
+        naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
+
+        # re-compute cost per record assuming we use fewer input tokens; naively assume a single input field
+        est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS
+        est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
+        model_conversion_usd_per_record = (
+            MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
+            + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
+        )
+
+        # set refined estimate of cost per record and, for now,
+        # assume quality multiplier is proportional to sqrt(sqrt(token_budget))
+        naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
+        naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
+        naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
+        naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * self.naive_quality_adjustment
+        naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
+        naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
+
+        return naive_op_cost_estimates
+
+    def is_image_conversion(self) -> bool:
+        """SplitConvert is currently disallowed on image conversions, so this must be False."""
+        return False
+
+    def get_text_chunks(self, text: str, num_chunks: int) -> list[str]:
+        """
+        Given a text string, chunk it into num_chunks substrings of roughly equal size.
+        """
+        chunks = []
+
+        idx, chunk_size = 0, math.ceil(len(text) / num_chunks)
+        while idx + chunk_size < len(text):
+            chunks.append(text[idx : idx + chunk_size])
+            idx += chunk_size
+
+        if idx < len(text):
+            chunks.append(text[idx:])
+
+        return chunks
+
+    def get_chunked_candidate(self, candidate: DataRecord, input_fields: list[str]) -> list[DataRecord]:
+        """
+        For each text field, chunk the content. If a field is smaller than the chunk size,
+        simply include the full field.
+        """
+        # compute mapping from each field to its chunked content
+        field_name_to_chunked_content = {}
+        for field_name in input_fields:
+            field = candidate.get_field_type(field_name)
+            content = candidate[field_name]
+
+            # do not chunk this field if it is not a string or a list of strings
+            is_string_field = isinstance(field, StringField)
+            is_list_string_field = hasattr(field, "element_type") and isinstance(field.element_type, StringField)
+            if not (is_string_field or is_list_string_field):
+                field_name_to_chunked_content[field_name] = [content]
+                continue
+
+            # if this is a list of strings, join the strings
+            if is_list_string_field:
+                content = "[" + ", ".join(content) + "]"
+
+            # skip this field if its length is less than the min size to chunk
+            if len(content) < self.min_size_to_chunk:
+                field_name_to_chunked_content[field_name] = [content]
+                continue
+
+            # chunk the content
+            field_name_to_chunked_content[field_name] = self.get_text_chunks(content, self.num_chunks)
+
+        # compute the true number of chunks (may be 1 if all fields are not chunked)
+        num_chunks = max(len(chunks) for chunks in field_name_to_chunked_content.values())
+
+        # create the chunked candidates
+        candidates = []
+        for chunk_idx in range(num_chunks):
+            candidate_copy = candidate.copy()
+            for field_name in input_fields:
+                field_chunks = field_name_to_chunked_content[field_name]
+                candidate_copy[field_name] = field_chunks[chunk_idx] if len(field_chunks) > 1 else field_chunks[0]
+
+            candidates.append(candidate_copy)
+
+        return candidates
+
+    def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
+        # get the set of input fields to use for the convert operation
+        input_fields = self.get_input_fields()
+
+        # lookup most relevant chunks for each field using embedding search
+        candidate_copy = candidate.copy()
+        chunked_candidates = self.get_chunked_candidate(candidate_copy, input_fields)
+
+        # construct kwargs for generation
+        gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
+
+        # generate outputs for each chunk separately
+        chunk_outputs, chunk_generation_stats_lst = [], []
+        for candidate in chunked_candidates:
+            _, reasoning, chunk_generation_stats, _ = self.split_generator(candidate, fields, json_output=False, **gen_kwargs)
+            chunk_outputs.append(reasoning)
+            chunk_generation_stats_lst.append(chunk_generation_stats)
+
+        # call the merger
+        gen_kwargs = {
+            "project_cols": input_fields,
+            "output_schema": self.output_schema,
+            "chunk_outputs": chunk_outputs,
+        }
+        field_answers, _, merger_gen_stats, _ = self.split_merge_generator(candidate, fields, **gen_kwargs)
+
+        # compute the total generation stats
+        generation_stats = sum(chunk_generation_stats_lst) + merger_gen_stats
+
+        return field_answers, generation_stats
```
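`SplitConvert`'s chunker is simple enough to lift out and run standalone: it cuts the text into `num_chunks` pieces of `ceil(len(text) / num_chunks)` characters, with the final chunk absorbing any remainder. A sketch copied in spirit from `get_text_chunks`:

```python
import math


def get_text_chunks(text: str, num_chunks: int) -> list[str]:
    """Split text into num_chunks substrings of roughly equal size."""
    chunks = []
    idx, chunk_size = 0, math.ceil(len(text) / num_chunks)
    while idx + chunk_size < len(text):
        chunks.append(text[idx : idx + chunk_size])
        idx += chunk_size
    if idx < len(text):
        chunks.append(text[idx:])  # final (possibly shorter) chunk
    return chunks


assert get_text_chunks("abcdefghij", 3) == ["abcd", "efgh", "ij"]
assert get_text_chunks("short", 2) == ["sho", "rt"]
```

`convert` then runs the SPLIT_PROPOSER generator over each chunked candidate, collects the per-chunk reasoning, and hands all of it to the SPLIT_MERGER generator to produce the final field answers, summing the per-chunk generation stats with the merger's.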
**palimpzest/query/operators/token_reduction_convert.py**

```diff
@@ -9,7 +9,7 @@ from palimpzest.constants import (
     NAIVE_EST_NUM_OUTPUT_TOKENS,
 )
 from palimpzest.core.data.dataclasses import OperatorCostEstimates
-from palimpzest.query.operators.convert import
+from palimpzest.query.operators.convert import LLMConvertBonded
 from palimpzest.utils.token_reduction_helpers import best_substring_match, find_best_range
 
 
@@ -32,8 +32,8 @@ from palimpzest.utils.token_reduction_helpers import best_substring_match, find_best_range
 # - this also creates difficulties in properly performing cost-estimation for this operator; e.g. if we use
 #   n <= MAX_HEATMAP_UPDATES samples to cost this operator, then we will never actually measure its performance
 #   in the token reduction phase -- which could have a serious degradation in quality that our optimizer doesn't see
-class
-    # NOTE: moving these closer to the
+class TokenReducedConvertBonded(LLMConvertBonded):
+    # NOTE: moving these closer to the TokenReducedConvertBonded class for now (in part to make
     # them easier to mock); we can make these parameterized as well
     MAX_HEATMAP_UPDATES: int = 5
     TOKEN_REDUCTION_SAMPLE: int = 0
@@ -90,9 +90,9 @@ class TokenReducedConvert(LLMConvert):
         naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
 
         return naive_op_cost_estimates
-
+
     def is_image_conversion(self) -> bool:
-        """
+        """TokenReducedConvertBonded is currently disallowed on image conversions, so this must be False."""
         return False
 
     def reduce_context(self, full_context: str) -> str:
@@ -119,7 +119,9 @@ class TokenReducedConvert(LLMConvert):
         return sample
 
     def _dspy_generate_fields(self, prompt: str, content: str | list[str]) -> tuple[list[dict[str, list]] | Any]:
-        raise Exception(
+        raise Exception(
+            "TokenReducedConvertBonded is executing despite being deprecated until implementation changes can be made."
+        )
         answer, query_stats = None, None
         if self.first_execution or self.count < self.MAX_HEATMAP_UPDATES:
             if self.verbose:
@@ -165,11 +167,3 @@ class TokenReducedConvert(LLMConvert):
             self.heatmap[norm_si:norm_ei] = map(lambda x: x + 1, self.heatmap[norm_si:norm_ei])
 
         return answer, query_stats
-
-
-class TokenReducedConvertConventional(TokenReducedConvert, LLMConvertConventional):
-    pass
-
-
-class TokenReducedConvertBonded(TokenReducedConvert, LLMConvertBonded):
-    pass
```
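The net effect of this file's changes is a flatter class hierarchy: the shared `TokenReducedConvert` base and the two `pass`-body subclasses are gone, leaving a single `TokenReducedConvertBonded(LLMConvertBonded)`. A toy sketch of the before/after method resolution order, with illustrative stand-in classes rather than the real operators:

```python
class LLMConvert:  # stand-ins for the real convert operators
    pass


class LLMConvertBonded(LLMConvert):
    pass


# 0.6.3-style: shared token-reduction logic in a common base, combined
# with each convert flavor via multiple inheritance
class TokenReducedConvert(LLMConvert):
    MAX_HEATMAP_UPDATES = 5


class OldTokenReducedConvertBonded(TokenReducedConvert, LLMConvertBonded):
    pass


# 0.7.0-style: one concrete class with a single base
class NewTokenReducedConvertBonded(LLMConvertBonded):
    MAX_HEATMAP_UPDATES = 5


print([c.__name__ for c in OldTokenReducedConvertBonded.__mro__])
# ['OldTokenReducedConvertBonded', 'TokenReducedConvert', 'LLMConvertBonded', 'LLMConvert', 'object']
print([c.__name__ for c in NewTokenReducedConvertBonded.__mro__])
# ['NewTokenReducedConvertBonded', 'LLMConvertBonded', 'LLMConvert', 'object']
```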
**palimpzest/query/optimizer/__init__.py**

```diff
@@ -19,12 +19,6 @@ from palimpzest.query.optimizer.rules import (
 from palimpzest.query.optimizer.rules import (
     LLMConvertBondedRule as _LLMConvertBondedRule,
 )
-from palimpzest.query.optimizer.rules import (
-    LLMConvertConventionalRule as _LLMConvertConventionalRule,
-)
-from palimpzest.query.optimizer.rules import (
-    LLMConvertRule as _LLMConvertRule,
-)
 from palimpzest.query.optimizer.rules import (
     LLMFilterRule as _LLMFilterRule,
 )
@@ -50,13 +44,10 @@ from palimpzest.query.optimizer.rules import (
     Rule as _Rule,
 )
 from palimpzest.query.optimizer.rules import (
-
-)
-from palimpzest.query.optimizer.rules import (
-    TokenReducedConvertConventionalRule as _TokenReducedConvertConventionalRule,
+    SplitConvertRule as _SplitConvertRule,
 )
 from palimpzest.query.optimizer.rules import (
-
+    TokenReducedConvertBondedRule as _TokenReducedConvertBondedRule,
 )
 from palimpzest.query.optimizer.rules import (
     TransformationRule as _TransformationRule,
@@ -70,8 +61,6 @@ ALL_RULES = [
     _CriticAndRefineConvertRule,
     _ImplementationRule,
     _LLMConvertBondedRule,
-    _LLMConvertConventionalRule,
-    _LLMConvertRule,
     _LLMFilterRule,
     _MixtureOfAgentsConvertRule,
     _NonLLMConvertRule,
@@ -80,9 +69,8 @@ ALL_RULES = [
     _RAGConvertRule,
     _RetrieveRule,
     _Rule,
+    _SplitConvertRule,
     _TokenReducedConvertBondedRule,
-    _TokenReducedConvertConventionalRule,
-    _TokenReducedConvertRule,
     _TransformationRule,
 ]
 
@@ -90,7 +78,7 @@ IMPLEMENTATION_RULES = [
     rule
     for rule in ALL_RULES
     if issubclass(rule, _ImplementationRule)
-    and rule not in [_CodeSynthesisConvertRule, _ImplementationRule
+    and rule not in [_CodeSynthesisConvertRule, _ImplementationRule]
 ]
 
 TRANSFORMATION_RULES = [
```