palimpzest 0.5.4__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. palimpzest/__init__.py +7 -9
  2. palimpzest/constants.py +47 -7
  3. palimpzest/core/__init__.py +20 -26
  4. palimpzest/core/data/dataclasses.py +9 -2
  5. palimpzest/core/data/datareaders.py +497 -0
  6. palimpzest/core/elements/records.py +29 -37
  7. palimpzest/core/lib/fields.py +14 -12
  8. palimpzest/core/lib/schemas.py +80 -94
  9. palimpzest/policy.py +58 -0
  10. palimpzest/prompts/__init__.py +22 -0
  11. palimpzest/prompts/code_synthesis_prompts.py +28 -0
  12. palimpzest/prompts/convert_prompts.py +87 -0
  13. palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
  14. palimpzest/prompts/filter_prompts.py +69 -0
  15. palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
  16. palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
  17. palimpzest/prompts/prompt_factory.py +732 -0
  18. palimpzest/prompts/util_phrases.py +14 -0
  19. palimpzest/query/execution/execution_strategy.py +0 -3
  20. palimpzest/query/execution/parallel_execution_strategy.py +12 -25
  21. palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
  22. palimpzest/query/generators/generators.py +71 -347
  23. palimpzest/query/operators/__init__.py +5 -5
  24. palimpzest/query/operators/aggregate.py +10 -5
  25. palimpzest/query/operators/code_synthesis_convert.py +4 -48
  26. palimpzest/query/operators/convert.py +5 -2
  27. palimpzest/query/operators/critique_and_refine_convert.py +112 -0
  28. palimpzest/query/operators/filter.py +1 -1
  29. palimpzest/query/operators/limit.py +1 -1
  30. palimpzest/query/operators/logical.py +28 -27
  31. palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
  32. palimpzest/query/operators/physical.py +32 -20
  33. palimpzest/query/operators/project.py +1 -1
  34. palimpzest/query/operators/rag_convert.py +6 -3
  35. palimpzest/query/operators/retrieve.py +13 -31
  36. palimpzest/query/operators/scan.py +150 -0
  37. palimpzest/query/optimizer/__init__.py +5 -1
  38. palimpzest/query/optimizer/cost_model.py +18 -34
  39. palimpzest/query/optimizer/optimizer.py +40 -25
  40. palimpzest/query/optimizer/optimizer_strategy.py +26 -0
  41. palimpzest/query/optimizer/plan.py +2 -2
  42. palimpzest/query/optimizer/rules.py +118 -27
  43. palimpzest/query/processor/config.py +12 -1
  44. palimpzest/query/processor/mab_sentinel_processor.py +125 -112
  45. palimpzest/query/processor/nosentinel_processor.py +46 -62
  46. palimpzest/query/processor/query_processor.py +10 -20
  47. palimpzest/query/processor/query_processor_factory.py +12 -5
  48. palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
  49. palimpzest/query/processor/streaming_processor.py +11 -17
  50. palimpzest/sets.py +170 -94
  51. palimpzest/tools/pdfparser.py +5 -64
  52. palimpzest/utils/datareader_helpers.py +61 -0
  53. palimpzest/utils/field_helpers.py +69 -0
  54. palimpzest/utils/hash_helpers.py +3 -2
  55. palimpzest/utils/udfs.py +0 -28
  56. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/METADATA +49 -49
  57. palimpzest-0.6.1.dist-info/RECORD +87 -0
  58. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/top_level.txt +0 -1
  59. cli/README.md +0 -156
  60. cli/__init__.py +0 -0
  61. cli/cli_main.py +0 -390
  62. palimpzest/config.py +0 -89
  63. palimpzest/core/data/datasources.py +0 -369
  64. palimpzest/datamanager/__init__.py +0 -0
  65. palimpzest/datamanager/datamanager.py +0 -300
  66. palimpzest/prompts.py +0 -397
  67. palimpzest/query/operators/datasource.py +0 -202
  68. palimpzest-0.5.4.dist-info/RECORD +0 -83
  69. palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
  70. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/LICENSE +0 -0
  71. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/WHEEL +0 -0
palimpzest/query/operators/code_synthesis_convert.py
@@ -5,7 +5,6 @@ from typing import Any
 from palimpzest.constants import Cardinality, GPT_4o_MODEL_CARD, Model
 from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
 from palimpzest.core.elements.records import DataRecord
-from palimpzest.datamanager.datamanager import DataDirectory
 from palimpzest.prompts import ADVICEGEN_PROMPT, CODEGEN_PROMPT, EXAMPLE_PROMPT
 from palimpzest.query.generators.generators import code_ensemble_execution, generator_factory
 from palimpzest.query.operators.convert import LLMConvert, LLMConvertBonded, LLMConvertConventional
@@ -26,18 +25,18 @@ class CodeSynthesisConvert(LLMConvert):
         exemplar_generation_model: Model = Model.GPT_4o,
         code_synth_model: Model = Model.GPT_4o,
         conventional_fallback_model: Model = Model.GPT_4o_MINI,
-        cache_across_plans: bool = True,
         *args,
         **kwargs,
     ):
         kwargs["model"] = None
         super().__init__(*args, **kwargs)
+
+        # set models
         self.exemplar_generation_model = exemplar_generation_model
         self.code_synth_model = code_synth_model
         self.conventional_fallback_model = conventional_fallback_model
-        self.cache_across_plans = cache_across_plans

-        # initialize optimization-specific parameters
+        # initialize parameters
         self.field_to_code_ensemble = None
         self.exemplars = []
         self.code_synthesized = False
@@ -47,15 +46,6 @@ class CodeSynthesisConvert(LLMConvert):
             cardinality=Cardinality.ONE_TO_ONE,
             verbose=self.verbose,
         )
-
-        # read the list of exemplars already generated by this operator if present
-        if self.cache_across_plans:
-            cache = DataDirectory().get_cache_service()
-            exemplars_cache_id = self.get_op_id()
-            exemplars = cache.get_cached_data("codeExemplars", exemplars_cache_id)
-            # set and return exemplars if it is not empty
-            if exemplars is not None and isinstance(exemplars, list) and len(exemplars) > 0:
-                self.exemplars = exemplars
         self.field_to_code_ensemble = {}

     def __str__(self):
@@ -80,7 +70,6 @@ class CodeSynthesisConvert(LLMConvert):
             "exemplar_generation_model": self.exemplar_generation_model,
             "code_synth_model": self.code_synth_model,
             "conventional_fallback_model": self.conventional_fallback_model,
-            "cache_across_plans": self.cache_across_plans,
             **op_params,
         }

@@ -109,23 +98,6 @@ class CodeSynthesisConvert(LLMConvert):

         return naive_op_cost_estimates

-    def _fetch_cached_code(self, fields_to_generate: list[str]) -> dict[CodeName, Code]:
-        # if we are allowed to cache synthesized code across plan executions, check the cache
-        field_to_code_ensemble = {}
-        cache = DataDirectory().get_cache_service()
-        for field_name in fields_to_generate:
-            code_ensemble_cache_id = "_".join([self.get_op_id(), field_name])
-            code_ensemble = cache.get_cached_data("codeEnsembles", code_ensemble_cache_id)
-            if code_ensemble is not None:
-                field_to_code_ensemble[field_name] = code_ensemble
-
-        # set and return field_to_code_ensemble if all fields are present and have code
-        if all([field_to_code_ensemble.get(field_name) is not None for field_name in fields_to_generate]):
-            self.field_to_code_ensemble = field_to_code_ensemble
-            return self.field_to_code_ensemble
-        else:
-            return {}
-
     def _should_synthesize(
         self, exemplars: list[Exemplar], num_exemplars: int = 1, code_regenerate_frequency: int = 200, *args, **kwargs
     ) -> bool:
@@ -168,12 +140,6 @@ class CodeSynthesisConvert(LLMConvert):
             field_to_code_ensemble[field_name] = code_ensemble
             generation_stats += code_synth_stats

-            # add code ensemble to the cache
-            if self.cache_across_plans:
-                cache = DataDirectory().get_cache_service()
-                code_ensemble_cache_id = "_".join([self.get_op_id(), field_name])
-                cache.put_cached_data("codeEnsembles", code_ensemble_cache_id, code_ensemble)
-
             if self.verbose:
                 for code_name, code in code_ensemble.items():
                     print(f"CODE NAME: {code_name}")
@@ -184,7 +150,7 @@ class CodeSynthesisConvert(LLMConvert):
         return field_to_code_ensemble, generation_stats

     def _bonded_query_fallback(self, candidate: DataRecord) -> tuple[dict[FieldName, list[Any] | None], GenerationStats]:
-        fields_to_generate = self.get_fields_to_generate(candidate, self.input_schema, self.output_schema)
+        fields_to_generate = self.get_fields_to_generate(candidate)
         projected_candidate = candidate.copy(include_bytes=False, project_cols=self.depends_on)

         # execute the bonded convert
@@ -209,12 +175,6 @@ class CodeSynthesisConvert(LLMConvert):
         exemplars = [(projected_candidate.to_dict(include_bytes=False), dr.to_dict(include_bytes=False)) for dr in drs]
         self.exemplars.extend(exemplars)

-        # if we are allowed to cache exemplars across plan executions, add exemplars to cache
-        if self.cache_across_plans:
-            cache = DataDirectory().get_cache_service()
-            exemplars_cache_id = self.get_op_id()
-            cache.put_cached_data("codeExemplars", exemplars_cache_id, exemplars)
-
         return field_answers, generation_stats

     def is_image_conversion(self):
@@ -231,10 +191,6 @@ class CodeSynthesisConvert(LLMConvert):
             self.field_to_code_ensemble, total_code_synth_stats = self.synthesize_code_ensemble(fields, candidate)
             self.code_synthesized = True
             generation_stats += total_code_synth_stats
-        else:
-            # read the dictionary of ensembles already synthesized by this operator if present
-            if self.cache_across_plans:
-                self.field_to_code_ensemble = self._fetch_cached_code(fields)

         # if we have yet to synthesize code (perhaps b/c we are waiting for more exemplars),
         # use the exemplar generation model to perform the convert (and generate high-quality
palimpzest/query/operators/convert.py
@@ -120,7 +120,7 @@ class ConvertOp(PhysicalOperator, ABC):
             RecordOpStats(
                 record_id=dr.id,
                 record_parent_id=dr.parent_id,
-                record_source_id=dr.source_id,
+                record_source_idx=dr.source_idx,
                 record_state=dr.to_dict(include_bytes=False),
                 op_id=self.get_op_id(),
                 logical_op_id=self.logical_op_id,
@@ -183,7 +183,7 @@ class ConvertOp(PhysicalOperator, ABC):
         start_time = time.time()

         # get fields to generate with this convert
-        fields_to_generate = self.get_fields_to_generate(candidate, self.input_schema, self.output_schema)
+        fields_to_generate = self.get_fields_to_generate(candidate)

         # execute the convert
         field_answers: dict[str, list]
@@ -276,6 +276,9 @@ class NonLLMConvert(ConvertOp):


 class LLMConvert(ConvertOp):
+    """
+    This is the base class for convert operations which use an LLM to generate the output fields.
+    """
     def __init__(
         self,
         model: Model,
palimpzest/query/operators/critique_and_refine_convert.py (new file)
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import Any
+
+from palimpzest.constants import MODEL_CARDS, Model, PromptStrategy
+from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
+from palimpzest.core.elements.records import DataRecord
+from palimpzest.query.generators.generators import generator_factory
+from palimpzest.query.operators.convert import LLMConvert
+
+# TYPE DEFINITIONS
+FieldName = str
+
+
+class CriticAndRefineConvert(LLMConvert):
+
+    def __init__(
+        self,
+        critic_model: Model,
+        refine_model: Model,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.critic_model = critic_model
+        self.refine_model = refine_model
+
+        if self.prompt_strategy == PromptStrategy.COT_QA:
+            self.critic_prompt_strategy = PromptStrategy.COT_QA_CRITIC
+            self.refinement_prompt_strategy = PromptStrategy.COT_QA_REFINE
+        elif self.prompt_strategy == PromptStrategy.COT_QA_IMAGE:
+            self.critic_prompt_strategy = PromptStrategy.COT_QA_IMAGE_CRITIC
+            self.refinement_prompt_strategy = PromptStrategy.COT_QA_IMAGE_REFINE
+        else:
+            raise ValueError(f"Unsupported prompt strategy: {self.prompt_strategy}")
+
+        # create generators
+        self.critic_generator = generator_factory(self.critic_model, self.critic_prompt_strategy, self.cardinality, self.verbose)
+        self.refine_generator = generator_factory(self.refine_model, self.refinement_prompt_strategy, self.cardinality, self.verbose)
+
+    def __str__(self):
+        op = super().__str__()
+        op += f" Critic Model: {self.critic_model}\n"
+        op += f" Critic Prompt Strategy: {self.critic_prompt_strategy}\n"
+        op += f" Refine Model: {self.refine_model}\n"
+        op += f" Refinement Prompt Strategy: {self.refinement_prompt_strategy}\n"
+        return op
+
+    def get_id_params(self):
+        id_params = super().get_id_params()
+        id_params = {
+            "critic_model": self.critic_model.value,
+            "refine_model": self.refine_model.value,
+            **id_params,
+        }
+
+        return id_params
+
+    def get_op_params(self):
+        op_params = super().get_op_params()
+        op_params = {
+            "critic_model": self.critic_model,
+            "refine_model": self.refine_model,
+            **op_params,
+        }
+
+        return op_params
+
+    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
+        """
+        Currently, we are invoking `self.model`, then critiquing its output with `self.critic_model`, and
+        finally refining the output with `self.refine_model`. Thus, we roughly expect to incur the cost
+        and time of three LLMConverts. In practice, this naive quality estimate will be overwritten by the
+        CostModel's estimate once it executes a few instances of the operator.
+        """
+        # get naive cost estimates for first LLM call and multiply by 3 for now;
+        # of course we should sum individual estimates for each model, but this is a rough estimate
+        # and in practice we will need to revamp our naive cost estimates in the near future
+        naive_op_cost_estimates = 3 * super().naive_cost_estimates(source_op_cost_estimates)
+
+        # for naive setting, estimate quality as quality of refine model
+        model_quality = MODEL_CARDS[self.refine_model.value]["overall"] / 100.0
+        naive_op_cost_estimates.quality = model_quality
+        naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
+        naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
+
+        return naive_op_cost_estimates
+
+    def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
+        # get input fields
+        input_fields = self.get_input_fields()
+
+        # NOTE: when I merge in the `abacus` branch, I will want to update this to reflect the changes I made to reasoning extraction
+        # execute the initial model
+        original_gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema}
+        field_answers, reasoning, original_gen_stats = self.generator(candidate, fields, **original_gen_kwargs)
+        original_output = f"REASONING: {reasoning}\nANSWER:{field_answers}\n"
+        original_messages = self.generator.get_messages()
+
+        # execute the critic model
+        critic_gen_kwargs = {"original_output": original_output, "original_messages": original_messages, **original_gen_kwargs}
+        field_answers, reasoning, critic_gen_stats = self.critic_generator(candidate, fields, **critic_gen_kwargs)
+        critique_output = f"REASONING: {reasoning}\nANSWER:{field_answers}\n"
+
+        # execute the refinement model
+        refine_gen_kwargs = {"critique_output": critique_output, **critic_gen_kwargs}
+        field_answers, reasoning, refine_gen_stats = self.refine_generator(candidate, fields, **refine_gen_kwargs)
+
+        # compute the total generation stats
+        generation_stats = original_gen_stats + critic_gen_stats + refine_gen_stats
+
+        return field_answers, generation_stats
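The new operator's convert method chains three generations: an initial answer, a critique of that answer, and a refinement conditioned on the critique. A minimal standalone sketch of the same three-stage pattern (the call_llm helper and the prompt wording are hypothetical stand-ins, not palimpzest API):

def call_llm(prompt: str) -> str:
    # hypothetical stand-in for a real LLM client call
    return f"<model output for: {prompt[:40]}>"

def critique_and_refine(question: str) -> str:
    # 1. initial answer from the base model
    original = call_llm(f"Answer the question.\nQUESTION: {question}")
    # 2. critic model reviews the original answer
    critique = call_llm(f"Critique this answer.\nQUESTION: {question}\nANSWER: {original}")
    # 3. refine model revises the answer, conditioned on the critique
    return call_llm(
        f"Revise the answer using the critique.\n"
        f"QUESTION: {question}\nANSWER: {original}\nCRITIQUE: {critique}"
    )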
palimpzest/query/operators/filter.py
@@ -81,7 +81,7 @@ class FilterOp(PhysicalOperator, ABC):
         record_op_stats = RecordOpStats(
             record_id=dr.id,
             record_parent_id=dr.parent_id,
-            record_source_id=dr.source_id,
+            record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
             op_id=self.get_op_id(),
             logical_op_id=self.logical_op_id,
palimpzest/query/operators/limit.py
@@ -42,7 +42,7 @@ class LimitScanOp(PhysicalOperator):
         record_op_stats = RecordOpStats(
             record_id=dr.id,
             record_parent_id=dr.parent_id,
-            record_source_id=dr.source_id,
+            record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
             op_id=self.get_op_id(),
             logical_op_id=self.logical_op_id,
palimpzest/query/operators/logical.py
@@ -4,6 +4,7 @@ import json
 from typing import Callable

 from palimpzest.constants import AggFunc, Cardinality
+from palimpzest.core.data.datareaders import DataReader
 from palimpzest.core.elements.filters import Filter
 from palimpzest.core.elements.groupbysig import GroupBySig
 from palimpzest.core.lib.schemas import Schema
@@ -15,7 +16,7 @@ class LogicalOperator:
     A logical operator is an operator that operates on Sets.

     Right now it can be one of:
-    - BaseScan (scans data from DataSource)
+    - BaseScan (scans data from DataReader)
     - CacheScan (scans cached Set)
     - FilteredScan (scans input Set and applies filter)
     - ConvertScan (scans input Set and converts it to new Schema)
@@ -38,6 +39,14 @@ class LogicalOperator:
         self.input_schema = input_schema
         self.logical_op_id: str | None = None

+        # compute the fields generated by this logical operator
+        input_field_names = self.input_schema.field_names() if self.input_schema is not None else []
+        self.generated_fields = sorted([
+            field_name
+            for field_name in self.output_schema.field_names()
+            if field_name not in input_field_names
+        ])
+
     def __str__(self) -> str:
         raise NotImplementedError("Abstract method")

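The generated_fields computation added above is just a sorted set difference between the output and input schema field names. A self-contained illustration, with plain lists standing in for Schema.field_names() and hypothetical field names:

# plain lists stand in for Schema.field_names()
input_field_names = ["title", "contents"]
output_field_names = ["title", "contents", "sentiment", "summary"]

generated_fields = sorted(
    field_name
    for field_name in output_field_names
    if field_name not in input_field_names
)
assert generated_fields == ["sentiment", "summary"]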
@@ -58,9 +67,10 @@ class LogicalOperator:
         for computing the logical operator id.

         NOTE: Should be overriden by subclasses to include class-specific parameters.
-        NOTE: input_schema is not included in the id params because it depends on how the Optimizer orders operations.
+        NOTE: input_schema and output_schema are not included in the id params because
+        they depend on how the Optimizer orders operations.
         """
-        return {"output_schema": self.output_schema}
+        return {"generated_fields": self.generated_fields}

     def get_logical_op_params(self) -> dict:
         """
@@ -137,30 +147,27 @@ class Aggregate(LogicalOperator):
 class BaseScan(LogicalOperator):
     """A BaseScan is a logical operator that represents a scan of a particular data source."""

-    def __init__(self, dataset_id: str, output_schema: Schema):
+    def __init__(self, datareader: DataReader, output_schema: Schema):
         super().__init__(output_schema=output_schema)
-        self.dataset_id = dataset_id
+        self.datareader = datareader

     def __str__(self):
-        return f"BaseScan({self.dataset_id},{str(self.output_schema)})"
+        return f"BaseScan({self.datareader},{self.output_schema})"

     def __eq__(self, other) -> bool:
         return (
             isinstance(other, BaseScan)
             and self.input_schema.get_desc() == other.input_schema.get_desc()
             and self.output_schema.get_desc() == other.output_schema.get_desc()
-            and self.dataset_id == other.dataset_id
+            and self.datareader == other.datareader
         )

     def get_logical_id_params(self) -> dict:
-        logical_id_params = super().get_logical_id_params()
-        logical_id_params = {"dataset_id": self.dataset_id, **logical_id_params}
-
-        return logical_id_params
+        return super().get_logical_id_params()

     def get_logical_op_params(self) -> dict:
         logical_op_params = super().get_logical_op_params()
-        logical_op_params = {"dataset_id": self.dataset_id, **logical_op_params}
+        logical_op_params = {"datareader": self.datareader, **logical_op_params}

         return logical_op_params

@@ -168,28 +175,19 @@ class BaseScan(LogicalOperator):
 class CacheScan(LogicalOperator):
     """A CacheScan is a logical operator that represents a scan of a cached Set."""

-    def __init__(self, dataset_id: str, *args, **kwargs):
-        if kwargs.get("input_schema") is not None:
-            raise Exception(
-                f"CacheScan must be initialized with `input_schema=None` but was initialized with "
-                f"`input_schema={kwargs.get('input_schema')}`"
-            )
-
-        super().__init__(*args, **kwargs)
-        self.dataset_id = dataset_id
+    def __init__(self, datareader: DataReader, output_schema: Schema):
+        super().__init__(output_schema=output_schema)
+        self.datareader = datareader

     def __str__(self):
-        return f"CacheScan({str(self.output_schema)},{str(self.dataset_id)})"
+        return f"CacheScan({self.datareader},{self.output_schema})"

     def get_logical_id_params(self) -> dict:
-        logical_id_params = super().get_logical_id_params()
-        logical_id_params = {"dataset_id": self.dataset_id, **logical_id_params}
-
-        return logical_id_params
+        return super().get_logical_id_params()

     def get_logical_op_params(self) -> dict:
         logical_op_params = super().get_logical_op_params()
-        logical_op_params = {"dataset_id": self.dataset_id, **logical_op_params}
+        logical_op_params = {"datareader": self.datareader, **logical_op_params}

         return logical_op_params

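Both scan operators now hold the DataReader object itself rather than a registry-level dataset_id string, which is also why dataset_id disappears from the id and op params above. A hedged sketch of the constructor change (FakeReader is a hypothetical stand-in for a palimpzest DataReader subclass, and TextFile a hypothetical schema):

class FakeReader:
    # hypothetical stand-in for a palimpzest DataReader subclass
    def __init__(self, paths: list[str]):
        self.paths = paths

reader = FakeReader(["docs/a.txt", "docs/b.txt"])
# 0.5.4: BaseScan(dataset_id="mydocs", output_schema=TextFile)
# 0.6.1: BaseScan(datareader=reader, output_schema=TextFile)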
@@ -374,6 +372,7 @@ class RetrieveScan(LogicalOperator):
     def __init__(
         self,
         index,
+        search_func,
         search_attr,
         output_attr,
         k,
@@ -383,6 +382,7 @@ class RetrieveScan(LogicalOperator):
     ):
         super().__init__(*args, **kwargs)
         self.index = index
+        self.search_func = search_func
         self.search_attr = search_attr
         self.output_attr = output_attr
         self.k = k
@@ -409,6 +409,7 @@ class RetrieveScan(LogicalOperator):
         logical_op_params = super().get_logical_op_params()
         logical_op_params = {
             "index": self.index,
+            "search_func": self.search_func,
             "search_attr": self.search_attr,
             "output_attr": self.output_attr,
             "k": self.k,
palimpzest/query/operators/mixture_of_agents_convert.py
@@ -84,7 +84,7 @@ class MixtureOfAgentsConvert(LLMConvert):
         answers, which are then aggregated and summarized by a single aggregator model. Thus, we
         roughly expect to incur the cost and time of an LLMConvert * (len(proposer_models) + 1).
         In practice, this naive quality estimate will be overwritten by the CostModel's estimate
-        once it executes a few code generated examples.
+        once it executes a few instances of the operator.
         """
         # temporarily set self.model so that super().naive_cost_estimates(...) can compute an estimate
         self.model = self.proposer_models[0]
@@ -107,6 +107,9 @@ class MixtureOfAgentsConvert(LLMConvert):
         naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
         naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality

+        # reset self.model to be None
+        self.model = None
+
         return naive_op_cost_estimates

     def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
palimpzest/query/operators/physical.py
@@ -5,7 +5,6 @@ import json
 from palimpzest.core.data.dataclasses import OperatorCostEstimates
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
 from palimpzest.core.lib.schemas import Schema
-from palimpzest.datamanager.datamanager import DataDirectory
 from palimpzest.utils.hash_helpers import hash_for_id


@@ -36,7 +35,14 @@ class PhysicalOperator:
         self.target_cache_id = target_cache_id
         self.verbose = verbose
         self.op_id = None
-        self.datadir = DataDirectory()
+
+        # compute the fields generated by this physical operator
+        input_field_names = self.input_schema.field_names() if self.input_schema is not None else []
+        self.generated_fields = sorted([
+            field_name
+            for field_name in self.output_schema.field_names()
+            if field_name not in input_field_names
+        ])

         # sets __hash__() for each child Operator to be the base class' __hash__() method;
         # by default, if a subclass defines __eq__() but not __hash__() Python will set that
@@ -68,9 +74,12 @@ class PhysicalOperator:
         for computing the physical operator id.

         NOTE: Should be overriden by subclasses to include class-specific parameters.
-        NOTE: input_schema is not included in the id params because it depends on how the Optimizer orders operations.
+        NOTE: input_schema and output_schema are not included in the id params by default,
+        because they may depend on the order of operations chosen by the Optimizer.
+        This is particularly true for convert operations, where the output schema
+        is now the union of the input and output schemas of the logical operator.
         """
-        return {"output_schema": self.output_schema}
+        return {"generated_fields": self.generated_fields}

     def get_op_params(self) -> dict:
         """
@@ -106,7 +115,10 @@ class PhysicalOperator:
         # get op name and op parameters which are relevant for computing the id
         op_name = self.op_name()
         id_params = self.get_id_params()
-        id_params = {k: str(v) for k, v in id_params.items()}
+        id_params = {
+            k: str(v) if k != "output_schema" else sorted(v.field_names())
+            for k, v in id_params.items()
+        }

         # compute, set, and return the op_id
         hash_str = json.dumps({"op_name": op_name, **id_params}, sort_keys=True)
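get_op_id() thus derives a stable id by hashing the op name together with the stringified id params. A self-contained sketch of the same recipe (hash_for_id is approximated with hashlib here; the real helper lives in palimpzest/utils/hash_helpers.py, and the op name and params are hypothetical):

import hashlib
import json

def hash_for_id(s: str) -> str:
    # approximation of palimpzest.utils.hash_helpers.hash_for_id
    return hashlib.sha256(s.encode()).hexdigest()[:10]

op_name = "LLMConvertBonded"  # hypothetical op
id_params = {"model": "gpt-4o", "generated_fields": ["sentiment", "summary"]}
id_params = {k: str(v) for k, v in id_params.items()}

hash_str = json.dumps({"op_name": op_name, **id_params}, sort_keys=True)
op_id = hash_for_id(hash_str)  # deterministic for identical name + params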
@@ -136,20 +148,20 @@ class PhysicalOperator:

         return input_fields

-    def get_fields_to_generate(self, candidate: DataRecord, input_schema: Schema, output_schema: Schema) -> list[str]:
+    def get_fields_to_generate(self, candidate: DataRecord) -> list[str]:
         """
-        Creates the list of field names that an operation needs to generate. Right now this is only used
-        by convert and retrieve operators.
+        Returns the list of field names that this operator needs to generate for the given candidate.
+        This function returns only the fields in self.generated_fields which are not already present
+        in the candidate. This is important for operators with retry logic, where we may only need to
+        recompute a subset of self.generated_fields.
+
+        Right now this is only used by convert and retrieve operators.
         """
-        # construct the list of fields in output_schema which will need to be generated;
-        # specifically, this is the set of fields which are:
-        # 1. not declared in the input schema, and
-        # 2. not present in the candidate's attributes
-        #    a. if the field is present, but its value is None --> we will try to generate it
-        fields_to_generate = []
-        for field_name in output_schema.field_names():
-            if field_name not in input_schema.field_names() and getattr(candidate, field_name, None) is None:
-                fields_to_generate.append(field_name)
+        fields_to_generate = [
+            field_name
+            for field_name in self.generated_fields
+            if getattr(candidate, field_name, None) is None
+        ]

         return fields_to_generate

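The rewritten method simply filters the precomputed self.generated_fields down to whatever is still missing on the candidate, which is what makes partial retries cheap. An illustration with a plain object standing in for a DataRecord (field names hypothetical):

from types import SimpleNamespace

generated_fields = ["sentiment", "summary"]
# on retry, "summary" already succeeded; only "sentiment" is still None
candidate = SimpleNamespace(title="...", contents="...", summary="done", sentiment=None)

fields_to_generate = [
    field_name
    for field_name in generated_fields
    if getattr(candidate, field_name, None) is None
]
assert fields_to_generate == ["sentiment"]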
@@ -168,8 +180,8 @@ class PhysicalOperator:
         when PZ does not have sample execution data -- and it will be necessary
         in some cases even when sample execution data is present. (For example,
         the cardinality of each operator cannot be estimated based on sample
-        execution data alone -- thus DataSourcePhysicalOps need to give
-        at least ballpark correct estimates of this quantity).
+        execution data alone -- thus ScanPhysicalOps need to give at least ballpark
+        correct estimates of this quantity).
         """
         raise NotImplementedError("CostEstimates from abstract method")

@@ -177,7 +189,7 @@ class PhysicalOperator:
         raise NotImplementedError("Calling __call__ from abstract method")

     @staticmethod
-    def execute_op_wrapper(operator: PhysicalOperator, op_input: DataRecord | list[DataRecord]) -> tuple[DataRecordSet, PhysicalOperator]:
+    def execute_op_wrapper(operator: PhysicalOperator, op_input: DataRecord | list[DataRecord] | int) -> tuple[DataRecordSet, PhysicalOperator]:
         """
         Wrapper function around operator execution which also and returns the operator.
         This is useful in the parallel setting(s) where operators are executed by a worker pool,
palimpzest/query/operators/project.py
@@ -40,7 +40,7 @@ class ProjectOp(PhysicalOperator):
         record_op_stats = RecordOpStats(
             record_id=dr.id,
             record_parent_id=dr.parent_id,
-            record_source_id=dr.source_id,
+            record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
             op_id=self.get_op_id(),
             logical_op_id=self.logical_op_id,
palimpzest/query/operators/rag_convert.py
@@ -12,7 +12,7 @@ from palimpzest.constants import (
 )
 from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
 from palimpzest.core.elements.records import DataRecord
-from palimpzest.core.lib.fields import ListField, StringField
+from palimpzest.core.lib.fields import StringField
 from palimpzest.query.operators.convert import FieldName, LLMConvert


@@ -20,7 +20,7 @@ class RAGConvert(LLMConvert):
     def __init__(self, num_chunks_per_field: int, chunk_size: int = 1000, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # NOTE: in the future, we should abstract the embedding model to allow for different models
-        self.client = OpenAI()
+        self.client = None
         self.embedding_model = "text-embedding-3-small"
         self.num_chunks_per_field = num_chunks_per_field
         self.chunk_size = chunk_size
@@ -124,7 +124,7 @@ class RAGConvert(LLMConvert):

             # skip this field if it is not a string or a list of strings
             is_string_field = isinstance(field, StringField)
-            is_list_string_field = isinstance(field, ListField) and isinstance(field.element_type, StringField)
+            is_list_string_field = hasattr(field, "element_type") and isinstance(field.element_type, StringField)
             if not (is_string_field or is_list_string_field):
                 continue

@@ -157,6 +157,9 @@ class RAGConvert(LLMConvert):
         return candidate

     def convert(self, candidate: DataRecord, fields: list[str]) -> tuple[dict[FieldName, list[Any]], GenerationStats]:
+        # set client
+        self.client = OpenAI() if self.client is None else self.client
+
         # get the set of input fields to use for the convert operation
         input_fields = self.get_input_fields()
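Moving the OpenAI() construction from __init__ into convert means building a plan no longer requires an API key or network access; the client is created on first use. The same lazy-initialization pattern in isolation (ExpensiveClient is a hypothetical stand-in for openai.OpenAI):

class ExpensiveClient:
    # hypothetical stand-in for openai.OpenAI
    def __init__(self):
        print("connecting...")  # e.g., reads API keys, opens connections

class Operator:
    def __init__(self):
        self.client = None  # cheap: nothing external touched at plan-build time

    def run(self):
        # create the client only on first execution, then reuse it
        self.client = ExpensiveClient() if self.client is None else self.client
        return self.client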