palimpzest 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +7 -9
- palimpzest/constants.py +47 -7
- palimpzest/core/__init__.py +20 -26
- palimpzest/core/data/dataclasses.py +9 -2
- palimpzest/core/data/datareaders.py +497 -0
- palimpzest/core/elements/records.py +29 -37
- palimpzest/core/lib/fields.py +14 -12
- palimpzest/core/lib/schemas.py +80 -94
- palimpzest/policy.py +58 -0
- palimpzest/prompts/__init__.py +22 -0
- palimpzest/prompts/code_synthesis_prompts.py +28 -0
- palimpzest/prompts/convert_prompts.py +87 -0
- palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
- palimpzest/prompts/filter_prompts.py +69 -0
- palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
- palimpzest/prompts/prompt_factory.py +732 -0
- palimpzest/prompts/util_phrases.py +14 -0
- palimpzest/query/execution/execution_strategy.py +0 -3
- palimpzest/query/execution/parallel_execution_strategy.py +12 -25
- palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
- palimpzest/query/generators/generators.py +71 -347
- palimpzest/query/operators/__init__.py +5 -5
- palimpzest/query/operators/aggregate.py +10 -5
- palimpzest/query/operators/code_synthesis_convert.py +4 -48
- palimpzest/query/operators/convert.py +5 -2
- palimpzest/query/operators/critique_and_refine_convert.py +112 -0
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/logical.py +28 -27
- palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
- palimpzest/query/operators/physical.py +32 -20
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/rag_convert.py +6 -3
- palimpzest/query/operators/retrieve.py +13 -31
- palimpzest/query/operators/scan.py +150 -0
- palimpzest/query/optimizer/__init__.py +5 -1
- palimpzest/query/optimizer/cost_model.py +18 -34
- palimpzest/query/optimizer/optimizer.py +40 -25
- palimpzest/query/optimizer/optimizer_strategy.py +26 -0
- palimpzest/query/optimizer/plan.py +2 -2
- palimpzest/query/optimizer/rules.py +118 -27
- palimpzest/query/processor/config.py +12 -1
- palimpzest/query/processor/mab_sentinel_processor.py +125 -112
- palimpzest/query/processor/nosentinel_processor.py +46 -62
- palimpzest/query/processor/query_processor.py +10 -20
- palimpzest/query/processor/query_processor_factory.py +12 -5
- palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
- palimpzest/query/processor/streaming_processor.py +11 -17
- palimpzest/sets.py +170 -94
- palimpzest/tools/pdfparser.py +5 -64
- palimpzest/utils/datareader_helpers.py +61 -0
- palimpzest/utils/field_helpers.py +69 -0
- palimpzest/utils/hash_helpers.py +3 -2
- palimpzest/utils/udfs.py +0 -28
- {palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/METADATA +49 -49
- palimpzest-0.6.0.dist-info/RECORD +87 -0
- {palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/top_level.txt +0 -1
- cli/README.md +0 -156
- cli/__init__.py +0 -0
- cli/cli_main.py +0 -390
- palimpzest/config.py +0 -89
- palimpzest/core/data/datasources.py +0 -369
- palimpzest/datamanager/__init__.py +0 -0
- palimpzest/datamanager/datamanager.py +0 -300
- palimpzest/prompts.py +0 -397
- palimpzest/query/operators/datasource.py +0 -202
- palimpzest-0.5.3.dist-info/RECORD +0 -83
- palimpzest-0.5.3.dist-info/entry_points.txt +0 -2
- {palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/LICENSE +0 -0
- {palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/WHEEL +0 -0
palimpzest/query/operators/retrieve.py

@@ -9,9 +9,10 @@ from palimpzest.query.operators.physical import PhysicalOperator
 
 
 class RetrieveOp(PhysicalOperator):
-    def __init__(self, index, search_attr, output_attr, k, *args, **kwargs):
+    def __init__(self, index, search_func, search_attr, output_attr, k, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.index = index
+        self.search_func = search_func
         self.search_attr = search_attr
         self.output_attr = output_attr
         self.k = k

@@ -36,6 +37,7 @@ class RetrieveOp(PhysicalOperator):
         op_params = super().get_op_params()
         op_params = {
             "index": self.index,
+            "search_func": self.search_func,
             "search_attr": self.search_attr,
             "output_attr": self.output_attr,
             "k": self.k,

@@ -61,51 +63,31 @@ class RetrieveOp(PhysicalOperator):
 
         query = getattr(candidate, self.search_attr)
 
-        top_k_results = [
-        elif isinstance(query, list):
-            try:
-                # retrieve top entry for each query
-                results = self.index.search(query, k=1)
-
-                # filter for the top-k entries
-                results = [result[0] if isinstance(result, list) else result for result in results]
-                sorted_results = sorted(results, key=lambda result: result["score"], reverse=True)
-                top_k_results = [result["content"] for result in sorted_results[:self.k]]
-                top_k_result_doc_ids = [result["document_id"] for result in sorted_results[:self.k]]
-            except Exception:
-                os.makedirs("retrieve-errors", exist_ok=True)
-                ts = time.time()
-                with open(f"retrieve-errors/error-{ts}.txt", "w") as f:
-                    f.write(str(query))
-
-                top_k_results = ["error-in-retrieve"]
-                top_k_result_doc_ids = ["error-in-retrieve"]
+        try:
+            top_k_results = self.search_func(self.index, query, self.k)
+        except Exception:
+            top_k_results = ["error-in-retrieve"]
+            os.makedirs("retrieve-errors", exist_ok=True)
+            ts = time.time()
+            with open(f"retrieve-errors/error-{ts}.txt", "w") as f:
+                f.write(str(query))
 
         output_dr = DataRecord.from_parent(self.output_schema, parent_record=candidate)
         setattr(output_dr, self.output_attr, top_k_results)
-        output_dr._evidence_file_ids = top_k_result_doc_ids
 
         duration_secs = time.time() - start_time
         answer = {self.output_attr: top_k_results}
         record_state = output_dr.to_dict(include_bytes=False)
-        record_state["_evidence_file_ids"] = top_k_result_doc_ids
 
         # NOTE: right now this should be equivalent to [self.output_attr], but in the future we may
         # want to support the RetrieveOp generating multiple fields. (Also, the function will
         # return the full field name (as opposed to the short field name))
-        generated_fields = self.get_fields_to_generate(candidate
+        generated_fields = self.get_fields_to_generate(candidate)
 
         record_op_stats = RecordOpStats(
             record_id=output_dr.id,
             record_parent_id=output_dr.parent_id,
+            record_source_idx=output_dr.source_idx,
             record_state=record_state,
             op_id=self.get_op_id(),
             logical_op_id=self.logical_op_id,
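The practical effect of this change is that the retrieval logic, previously hard-coded against `self.index.search`, is now supplied by the caller as a `search_func(index, query, k)` callable. A minimal sketch of such a callable is shown below; the index object, its `search` method, and the result fields ("score", "content") are assumptions for illustration and are not part of this diff:

# Hypothetical user-supplied search function matching the
# self.search_func(self.index, query, self.k) call site in RetrieveOp above.
def my_search_func(index, query: str, k: int) -> list[str]:
    # `index.search` and the result keys are assumed; any retrieval backend
    # with a similar API could be adapted to this shape.
    results = index.search(query, k=k)
    results = sorted(results, key=lambda r: r["score"], reverse=True)
    return [r["content"] for r in results[:k]]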
palimpzest/query/operators/scan.py (new file)

@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+import time
+from abc import ABC, abstractmethod
+
+from palimpzest.constants import (
+    LOCAL_SCAN_TIME_PER_KB,
+    MEMORY_SCAN_TIME_PER_KB,
+    Cardinality,
+)
+from palimpzest.core.data.dataclasses import OperatorCostEstimates, RecordOpStats
+from palimpzest.core.data.datareaders import DataReader, DirectoryReader, FileReader
+from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.query.operators.physical import PhysicalOperator
+
+
+class ScanPhysicalOp(PhysicalOperator, ABC):
+    """
+    Physical operators which implement DataReaders require slightly more information
+    in order to accurately compute naive cost estimates. Thus, we use a slightly
+    modified abstract base class for these operators.
+    """
+
+    def __init__(self, datareader: DataReader, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.datareader = datareader
+
+    def __str__(self):
+        op = f"{self.op_name()}({self.datareader}) -> {self.output_schema}\n"
+        op += f" ({', '.join(self.output_schema.field_names())[:30]})\n"
+        return op
+
+    def get_id_params(self):
+        return super().get_id_params()
+
+    def get_op_params(self):
+        op_params = super().get_op_params()
+        return {"datareader": self.datareader, **op_params}
+
+    @abstractmethod
+    def naive_cost_estimates(
+        self,
+        source_op_cost_estimates: OperatorCostEstimates,
+        input_cardinality: Cardinality,
+        input_record_size_in_bytes: int | float,
+    ) -> OperatorCostEstimates:
+        """
+        This function returns a naive estimate of this operator's:
+        - cardinality
+        - time_per_record
+        - cost_per_record
+        - quality
+
+        For the implemented operator. These will be used by the CostModel
+        when PZ does not have sample execution data -- and it will be necessary
+        in some cases even when sample execution data is present. (For example,
+        the cardinality of each operator cannot be estimated based on sample
+        execution data alone -- thus ScanPhysicalOps need to give
+        at least ballpark correct estimates of this quantity).
+        """
+        pass
+
+    def __call__(self, idx: int) -> DataRecordSet:
+        """
+        This function invokes `self.datareader.__getitem__` on the given `idx` to retrieve the next data item.
+        It then returns this item as a DataRecord wrapped in a DataRecordSet.
+        """
+        start_time = time.time()
+        item = self.datareader[idx]
+        end_time = time.time()
+
+        # check that item covers fields in output schema
+        output_field_names = self.output_schema.field_names()
+        assert all([field in item for field in output_field_names]), f"Some fields in DataReader schema not present in item!\n - DataReader fields: {output_field_names}\n - Item fields: {list(item.keys())}"
+
+        # construct a DataRecord from the item
+        dr = DataRecord(self.output_schema, source_idx=idx)
+        for field in output_field_names:
+            setattr(dr, field, item[field])
+
+        # create RecordOpStats objects
+        record_op_stats = RecordOpStats(
+            record_id=dr.id,
+            record_parent_id=dr.parent_id,
+            record_source_idx=dr.source_idx,
+            record_state=dr.to_dict(include_bytes=False),
+            op_id=self.get_op_id(),
+            logical_op_id=self.logical_op_id,
+            op_name=self.op_name(),
+            time_per_record=(end_time - start_time),
+            cost_per_record=0.0,
+            op_details={k: str(v) for k, v in self.get_id_params().items()},
+        )
+
+        # construct and return DataRecordSet object
+        return DataRecordSet([dr], [record_op_stats])
+
+
+class MarshalAndScanDataOp(ScanPhysicalOp):
+    def naive_cost_estimates(
+        self,
+        source_op_cost_estimates: OperatorCostEstimates,
+        input_record_size_in_bytes: int | float,
+    ) -> OperatorCostEstimates:
+        # get inputs needed for naive cost estimation
+        # TODO: we should rename cardinality --> "multiplier" or "selectivity" one-to-one / one-to-many
+
+        # estimate time spent reading each record
+        per_record_size_kb = input_record_size_in_bytes / 1024.0
+        time_per_record = (
+            LOCAL_SCAN_TIME_PER_KB * per_record_size_kb
+            if isinstance(self.datareader, (DirectoryReader, FileReader))
+            else MEMORY_SCAN_TIME_PER_KB * per_record_size_kb
+        )
+
+        # estimate output cardinality
+        cardinality = source_op_cost_estimates.cardinality
+
+        # for now, assume no cost per record for reading data
+        return OperatorCostEstimates(
+            cardinality=cardinality,
+            time_per_record=time_per_record,
+            cost_per_record=0,
+            quality=1.0,
+        )
+
+
+class CacheScanDataOp(ScanPhysicalOp):
+    def naive_cost_estimates(
+        self,
+        source_op_cost_estimates: OperatorCostEstimates,
+        input_record_size_in_bytes: int | float,
+    ):
+        # get inputs needed for naive cost estimation
+        # TODO: we should rename cardinality --> "multiplier" or "selectivity" one-to-one / one-to-many
+
+        # estimate time spent reading each record
+        per_record_size_kb = input_record_size_in_bytes / 1024.0
+        time_per_record = LOCAL_SCAN_TIME_PER_KB * per_record_size_kb
+
+        # estimate output cardinality
+        cardinality = source_op_cost_estimates.cardinality
+
+        # for now, assume no cost per record for reading from cache
+        return OperatorCostEstimates(
+            cardinality=cardinality,
+            time_per_record=time_per_record,
+            cost_per_record=0,
+            quality=1.0,
+        )
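Based only on how this new file uses `self.datareader` (the cost model calls `len(operator.datareader)`, and `ScanPhysicalOp.__call__` expects `datareader[idx]` to return a dict-like item whose keys cover the output schema), a custom reader would roughly take the shape sketched below. How a reader declares its schema to `DataReader` is not shown in this diff, so that part is an assumption:

from palimpzest.core.data.datareaders import DataReader

class InMemoryReader(DataReader):
    """Hypothetical reader over a list of dicts; schema registration is assumed."""

    def __init__(self, items: list[dict]):
        # NOTE: the real DataReader constructor signature is not shown in this
        # diff; calling it with no arguments here is an assumption.
        super().__init__()
        self.items = items

    def __len__(self) -> int:
        # used by the cost model via len(operator.datareader)
        return len(self.items)

    def __getitem__(self, idx: int) -> dict:
        # must return a dict whose keys cover the scan's output schema fields
        return self.items[idx]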
palimpzest/query/optimizer/__init__.py

@@ -10,6 +10,9 @@ from palimpzest.query.optimizer.rules import (
 from palimpzest.query.optimizer.rules import (
     CodeSynthesisConvertSingleRule as _CodeSynthesisConvertSingleRule,
 )
+from palimpzest.query.optimizer.rules import (
+    CriticAndRefineConvertRule as _CriticAndRefineConvertRule,
+)
 from palimpzest.query.optimizer.rules import (
     ImplementationRule as _ImplementationRule,
 )

@@ -64,6 +67,7 @@ ALL_RULES = [
     _BasicSubstitutionRule,
     _CodeSynthesisConvertRule,
     _CodeSynthesisConvertSingleRule,
+    _CriticAndRefineConvertRule,
     _ImplementationRule,
     _LLMConvertBondedRule,
     _LLMConvertConventionalRule,

@@ -86,7 +90,7 @@ IMPLEMENTATION_RULES = [
     rule
     for rule in ALL_RULES
     if issubclass(rule, _ImplementationRule)
-    and rule not in [_CodeSynthesisConvertRule, _ImplementationRule, _LLMConvertRule,
+    and rule not in [_CodeSynthesisConvertRule, _ImplementationRule, _LLMConvertRule, _TokenReducedConvertRule]
 ]
 
 TRANSFORMATION_RULES = [
palimpzest/query/optimizer/cost_model.py

@@ -14,18 +14,17 @@ from typing import Any
 import pandas as pd
 import scipy.stats as stats
 
-from palimpzest.constants import MODEL_CARDS, GPT_4o_MODEL_CARD, Model
+from palimpzest.constants import MODEL_CARDS, NAIVE_BYTES_PER_RECORD, GPT_4o_MODEL_CARD, Model
 from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost, RecordOpStats
 from palimpzest.core.elements.records import DataRecordSet
-from palimpzest.datamanager.datamanager import DataDirectory
 from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
 from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvert
 from palimpzest.query.operators.convert import LLMConvert
-from palimpzest.query.operators.datasource import CacheScanDataOp, DataSourcePhysicalOp, MarshalAndScanDataOp
 from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.rag_convert import RAGConvert
+from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
 from palimpzest.query.operators.token_reduction_convert import TokenReducedConvert
 from palimpzest.query.optimizer.plan import SentinelPlan
 from palimpzest.utils.model_helpers import get_champion_model_name, get_models

@@ -89,10 +88,6 @@ class SampleBasedCostModel:
             for phys_op_id, _ in phys_op_id_to_stats.items()
         ])
 
-        # reference to data directory
-        self.datadir = DataDirectory()
-
-        # import pdb; pdb.set_trace()
 
     def get_costed_phys_op_ids(self):
         return self.costed_phys_op_ids

@@ -131,9 +126,9 @@ class SampleBasedCostModel:
                 "time_per_record": record_op_stats.time_per_record,
                 "quality": record_op_stats.quality,
                 "passed_operator": record_op_stats.passed_operator,
-                "
-                "op_details": record_op_stats.op_details,
-                "answer": record_op_stats.answer,
+                "source_idx": record_op_stats.record_source_idx,  # TODO: remove
+                "op_details": record_op_stats.op_details,  # TODO: remove
+                "answer": record_op_stats.answer,  # TODO: remove
             }
             execution_record_op_stats.append(record_op_stats_dict)
 

@@ -189,14 +184,13 @@ class SampleBasedCostModel:
         est_quality = self.operator_to_stats[logical_op_id][phys_op_id]["quality"]
         est_selectivity = self.operator_to_stats[logical_op_id][phys_op_id]["selectivity"]
 
-        # create source_op_estimates for
-        if isinstance(operator,
-            # get handle to
-            datasource_len = len(datasource)
+        # create source_op_estimates for scan operators if they are not provided
+        if isinstance(operator, ScanPhysicalOp):
+            # get handle to scan operator and pre-compute its size (number of records)
+            datareader_len = len(operator.datareader)
 
             source_op_estimates = OperatorCostEstimates(
-                cardinality=
+                cardinality=datareader_len,
                 time_per_record=0.0,
                 cost_per_record=0.0,
                 quality=1.0,

@@ -245,9 +239,6 @@ class CostModel(BaseCostModel):
         # df contains a column called record_state, that sometimes contain a dict
         # we want to extract the keys from the dict and create a new column for each key
 
-        # reference to data directory
-        self.datadir = DataDirectory()
-
         # set available models
         self.available_models = available_models
 

@@ -610,36 +601,29 @@ class CostModel(BaseCostModel):
 
         # initialize estimates of operator metrics based on naive (but sometimes precise) logic
        if isinstance(operator, MarshalAndScanDataOp):
-            # get handle to
-            dataset_type = operator.get_datasource_type()
-            datasource_len = len(datasource)
-            datasource_memsize = datasource.get_size()
+            # get handle to scan operator and pre-compute its size (number of records)
+            datareader_len = len(operator.datareader)
 
             source_op_estimates = OperatorCostEstimates(
-                cardinality=
+                cardinality=datareader_len,
                 time_per_record=0.0,
                 cost_per_record=0.0,
                 quality=1.0,
             )
 
-            op_estimates = operator.naive_cost_estimates(source_op_estimates,
-                input_record_size_in_bytes=datasource_memsize/datasource_len,
-                dataset_type=dataset_type)
+            op_estimates = operator.naive_cost_estimates(source_op_estimates, input_record_size_in_bytes=NAIVE_BYTES_PER_RECORD)
 
         elif isinstance(operator, CacheScanDataOp):
-            datasource_len = len(datasource)
-            datasource_memsize = datasource.get_size()
+            datareader_len = len(operator.datareader)
 
             source_op_estimates = OperatorCostEstimates(
-                cardinality=
+                cardinality=datareader_len,
                 time_per_record=0.0,
                 cost_per_record=0.0,
                 quality=1.0,
             )
 
-            op_estimates = operator.naive_cost_estimates(source_op_estimates, input_record_size_in_bytes=
+            op_estimates = operator.naive_cost_estimates(source_op_estimates, input_record_size_in_bytes=NAIVE_BYTES_PER_RECORD)
 
         else:
             op_estimates = operator.naive_cost_estimates(source_op_estimates)

@@ -660,7 +644,7 @@ class CostModel(BaseCostModel):
            # NOTE: this cardinality is the only cardinality we estimate directly b/c we can observe how many groups are
            # produced by the groupby in our sample and assume it may generalize to the full workload. To estimate
            # actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
-           # the input cardinality (where the initial input cardinality from the
+           # the input cardinality (where the initial input cardinality from the datareader is known).
            op_estimates.cardinality = sample_op_estimates[op_id]["cardinality"]
            op_estimates.cardinality_lower_bound = op_estimates.cardinality
            op_estimates.cardinality_upper_bound = op_estimates.cardinality
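To make the new scan estimate concrete: the cost model now passes the fixed constant NAIVE_BYTES_PER_RECORD to naive_cost_estimates instead of measuring the datasource's in-memory size. With illustrative values (the real constants live in palimpzest.constants and may differ), the arithmetic from MarshalAndScanDataOp works out as follows:

# Illustrative values only; LOCAL_SCAN_TIME_PER_KB and NAIVE_BYTES_PER_RECORD
# are defined in palimpzest.constants and their real values may differ.
LOCAL_SCAN_TIME_PER_KB = 0.00005    # assumed: seconds per KB scanned
NAIVE_BYTES_PER_RECORD = 1_000_000  # assumed: ~1 MB per record

per_record_size_kb = NAIVE_BYTES_PER_RECORD / 1024.0
time_per_record = LOCAL_SCAN_TIME_PER_KB * per_record_size_kb
print(time_per_record)  # ~0.049 seconds per record under these assumptions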
palimpzest/query/optimizer/optimizer.py

@@ -3,14 +3,12 @@ from __future__ import annotations
 from copy import deepcopy
 
 from palimpzest.constants import Model
-from palimpzest.core.data.
+from palimpzest.core.data.datareaders import DataReader
 from palimpzest.core.lib.fields import Field
-from palimpzest.datamanager.datamanager import DataDirectory
 from palimpzest.policy import Policy
 from palimpzest.query.operators.logical import (
     Aggregate,
     BaseScan,
-    CacheScan,
     ConvertScan,
     FilteredScan,
     GroupByAggregate,

@@ -32,6 +30,7 @@ from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.query.optimizer.primitives import Group, LogicalExpression
 from palimpzest.query.optimizer.rules import (
     CodeSynthesisConvertRule,
+    CriticAndRefineConvertRule,
     LLMConvertBondedRule,
     LLMConvertConventionalRule,
     MixtureOfAgentsConvertRule,

@@ -48,9 +47,18 @@ from palimpzest.query.optimizer.tasks import (
     OptimizePhysicalExpression,
 )
 from palimpzest.sets import Dataset, Set
+from palimpzest.utils.hash_helpers import hash_for_serialized_dict
 from palimpzest.utils.model_helpers import get_champion_model, get_code_champion_model, get_conventional_fallback_model
 
 
+def get_node_uid(node: Dataset | DataReader) -> str:
+    """Helper function to compute the universal identifier for a node in the query plan."""
+    # NOTE: technically, hash_for_serialized_dict(node.serialize()) would be valid for both DataReader and Dataset;
+    # for the moment, I want to be explicit in Dataset about what constitutes a unique Dataset object, but
+    # in the future we may be able to remove universal_identifier() from Dataset and just use this function
+    return node.universal_identifier() if isinstance(node, Dataset) else hash_for_serialized_dict(node.serialize())
+
+
 class Optimizer:
     """
     The optimizer is responsible for searching the space of possible physical plans

@@ -85,8 +93,9 @@ class Optimizer:
         allow_conventional_query: bool = False,
         allow_code_synth: bool = False,
         allow_token_reduction: bool = False,
-        allow_rag_reduction: bool =
+        allow_rag_reduction: bool = False,
         allow_mixtures: bool = True,
+        allow_critic: bool = False,
         optimization_strategy_type: OptimizationStrategyType = OptimizationStrategyType.PARETO,
         use_final_op_quality: bool = False,  # TODO: make this func(plan) -> final_quality
     ):

@@ -129,6 +138,7 @@ class Optimizer:
             self.allow_token_reduction = False
             self.allow_rag_reduction = False
             self.allow_mixtures = False
+            self.allow_critic = False
             self.available_models = [available_models[0]]
 
         # store optimization hyperparameters

@@ -141,6 +151,7 @@ class Optimizer:
         self.allow_token_reduction = allow_token_reduction
         self.allow_rag_reduction = allow_rag_reduction
         self.allow_mixtures = allow_mixtures
+        self.allow_critic = allow_critic
         self.optimization_strategy_type = optimization_strategy_type
         self.use_final_op_quality = use_final_op_quality
 

@@ -180,10 +191,14 @@ class Optimizer:
             if not issubclass(rule, MixtureOfAgentsConvertRule)
         ]
 
+        if not self.allow_critic:
+            self.implementation_rules = [
+                rule for rule in self.implementation_rules if not issubclass(rule, CriticAndRefineConvertRule)
+            ]
+
     def update_cost_model(self, cost_model: CostModel):
         self.cost_model = cost_model
 
-
     def get_physical_op_params(self):
         return {
             "verbose": self.verbose,

@@ -214,20 +229,22 @@ class Optimizer:
         self.strategy = OptimizerStrategyRegistry.get_strategy(optimizer_strategy_type.value)
 
     def construct_group_tree(self, dataset_nodes: list[Set]) -> tuple[list[int], dict[str, Field], dict[str, set[str]]]:
-        # get node, output_schema, and input_schema(if applicable)
+        # get node, output_schema, and input_schema (if applicable)
         node = dataset_nodes[-1]
         output_schema = node.schema
         input_schema = dataset_nodes[-2].schema if len(dataset_nodes) > 1 else None
 
         ### convert node --> Group ###
-        uid = node
+        uid = get_node_uid(node)
 
         # create the op for the given node
         op: LogicalOperator | None = None
+
+        # TODO: add cache scan when we add caching back to PZ
+        # if not self.no_cache:
+        #     op = CacheScan(datareader=node, output_schema=output_schema)
+        if isinstance(node, DataReader):
+            op = BaseScan(datareader=node, output_schema=output_schema)
         elif node._filter is not None:
             op = FilteredScan(
                 input_schema=input_schema,

@@ -269,6 +286,7 @@ class Optimizer:
                 input_schema=input_schema,
                 output_schema=output_schema,
                 index=node._index,
+                search_func=node._search_func,
                 search_attr=node._search_attr,
                 output_attr=node._output_attr,
                 k=node._k,

@@ -283,6 +301,9 @@ class Optimizer:
                 depends_on=node._depends_on,
                 target_cache_id=uid,
             )
+        # some legacy plans may have a useless convert; for now we simply skip it
+        elif output_schema == input_schema:
+            return self.construct_group_tree(dataset_nodes[:-1]) if len(dataset_nodes) > 1 else ([], {}, {})
         else:
             raise NotImplementedError(
                 f"""No logical operator exists for the specified dataset construction.

@@ -306,7 +327,7 @@ class Optimizer:
         # compute the set of (short) field names this operation depends on
         depends_on_field_names = (
             {}
-            if isinstance(node,
+            if isinstance(node, DataReader)
             else {field_name.split(".")[-1] for field_name in node._depends_on}
         )
 

@@ -359,28 +380,22 @@ class Optimizer:
 
     def convert_query_plan_to_group_tree(self, query_plan: Dataset) -> str:
         # Obtain ordered list of datasets
-        dataset_nodes = []
-        node = query_plan
+        dataset_nodes: list[Dataset | DataReader] = []
+        node = deepcopy(query_plan)
 
+        # NOTE: the very first node will be a DataReader; the rest will be Dataset
         while isinstance(node, Dataset):
             dataset_nodes.append(node)
             node = node._source
         dataset_nodes.append(node)
         dataset_nodes = list(reversed(dataset_nodes))
 
-        # remove unnecessary convert if output schema from data source scan matches
-        # input schema for the next operator
-        if len(dataset_nodes) > 1 and dataset_nodes[0].schema.get_desc() == dataset_nodes[1].schema.get_desc():
-            dataset_nodes = [dataset_nodes[0]] + dataset_nodes[2:]
-            if len(dataset_nodes) > 1:
-                dataset_nodes[1]._source = dataset_nodes[0]
-
         # compute depends_on field for every node
         short_to_full_field_name = {}
         for node_idx, node in enumerate(dataset_nodes):
             # update mapping from short to full field names
             short_field_names = node.schema.field_names()
-            full_field_names = node.schema.field_names(unique=True, id=node
+            full_field_names = node.schema.field_names(unique=True, id=get_node_uid(node))
             for short_field_name, full_field_name in zip(short_field_names, full_field_names):
                 # set mapping automatically if this is a new field
                 if short_field_name not in short_to_full_field_name or (

@@ -389,7 +404,7 @@ class Optimizer:
                     short_to_full_field_name[short_field_name] = full_field_name
 
             # if the node is a data source, then skip
-            if isinstance(node,
+            if isinstance(node, DataReader):
                 continue
 
             # If the node already has depends_on specified, then resolve each field name to a full (unique) field name

@@ -400,7 +415,7 @@ class Optimizer:
             # otherwise, make the node depend on all upstream nodes
             node._depends_on = set()
             for upstream_node in dataset_nodes[:node_idx]:
-                node._depends_on.update(upstream_node.schema.field_names(unique=True, id=upstream_node
+                node._depends_on.update(upstream_node.schema.field_names(unique=True, id=get_node_uid(upstream_node)))
             node._depends_on = list(node._depends_on)
 
         # construct tree of groups
palimpzest/query/optimizer/optimizer_strategy.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from copy import deepcopy
 from enum import Enum
 
 from palimpzest.policy import Policy

@@ -31,6 +32,31 @@ class OptimizationStrategy(ABC):
         """Factory method to create strategy instances"""
         return OptimizerStrategyRegistry.get_strategy(strategy_type)
 
+    def normalize_final_plans(self, plans: list[PhysicalPlan]) -> list[PhysicalPlan]:
+        """
+        For each plan in `plans`, this function enforces that the input schema of every
+        operator is the output schema of the previous operator in the plan.
+
+        Args:
+            plans list[PhysicalPlan]: list of physical plans to normalize
+
+        Returns:
+            list[PhysicalPlan]: list of normalized physical plans
+        """
+        normalized_plans = []
+        for plan in plans:
+            normalized_ops = []
+            for idx, op in enumerate(plan.operators):
+                op_copy = deepcopy(op)
+                if idx == 0:
+                    normalized_ops.append(op_copy)
+                else:
+                    op_copy.input_schema = plan.operators[-1].output_schema
+                    normalized_ops.append(op_copy)
+            normalized_plans.append(PhysicalPlan(operators=normalized_ops, plan_cost=plan.plan_cost))
+
+        return normalized_plans
+
 
 class GreedyStrategy(OptimizationStrategy):
     def _get_greedy_physical_plan(self, groups: dict, group_id: int) -> PhysicalPlan:
palimpzest/query/optimizer/plan.py

@@ -3,8 +3,8 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 
 from palimpzest.core.data.dataclasses import PlanCost
-from palimpzest.query.operators.datasource import DataSourcePhysicalOp
 from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.operators.scan import ScanPhysicalOp
 from palimpzest.utils.hash_helpers import hash_for_id
 
 

@@ -100,7 +100,7 @@ class SentinelPlan(Plan):
     def __init__(self, operator_sets: list[list[PhysicalOperator]]):
         # enforce that first operator_set is a scan and that every operator_set has at least one operator
         if len(operator_sets) > 0:
-            assert isinstance(operator_sets[0][0],
+            assert isinstance(operator_sets[0][0], ScanPhysicalOp), "first operator set must be a scan"
             assert all(len(op_set) > 0 for op_set in operator_sets), "every operator set must have at least one operator"
 
         # store operator_sets and logical_op_ids; sort operator_sets internally by op_id
|