palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
  57. palimpzest-0.7.0.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.3.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
palimpzest/query/processor/query_processor.py

@@ -1,19 +1,17 @@
+import logging
 from abc import abstractmethod
-from concurrent.futures import ThreadPoolExecutor
 
-from palimpzest.core.data.dataclasses import PlanStats, RecordOpStats
+from palimpzest.core.data.dataclasses import PlanStats
 from palimpzest.core.data.datareaders import DataReader
 from palimpzest.core.elements.records import DataRecord, DataRecordCollection
 from palimpzest.policy import Policy
-from palimpzest.query.optimizer.cost_model import CostModel
+from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
 from palimpzest.query.optimizer.optimizer import Optimizer
-from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
-from palimpzest.query.optimizer.plan import PhysicalPlan
-from palimpzest.query.processor.config import QueryProcessorConfig
-from palimpzest.sets import Dataset, Set
+from palimpzest.sets import Dataset
 from palimpzest.utils.hash_helpers import hash_for_id
 from palimpzest.utils.model_helpers import get_models
 
+logger = logging.getLogger(__name__)
 
 class QueryProcessor:
     """
@@ -25,9 +23,18 @@ class QueryProcessor:
     def __init__(
         self,
         dataset: Dataset,
-        optimizer: Optimizer = None,
-        config: QueryProcessorConfig = None,
-        *args,
+        optimizer: Optimizer,
+        execution_strategy: ExecutionStrategy,
+        sentinel_execution_strategy: SentinelExecutionStrategy | None,
+        num_samples: int | None = None,
+        val_datasource: DataReader | None = None,
+        scan_start_idx: int = 0,
+        cache: bool = False,
+        verbose: bool = False,
+        progress: bool = True,
+        max_workers: int | None = None,
+        policy: Policy | None = None,
+        available_models: list[str] | None = None,
         **kwargs,
     ):
         """
@@ -35,48 +42,32 @@
 
        Args:
            dataset: Dataset to process
-           optimizer: Custom optimizer (optional)
-           execution_engine: Custom execution engine (optional)
-           config: Configuration dictionary for default components
+           TODO
        """
-        assert config is not None, "QueryProcessorConfig is required for QueryProcessor"
-
-        self.config = config or QueryProcessorConfig()
         self.dataset = dataset
-        self.datareader = self._get_datareader(self.dataset)
-        self.num_samples = self.config.num_samples
-        self.val_datasource = self.config.val_datasource
-        self.scan_start_idx = self.config.scan_start_idx
-        self.nocache = self.config.nocache
-        self.verbose = self.config.verbose
-        self.max_workers = self.config.max_workers
-        self.num_workers_per_plan = self.config.num_workers_per_plan
-        self.min_plans = self.config.min_plans
+        self.optimizer = optimizer
+        self.execution_strategy = execution_strategy
+        self.sentinel_execution_strategy = sentinel_execution_strategy
 
-        self.policy = self.config.policy
+        self.num_samples = num_samples
+        self.val_datasource = val_datasource
+        self.scan_start_idx = scan_start_idx
+        self.cache = cache
+        self.verbose = verbose
+        self.progress = progress
+        self.max_workers = max_workers
 
-        self.available_models = self.config.available_models
+        self.policy = policy
+
+        self.available_models = available_models
         if self.available_models is None or len(self.available_models) == 0:
             self.available_models = get_models(include_vision=True)
 
         if self.verbose:
             print("Available models: ", self.available_models)
 
-        # Initialize optimizer and execution engine
-        # TODO: config currently has optimizer field which is string.
-        # In this case, we only use the initialized optimizer. Later after we split the config to multiple configs, there won't be such confusion.
-        assert optimizer is not None, "Optimizer is required. Please use QueryProcessorFactory.create_processor() to initialize a QueryProcessor."
-        self.optimizer = optimizer
-
-    def _get_datareader(self, dataset: Set | DataReader) -> DataReader:
-        """
-        Gets the DataReader for the given dataset.
-        """
-        # iterate until we reach DataReader
-        while isinstance(dataset, Set):
-            dataset = dataset._source
-
-        return dataset
+        logger.info(f"Initialized QueryProcessor {self.__class__.__name__}")
+        logger.debug(f"QueryProcessor initialized with config: {self.__dict__}")
 
     def execution_id(self) -> str:
         """
@@ -89,177 +80,18 @@
 
         return hash_for_id(id_str)
 
-    def get_max_quality_plan_id(self, plans: list[PhysicalPlan]) -> str:
-        """
-        Return the plan_id for the plan with the highest quality in the list of plans.
-        """
-        max_quality_plan_id, max_quality = None, -1
-        for plan in plans:
-            if plan.quality > max_quality or max_quality_plan_id is None:
-                max_quality_plan_id = plan.plan_id
-                max_quality = plan.quality
-
-        return max_quality_plan_id
-
-    def aggregate_plan_stats(self, plan_stats: list[PlanStats]) -> dict[str, PlanStats]:
-        """
-        Aggregate a list of plan stats into a dictionary mapping plan_id --> cumulative plan stats.
-
-        NOTE: we make the assumption that the same plan cannot be run more than once in parallel,
-        i.e. each plan stats object for an individual plan comes from two different (sequential)
-        periods in time. Thus, PlanStats' total_plan_time(s) can be summed.
-        """
-        agg_plan_stats = {}
-        for ps in plan_stats:
-            if ps.plan_id in agg_plan_stats:
-                agg_plan_stats[ps.plan_id] += ps
-            else:
-                agg_plan_stats[ps.plan_id] = ps
-
-        return agg_plan_stats
-
-    def execute_plans(
-        self, plans: list[PhysicalPlan], max_quality_plan_id: str, num_samples: int | float = float("inf")
-    ):
-        """
-        Execute a given list of plans for num_samples records each. Plans are executed in parallel.
-        If any workers are unused, then additional workers are distributed evenly among plans.
-        """
-        # compute number of plans
-        num_plans = len(plans)
-
-        # set plan_parallel_workers and workers_per_plan;
-        # - plan_parallel_workers controls how many plans are executed in parallel
-        # - workers_per_plan controls how many threads are assigned to executing each plan
-        plan_parallel_workers, workers_per_plan = None, None
-        if self.max_workers <= num_plans:
-            plan_parallel_workers = self.max_workers
-            workers_per_plan = [1 for _ in range(num_plans)]
-        else:
-            plan_parallel_workers = num_plans
-            workers_per_plan = [(self.max_workers // num_plans) for _ in range(num_plans)]
-            idx = 0
-            while sum(workers_per_plan) < self.max_workers:
-                workers_per_plan[idx] += 1
-                idx += 1
-
-        with ThreadPoolExecutor(max_workers=plan_parallel_workers) as executor:
-            results = list(executor.map(lambda x: self.execute_plan(**x),
-                    [{"plan": plan,
-                      "num_samples": num_samples,
-                      "plan_workers": plan_workers}
-                     for plan, plan_workers in zip(plans, workers_per_plan)],
-                )
-            )
-        # results = list(map(lambda x: self.execute_plan(**x),
-        #         [{"plan": plan,
-        #           "num_samples": num_samples,
-        #           "plan_workers": 1}
-        #          for plan in plans],
-        #     )
-        # )
-        # split results into per-plan records and plan stats
-        all_records, all_plan_stats = zip(*results)
-
-        # process results to get sample execution data and sentinel plan stats
-        all_sample_execution_data, return_records = [], []
-        for records, plan_stats, plan in zip(all_records, all_plan_stats, plans):
-            # aggregate sentinel est. data
-            for operator_stats in plan_stats.operator_stats.values():
-                all_sample_execution_data.extend(operator_stats.record_op_stats_lst)
-
-            # if this is the max quality plan for this set of plans, return its results for these records
-            if plan.plan_id == max_quality_plan_id:
-                return_records = records
-
-        return all_sample_execution_data, return_records, all_plan_stats
-
-    def _execute_best_plan(
-        self,
-        dataset: Dataset,
-        policy: Policy,
-        optimizer: Optimizer,
-        execution_data: list[RecordOpStats] | None = None,
-    ) -> tuple[list[DataRecord], list[PlanStats]]:
+    def _execute_best_plan(self, dataset: Dataset, optimizer: Optimizer) -> tuple[list[DataRecord], list[PlanStats]]:
         # get the optimal plan according to the optimizer
-        plans = optimizer.optimize(dataset, policy)
+        plans = optimizer.optimize(dataset)
         final_plan = plans[0]
+
         # execute the plan
-        # TODO: for some reason this is not picking up change to self.max_workers from PipelinedParallelPlanExecutor.__init__()
-        records, plan_stats = self.execute_plan(
-            plan=final_plan,
-            plan_workers=self.max_workers,
-        )
+        records, plan_stats = self.execution_strategy.execute_plan(plan=final_plan)
 
         # return the output records and plan stats
         return records, [plan_stats]
-
-    def _execute_with_strategy(
-        self,
-        dataset: Dataset,
-        policy: Policy,
-        optimizer: Optimizer,
-        execution_data: list[RecordOpStats] | None = None,
-    ) -> tuple[list[DataRecord], list[PlanStats]]:
-        records, plan_stats = [], []
-        if optimizer.optimization_strategy_type == OptimizationStrategyType.CONFIDENCE_INTERVAL:
-            records, plan_stats = self._execute_confidence_interval_strategy(dataset, policy, optimizer, execution_data)
-        else:
-            records, plan_stats = self._execute_best_plan(dataset, policy, optimizer, execution_data)
-        return records, plan_stats
-
-
-    def _execute_confidence_interval_strategy(
-        self,
-        dataset: Dataset,
-        policy: Policy,
-        optimizer: Optimizer,
-        execution_data: list[RecordOpStats] | None = None,
-    ) -> tuple[list[DataRecord], list[PlanStats]]:
-        # initialize output records and plan stats
-        if execution_data is None:
-            execution_data = []
-        records, plan_stats = [], []
-
-        # get the initial set of optimal plans according to the optimizer
-        plans = optimizer.optimize(dataset, policy)
-        while len(plans) > 1 and self.scan_start_idx < len(self.datareader):
-            # identify the plan with the highest quality in the set
-            max_quality_plan_id = self.get_max_quality_plan_id(plans)
-
-            # execute the set of plans for a fixed number of samples
-            new_execution_data, new_records, new_plan_stats = self.execute_plans(
-                list(plans), max_quality_plan_id, self.num_samples
-            )
-            records.extend(new_records)
-            plan_stats.extend(new_plan_stats)
-
-            if self.scan_start_idx + self.num_samples < len(self.datareader):
-                # update cost model and optimizer
-                execution_data.extend(new_execution_data)
-                cost_model = CostModel(sample_execution_data=execution_data)
-                optimizer.update_cost_model(cost_model)
-
-                # get new set of plans
-                plans = optimizer.optimize(dataset, policy)
-
-            # update scan start idx
-            self.scan_start_idx += self.num_samples
-
-        if self.scan_start_idx < len(self.datareader):
-            # execute final plan until end
-            final_plan = plans[0]
-            new_records, new_plan_stats = self.execute_plan(
-                plan=final_plan,
-                plan_workers=self.max_workers,
-            )
-            records.extend(new_records)
-            plan_stats.append(new_plan_stats)
-
-        # return the final set of records and plan stats
-        return records, plan_stats
 
     # TODO: consider to support dry_run.
     @abstractmethod
     def execute(self) -> DataRecordCollection:
-        raise NotImplementedError("Abstract method to be overwritten by sub-classes")
+        raise NotImplementedError("Abstract method to be overwritten by sub-classes")
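
In 0.7.0, `QueryProcessor.__init__` takes the `Optimizer` and execution strategies directly instead of a `QueryProcessorConfig`, and `_execute_best_plan` delegates plan execution to the injected `ExecutionStrategy`. A minimal sketch of a concrete subclass under the new interface (the `MinimalProcessor` name is hypothetical, and constructing `DataRecordCollection` without execution stats is an assumption; in practice processors are instantiated via `QueryProcessorFactory`, not directly):

    from palimpzest.core.elements.records import DataRecordCollection
    from palimpzest.query.processor.query_processor import QueryProcessor

    class MinimalProcessor(QueryProcessor):
        """Hypothetical subclass illustrating the abstract execute() hook."""

        def execute(self) -> DataRecordCollection:
            # optimize the dataset, then run the single best plan through the
            # ExecutionStrategy that the factory injected at construction time
            records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)
            return DataRecordCollection(records)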

palimpzest/query/processor/query_processor_factory.py

@@ -1,69 +1,123 @@
+import logging
 from enum import Enum
 
 from palimpzest.core.elements.records import DataRecordCollection
-from palimpzest.query.execution.execution_strategy import ExecutionStrategyType
+from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
+from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType, SentinelExecutionStrategyType
 from palimpzest.query.optimizer.cost_model import CostModel
 from palimpzest.query.optimizer.optimizer import Optimizer
-from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
+from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
 from palimpzest.query.processor.config import QueryProcessorConfig
-from palimpzest.query.processor.mab_sentinel_processor import (
-    MABSentinelPipelinedParallelProcessor,
-    MABSentinelSequentialSingleThreadProcessor,
-)
-from palimpzest.query.processor.nosentinel_processor import (
-    NoSentinelPipelinedParallelProcessor,
-    NoSentinelPipelinedSingleThreadProcessor,
-    NoSentinelSequentialSingleThreadProcessor,
-)
+from palimpzest.query.processor.processing_strategy_type import ProcessingStrategyType
 from palimpzest.query.processor.query_processor import QueryProcessor
-from palimpzest.query.processor.random_sampling_sentinel_processor import (
-    RandomSamplingSentinelPipelinedParallelProcessor,
-    RandomSamplingSentinelSequentialSingleThreadProcessor,
-)
-from palimpzest.query.processor.streaming_processor import StreamingQueryProcessor
 from palimpzest.sets import Dataset, Set
 from palimpzest.utils.model_helpers import get_models
 
+logger = logging.getLogger(__name__)
 
-class ProcessingStrategyType(Enum):
-    """How to generate and optimize query plans"""
-    MAB_SENTINEL = "mab_sentinel"
-    NO_SENTINEL = "nosentinel"
-    RANDOM_SAMPLING = "random_sampling"
-    STREAMING = "streaming"
-    AUTO = "auto"
 
-def convert_to_enum(enum_type: type[Enum], value: str) -> Enum:
-    if value == "pipelined":
-        value = "pipelined_single_thread"
-    value = value.upper().replace('-', '_')
-    try:
-        return enum_type[value]
-    except KeyError as e:
-        raise ValueError(f"Unsupported {enum_type.__name__}: {value}") from e
+class QueryProcessorFactory:
 
+    @classmethod
+    def _convert_to_enum(cls, enum_type: type[Enum], value: str) -> Enum:
+        value = value.upper().replace('-', '_')
+        try:
+            return enum_type[value]
+        except KeyError as e:
+            raise ValueError(f"Unsupported {enum_type.__name__}: {value}") from e
 
-class QueryProcessorFactory:
-    PROCESSOR_MAPPING = {
-        (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.SEQUENTIAL):
-            NoSentinelSequentialSingleThreadProcessor,
-        (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.PIPELINED_SINGLE_THREAD):
-            NoSentinelPipelinedSingleThreadProcessor,
-        (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.PIPELINED_PARALLEL):
-            NoSentinelPipelinedParallelProcessor,
-        (ProcessingStrategyType.MAB_SENTINEL, ExecutionStrategyType.SEQUENTIAL):
-            MABSentinelSequentialSingleThreadProcessor,
-        (ProcessingStrategyType.MAB_SENTINEL, ExecutionStrategyType.PIPELINED_PARALLEL):
-            MABSentinelPipelinedParallelProcessor,
-        (ProcessingStrategyType.STREAMING, ExecutionStrategyType.SEQUENTIAL):
-            StreamingQueryProcessor,
-        (ProcessingStrategyType.STREAMING, ExecutionStrategyType.PIPELINED_PARALLEL):
-            StreamingQueryProcessor,
-        (ProcessingStrategyType.RANDOM_SAMPLING, ExecutionStrategyType.SEQUENTIAL):
-            RandomSamplingSentinelSequentialSingleThreadProcessor,
-        (ProcessingStrategyType.RANDOM_SAMPLING, ExecutionStrategyType.PIPELINED_PARALLEL):
-            RandomSamplingSentinelPipelinedParallelProcessor,
-    }
+    @classmethod
+    def _normalize_strategies(cls, config: QueryProcessorConfig):
+        """
+        Convert the string representation of each strategy into its Enum equivalent and throw
+        an exception if the conversion fails.
+        """
+        strategy_types = {
+            "processing_strategy": ProcessingStrategyType,
+            "execution_strategy": ExecutionStrategyType,
+            "sentinel_execution_strategy": SentinelExecutionStrategyType,
+            "optimizer_strategy": OptimizationStrategyType,
+        }
+        for strategy in ["processing_strategy", "execution_strategy", "sentinel_execution_strategy", "optimizer_strategy"]:
+            strategy_str = getattr(config, strategy)
+            strategy_type = strategy_types[strategy]
+            strategy_enum = None
+            if strategy_str is not None:
+                try:
+                    strategy_enum = cls._convert_to_enum(strategy_type, strategy_str)
+                except ValueError as e:
+                    raise ValueError(f"""Unsupported {strategy}: {strategy_str}.
+                    The supported strategies are: {strategy_type.__members__.keys()}""") from e
+            setattr(config, strategy, strategy_enum)
+            logger.debug(f"Normalized {strategy}: {strategy_enum}")
+
+        return config
+
+    @classmethod
+    def _config_validation_and_normalization(cls, config: QueryProcessorConfig):
+        if config.policy is None:
+            raise ValueError("Policy is required for optimizer")
+
+        if config.cache:
+            raise ValueError("cache=True is not supported yet")
+
+        # only one of progress or verbose can be set; we will default to progress=True
+        if config.progress and config.verbose:
+            print("WARNING: Both `progress` and `verbose` are set to True, but only one can be True at a time; defaulting to `progress=True`")
+            config.verbose = False
+
+        # handle "auto" defaults for processing and sentinel execution strategies
+        if config.processing_strategy == "auto":
+            config.processing_strategy = "no_sentinel" if config.val_datasource is None else "sentinel"
+
+        if config.sentinel_execution_strategy == "auto":
+            config.sentinel_execution_strategy = None if config.val_datasource is None else "mab"
+
+        # convert the config values for processing, execution, and optimization strategies to enums
+        config = cls._normalize_strategies(config)
+
+        # check that processor uses a supported execution strategy
+        if config.execution_strategy not in config.processing_strategy.valid_execution_strategies():
+            raise ValueError(f"Unsupported `execution_strategy` {config.execution_strategy} for `processing_strategy` {config.processing_strategy}.")
+
+        # check that validation data is provided for sentinel execution
+        if config.val_datasource is None and config.processing_strategy.is_sentinel_strategy():
+            raise ValueError("`val_datasource` is required for SENTINEL processing strategies")
+
+        # check that sentinel execution is provided for sentinel processor
+        if config.sentinel_execution_strategy is None and config.processing_strategy.is_sentinel_strategy():
+            raise ValueError("`sentinel_execution_strategy` is required for SENTINEL processing strategies")
+
+        # get available models
+        available_models = getattr(config, 'available_models', [])
+        if available_models is None or len(available_models) == 0:
+            available_models = get_models(include_vision=True)
+        config.available_models = available_models
+
+        return config
+
+    @classmethod
+    def _create_optimizer(cls, config: QueryProcessorConfig) -> Optimizer:
+        return Optimizer(cost_model=CostModel(), **config.to_dict())
+
+    @classmethod
+    def _create_execution_strategy(cls, config: QueryProcessorConfig) -> ExecutionStrategy:
+        """
+        Creates an execution strategy based on the configuration.
+        """
+        execution_strategy_cls = config.execution_strategy.value
+        return execution_strategy_cls(**config.to_dict())
+
+    @classmethod
+    def _create_sentinel_execution_strategy(cls, config: QueryProcessorConfig) -> SentinelExecutionStrategy:
+        """
+        Creates an execution strategy based on the configuration.
+        """
+        if config.sentinel_execution_strategy is None:
+            return None
+
+        sentinel_execution_strategy_cls = config.sentinel_execution_strategy.value
+        return sentinel_execution_strategy_cls(**config.to_dict())
 
     @classmethod
     def create_processor(
@@ -83,91 +137,24 @@ class QueryProcessorFactory:
         if config is None:
             config = QueryProcessorConfig()
 
-        # apply any additional keyword arguments to the config
+        # apply any additional keyword arguments to the config and validate its contents
         config.update(**kwargs)
-
         config = cls._config_validation_and_normalization(config)
-        processing_strategy, execution_strategy, optimizer_strategy = cls._normalize_strategies(config)
-        optimizer = cls._create_optimizer(optimizer_strategy, config)
 
-        processor_key = (processing_strategy, execution_strategy)
-        processor_cls = cls.PROCESSOR_MAPPING.get(processor_key)
+        # create the optimizer, execution strateg(ies), and processor
+        optimizer = cls._create_optimizer(config)
+        config.execution_strategy = cls._create_execution_strategy(config)
+        config.sentinel_execution_strategy = cls._create_sentinel_execution_strategy(config)
+        processor_cls = config.processing_strategy.value
+        processor = processor_cls(dataset, optimizer, **config.to_dict())
 
-        if processor_cls is None:
-            raise ValueError(f"Unsupported combination of processing strategy {processing_strategy} "
-                             f"and execution strategy {execution_strategy}")
-
-        return processor_cls(dataset=dataset, optimizer=optimizer, config=config, **kwargs)
+        return processor
 
     @classmethod
     def create_and_run_processor(cls, dataset: Dataset, config: QueryProcessorConfig | None = None, **kwargs) -> DataRecordCollection:
         # TODO(Jun): Consider to use cache here.
+        logger.info(f"Creating processor for dataset: {dataset}")
         processor = cls.create_processor(dataset=dataset, config=config, **kwargs)
-        return processor.execute()
+        logger.info(f"Created processor: {processor}")
 
-    #TODO(Jun): The all avaliable plans could be generated earlier and outside Optimizer.
-    @classmethod
-    def _create_optimizer(cls, optimizer_strategy: OptimizationStrategyType, config: QueryProcessorConfig) -> Optimizer:
-        available_models = getattr(config, 'available_models', []) or get_models(include_vision=True)
-
-        if config.policy is None:
-            raise ValueError("Policy is required for optimizer")
-
-        return Optimizer(
-            policy=config.policy,
-            cost_model=CostModel(),
-            no_cache=config.nocache,
-            verbose=config.verbose,
-            available_models=available_models,
-            allow_bonded_query=config.allow_bonded_query,
-            allow_conventional_query=config.allow_conventional_query,
-            allow_code_synth=config.allow_code_synth,
-            allow_token_reduction=config.allow_token_reduction,
-            allow_rag_reduction=config.allow_rag_reduction,
-            allow_mixtures=config.allow_mixtures,
-            allow_critic=config.allow_critic,
-            optimization_strategy_type=optimizer_strategy,
-            use_final_op_quality=config.use_final_op_quality
-        )
-
-    @classmethod
-    def _normalize_strategies(cls, config: QueryProcessorConfig):
-        processing_strategy, execution_strategy, optimizer_strategy = config.processing_strategy, config.execution_strategy, config.optimizer_strategy
-
-        if isinstance(processing_strategy, str):
-            try:
-                processing_strategy = convert_to_enum(ProcessingStrategyType, processing_strategy)
-            except ValueError as e:
-                raise ValueError(f"""Unsupported processing strategy: {processing_strategy}.
-                The supported strategies are: {ProcessingStrategyType.__members__.keys()}""") from e
-        if isinstance(execution_strategy, str):
-            try:
-                execution_strategy = convert_to_enum(ExecutionStrategyType, execution_strategy)
-            except ValueError as e:
-                raise ValueError(f"""Unsupported execution strategy: {execution_strategy}.
-                The supported strategies are: {ExecutionStrategyType.__members__.keys()}""") from e
-        if isinstance(optimizer_strategy, str):
-            try:
-                optimizer_strategy = convert_to_enum(OptimizationStrategyType, optimizer_strategy)
-            except ValueError as e:
-                raise ValueError(f"""Unsupported optimizer strategy: {optimizer_strategy}.
-                The supported strategies are: {OptimizationStrategyType.__members__.keys()}""") from e
-        return processing_strategy, execution_strategy, optimizer_strategy
-
-    @classmethod
-    def _config_validation_and_normalization(cls, config: QueryProcessorConfig):
-        if config.policy is None:
-            raise ValueError("Policy is required for optimizer")
-
-        if not config.nocache:
-            raise ValueError("nocache=False is not supported yet")
-
-        if config.val_datasource is None and config.processing_strategy in [ProcessingStrategyType.MAB_SENTINEL, ProcessingStrategyType.RANDOM_SAMPLING]:
-            raise ValueError("val_datasource is required for MAB_SENTINEL and RANDOM_SAMPLING processing strategies")
-
-        available_models = getattr(config, 'available_models', [])
-        if available_models is None or len(available_models) == 0:
-            available_models = get_models(include_vision=True)
-        config.available_models = available_models
-
-        return config
+        return processor.execute()
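
The new factory path is: apply kwargs to the config, validate and normalize it (resolving "auto" defaults and converting strategy strings to enums), build the optimizer and execution strateg(ies), then instantiate the processor class stored on the `ProcessingStrategyType` enum value. A hedged usage sketch of the public entrypoint (the `MaxQuality` policy class is an assumption about palimpzest's policy module, and `dataset` is a palimpzest Dataset defined elsewhere; the `"no_sentinel"` string is taken from the "auto" default above):

    from palimpzest.policy import MaxQuality  # assumed concrete Policy
    from palimpzest.query.processor.config import QueryProcessorConfig
    from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory

    config = QueryProcessorConfig(
        policy=MaxQuality(),                # required; validation raises without a policy
        processing_strategy="no_sentinel",  # upper-cased and matched against ProcessingStrategyType
        progress=True,                      # progress and verbose cannot both be True
    )
    result = QueryProcessorFactory.create_and_run_processor(dataset, config=config)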

palimpzest/query/processor/sentinel_processor.py

@@ -0,0 +1,90 @@
+import logging
+
+from palimpzest.core.data.dataclasses import ExecutionStats, SentinelPlanStats
+from palimpzest.core.elements.records import DataRecordCollection
+from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
+from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
+from palimpzest.query.optimizer.plan import SentinelPlan
+from palimpzest.query.processor.query_processor import QueryProcessor
+
+logger = logging.getLogger(__name__)
+
+class SentinelQueryProcessor(QueryProcessor):
+
+    def _generate_sample_observations(self, sentinel_plan: SentinelPlan) -> SentinelPlanStats:
+        """
+        This function is responsible for generating sample observation data which can be
+        consumed by the CostModel.
+
+        To accomplish this, we construct a special sentinel plan using the Optimizer which is
+        capable of executing any valid physical implementation of a Filter or Convert operator
+        on each record.
+        """
+        # if we're using validation data, get the set of expected output records
+        expected_outputs = {}
+        for source_idx in range(len(self.val_datasource)):
+            expected_output = self.val_datasource[source_idx]
+            expected_outputs[source_idx] = expected_output
+
+        # execute sentinel plan; returns sentinel_plan_stats
+        return self.sentinel_execution_strategy.execute_sentinel_plan(sentinel_plan, expected_outputs)
+
+    def _create_sentinel_plan(self) -> SentinelPlan:
+        """
+        Generates and returns a SentinelPlan for the given dataset.
+        """
+        # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up
+
+        # create a new optimizer and update its strategy to SENTINEL
+        optimizer = self.optimizer.deepcopy_clean()
+        optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
+
+        # create copy of dataset, but change its data source to the validation data source
+        dataset = self.dataset.copy()
+        dataset._set_data_source(self.val_datasource)
+
+        # get the sentinel plan for the given dataset
+        sentinel_plans = optimizer.optimize(dataset)
+        sentinel_plan = sentinel_plans[0]
+
+        return sentinel_plan
+
+    def execute(self) -> DataRecordCollection:
+        # for now, enforce that we are using validation data; we can relax this after paper submission
+        if self.val_datasource is None:
+            raise Exception("Make sure you are using validation data with SentinelQueryProcessor")
+        logger.info(f"Executing {self.__class__.__name__}")
+
+        # create execution stats
+        execution_stats = ExecutionStats(execution_id=self.execution_id())
+        execution_stats.start()
+
+        # create sentinel plan
+        sentinel_plan = self._create_sentinel_plan()
+
+        # generate sample execution data
+        sentinel_plan_stats = self._generate_sample_observations(sentinel_plan)
+
+        # update the execution stats to account for the work done in optimization
+        execution_stats.add_plan_stats(sentinel_plan_stats)
+        execution_stats.finish_optimization()
+
+        # (re-)initialize the optimizer
+        optimizer = self.optimizer.deepcopy_clean()
+
+        # construct the CostModel with any sample execution data we've gathered
+        cost_model = SampleBasedCostModel(sentinel_plan_stats, self.verbose)
+        optimizer.update_cost_model(cost_model)
+
+        # execute plan(s) according to the optimization strategy
+        records, plan_stats = self._execute_best_plan(self.dataset, optimizer)
+
+        # update the execution stats to account for the work to execute the final plan
+        execution_stats.add_plan_stats(plan_stats)
+        execution_stats.finish()
+
+        # construct and return the DataRecordCollection
+        result = DataRecordCollection(records, execution_stats=execution_stats)
+        logger.info("Done executing SentinelQueryProcessor")
+
+        return result
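
`SentinelQueryProcessor.execute()` therefore splits the total work into an optimization phase (run the `SentinelPlan` on the validation data, fit a `SampleBasedCostModel`) and an execution phase (re-optimize with the fitted cost model, run the best plan). A sketch of opting into this path through the factory, using the strings the "auto" defaults resolve to (`"sentinel"`, `"mab"`); `val_reader` and the `MaxQuality` policy class are assumptions:

    from palimpzest.policy import MaxQuality  # assumed concrete Policy
    from palimpzest.query.processor.config import QueryProcessorConfig
    from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory

    config = QueryProcessorConfig(
        policy=MaxQuality(),                # assumed, as above
        val_datasource=val_reader,          # DataReader holding expected outputs per source_idx
        processing_strategy="sentinel",     # resolves to SentinelQueryProcessor
        sentinel_execution_strategy="mab",  # multi-armed-bandit sampling strategy
    )
    result = QueryProcessorFactory.create_and_run_processor(dataset, config=config)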