PyPI - palimpzest - Versions diffs - 0.8.1__tar.gz → 0.8.2__tar.gz - Mend

palimpzest 0.8.1.tar.gz → 0.8.2.tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (100) hide show

{palimpzest-0.8.1/src/palimpzest.egg-info → palimpzest-0.8.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.8.1
+Version: 0.8.2
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org

{palimpzest-0.8.1 → palimpzest-0.8.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "0.8.1"
+version = "0.8.2"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.8"

{palimpzest-0.8.1 → palimpzest-0.8.2}/src/palimpzest/core/data/dataset.py RENAMED Viewed

@@ -595,7 +595,7 @@ class Dataset:
         return QueryProcessorFactory.create_and_run_processor(self, config)
-    def optimize_and_run(self, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, config: QueryProcessorConfig | None = None, **kwargs):
+    def optimize_and_run(self, config: QueryProcessorConfig | None = None, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, **kwargs):
         """Optimize the PZ program using the train_dataset and validator before running the optimized plan."""
         # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
         from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory

{palimpzest-0.8.1 → palimpzest-0.8.2}/src/palimpzest/query/execution/mab_execution_strategy.py RENAMED Viewed

@@ -2,16 +2,19 @@
 import logging
 import numpy as np
+from chromadb.api.models.Collection import Collection
 from palimpzest.core.data.dataset import Dataset
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
-from palimpzest.core.models import OperatorStats, RecordOpStats, SentinelPlanStats
+from palimpzest.core.models import OperatorCostEstimates, OperatorStats, RecordOpStats, SentinelPlanStats
 from palimpzest.policy import Policy
 from palimpzest.query.execution.execution_strategy import SentinelExecutionStrategy
 from palimpzest.query.operators.aggregate import AggregateOp
-from palimpzest.query.operators.filter import FilterOp
+from palimpzest.query.operators.convert import LLMConvert
+from palimpzest.query.operators.filter import FilterOp, LLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import SentinelPlan
 from palimpzest.utils.progress import create_progress_manager
@@ -55,6 +58,17 @@ class OpFrontier:
         # store the prior beliefs on operator performance (if provided)
         self.priors = priors
+        # boolean indication of the type of operator in this OpFrontier
+        sample_op = op_set[0]
+        self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
+        self.is_filter_op = isinstance(sample_op, FilterOp)
+        self.is_aggregate_op = isinstance(sample_op, AggregateOp)
+        self.is_llm_join = isinstance(sample_op, JoinOp)
+        is_llm_convert = isinstance(sample_op, LLMConvert)
+        is_llm_filter = isinstance(sample_op, LLMFilter)
+        is_llm_retrieve = isinstance(sample_op, RetrieveOp) and isinstance(sample_op.index, Collection)
+        self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_retrieve or self.is_llm_join
         # get order in which we will sample physical operators for this logical operator
         sample_op_indices = self._get_op_index_order(op_set, seed)
@@ -68,13 +82,6 @@ class OpFrontier:
         self.full_op_id_to_sources_not_processed = {op.get_full_op_id(): source_indices for op in op_set}
         self.max_inputs = len(source_indices)
-        # boolean indication of the type of operator in this OpFrontier
-        sample_op = op_set[0]
-        self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
-        self.is_filter_op = isinstance(sample_op, FilterOp)
-        self.is_aggregate_op = isinstance(sample_op, AggregateOp)
-        self.is_llm_join = isinstance(sample_op, JoinOp)
         # set the initial inputs for this logical operator; we maintain a mapping from source_unique_logical_op_id --> source_indices --> input;
         # for each unique source and (tuple of) source indices, we store its output, which is an input to this operator
         # for scan operators, we use the default name "source" since these operators have no source
@@ -149,16 +156,44 @@ class OpFrontier:
         return op_id_to_pareto_distance
+    def _compute_naive_priors(self, op_set: list[PhysicalOperator]) -> dict[str, dict[str, float]]:
+        naive_priors = {}
+        for op in op_set:
+            # use naive cost estimates with dummy source estimates to compute priors
+            source_op_estimates = OperatorCostEstimates(quality=1.0, cost_per_record=0.0, time_per_record=0.0, cardinality=100)
+            op_estimates = (
+                op.naive_cost_estimates(source_op_estimates, source_op_estimates)
+                if self.is_llm_join
+                else op.naive_cost_estimates(source_op_estimates)
+            )
+            # get op_id for this operator
+            op_id = op.get_op_id()
+            # set the naive quality, cost, and time priors for this operator
+            naive_priors[op_id] = {
+                "quality": op_estimates.quality,
+                "cost": op_estimates.cost_per_record,
+                "time": op_estimates.time_per_record,
+            }
+        return naive_priors
     def _get_op_index_order(self, op_set: list[PhysicalOperator], seed: int) -> list[int]:
         """
         Returns a list of indices for the operators in the op_set.
         """
-        if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
+        # if this is not an llm-operator, we simply return the indices in random order
+        if not self.is_llm_op:
             rng = np.random.default_rng(seed=seed)
             op_indices = np.arange(len(op_set))
             rng.shuffle(op_indices)
             return op_indices
+        # if this is an llm-operator, but we do not have priors, we first compute naive priors
+        if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
+            self.priors = self._compute_naive_priors(op_set)
         # NOTE: self.priors is a dictionary with format:
         # {op_id: {"quality": quality, "cost": cost, "time": time}}
@@ -215,7 +250,7 @@ class OpFrontier:
         op_source_indices_pairs = []
         # if this operator is not being optimized: we don't request inputs, but simply process what we are given / told to (in the case of scans)
-        if not self.is_llm_join and len(self.frontier_ops) == 1:
+        if not self.is_llm_op and len(self.frontier_ops) == 1:
             return [(self.frontier_ops[0], None)]
         # otherwise, sample (operator, source_indices) pairs
@@ -255,16 +290,6 @@ class OpFrontier:
                     all_inputs.extend(inputs)
             return [(op, tuple(), all_inputs)]
-        # if this is an un-optimized (non-scan, non-join) operator, flatten inputs and run on each one
-        elif not self.is_scan_op and not self.is_llm_join and len(self.frontier_ops) == 1:
-            op_inputs = []
-            op = self.frontier_ops[0]
-            for _, source_indices_to_inputs in self.source_indices_to_inputs.items():
-                for source_indices, inputs in source_indices_to_inputs.items():
-                    for input in inputs:
-                        op_inputs.append((op, source_indices, input))
-            return op_inputs
         ### for optimized operators
         # get the list of (op, source_indices) pairs which this operator needs to execute
         op_source_indices_pairs = self._get_op_source_indices_pairs()

{palimpzest-0.8.1 → palimpzest-0.8.2}/src/palimpzest/query/processor/config.py RENAMED Viewed

@@ -40,8 +40,8 @@ class QueryProcessorConfig(BaseModel):
     use_final_op_quality: bool = Field(default=False)
     # sentinel optimization flags
-    k: int = Field(default=5)
-    j: int = Field(default=5)
+    k: int = Field(default=6)
+    j: int = Field(default=4)
     sample_budget: int = Field(default=100)
     seed: int = Field(default=42)
     exp_name: str | None = Field(default=None)

{palimpzest-0.8.1 → palimpzest-0.8.2}/src/palimpzest/query/processor/query_processor.py RENAMED Viewed

@@ -114,8 +114,8 @@ class QueryProcessor:
         execution_stats = ExecutionStats(execution_id=self.execution_id())
         execution_stats.start()
-        # if the user provides a train_dataset or validator, we perform optimization
-        if self.train_dataset is not None or self.validator is not None:
+        # if the user provides a validator, we perform optimization
+        if self.validator is not None:
             # create sentinel plan
             sentinel_plan = self._create_sentinel_plan(self.train_dataset)

{palimpzest-0.8.1 → palimpzest-0.8.2}/src/palimpzest/query/processor/query_processor_factory.py RENAMED Viewed

@@ -62,13 +62,17 @@ class QueryProcessorFactory:
             print("WARNING: Both `progress` and `verbose` are set to True, but only one can be True at a time; defaulting to `progress=True`")
             config.verbose = False
+        # if the user provides a training dataset, but no validator, create a default validator
+        if train_dataset is not None and validator is None:
+            validator = Validator()
+            logger.info("No validator provided; using default Validator")
         # boolean flag for whether we're performing optimization or not
-        optimization = train_dataset is not None or validator is not None
-        val_based_opt = train_dataset is None and validator is not None
+        optimization = validator is not None
         # handle "auto" default for sentinel execution strategies
         if config.sentinel_execution_strategy == "auto":
-            config.sentinel_execution_strategy = ("validator" if val_based_opt else "mab") if optimization else None
+            config.sentinel_execution_strategy = "mab" if optimization else None
         # convert the config values for processing, execution, and optimization strategies to enums
         config = cls._normalize_strategies(config)
@@ -87,7 +91,7 @@ class QueryProcessorFactory:
         # set the final set of available models in the config
         config.available_models = available_models
-        return config
+        return config, validator
     @classmethod
     def _create_optimizer(cls, config: QueryProcessorConfig) -> Optimizer:
@@ -143,7 +147,7 @@ class QueryProcessorFactory:
             config = QueryProcessorConfig()
         # apply any additional keyword arguments to the config and validate its contents
-        config = cls._config_validation_and_normalization(config, train_dataset, validator)
+        config, validator = cls._config_validation_and_normalization(config, train_dataset, validator)
         # create the optimizer, execution strateg(ies), and processor
         optimizer = cls._create_optimizer(config)

{palimpzest-0.8.1 → palimpzest-0.8.2/src/palimpzest.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.8.1
+Version: 0.8.2
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org