PyPI - palimpzest - Versions diffs - 1.0.0__tar.gz → 1.1.0__tar.gz - Mend

palimpzest 1.0.0.tar.gz → 1.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

{palimpzest-1.0.0/src/palimpzest.egg-info → palimpzest-1.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.0.0
+Version: 1.1.0
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org

{palimpzest-1.0.0 → palimpzest-1.1.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "1.0.0"
+version = "1.1.0"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.12"

{palimpzest-1.0.0 → palimpzest-1.1.0}/src/palimpzest/core/elements/groupbysig.py RENAMED Viewed

@@ -11,6 +11,11 @@ from palimpzest.core.lib.schemas import create_schema_from_fields
 # - construct the correct output schema using the input schema and the group by and aggregation fields
 # - remove/update all other references to GroupBySig in the codebase
+# TODO:
+# - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator
+# - construct the correct output schema using the input schema and the group by and aggregation fields
+# - remove/update all other references to GroupBySig in the codebase
 # signature for a group by aggregate that applies
 # group and aggregation to an input tuple
 class GroupBySig:

{palimpzest-1.0.0 → palimpzest-1.1.0}/src/palimpzest/core/models.py RENAMED Viewed

@@ -454,6 +454,12 @@ class BasePlanStats(BaseModel):
         """
         return sum([gen_stats.total_output_tokens for _, gen_stats in self.validation_gen_stats.items()])
+    def get_total_cost_so_far(self) -> float:
+        """
+        Get the total cost incurred so far in this plan execution.
+        """
+        return self.sum_op_costs() + self.sum_validation_costs()
 class PlanStats(BasePlanStats):
     """

{palimpzest-1.0.0 → palimpzest-1.1.0}/src/palimpzest/query/execution/execution_strategy.py RENAMED Viewed

@@ -82,10 +82,11 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
     """
     def __init__(
         self,
-        k: int,
-        j: int,
-        sample_budget: int,
         policy: Policy,
+        k: int = 6,
+        j: int = 4,
+        sample_budget: int = 100,
+        sample_cost_budget: float | None = None,
         priors: dict | None = None,
         use_final_op_quality: bool = False,
         seed: int = 42,
@@ -97,6 +98,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         self.k = k
         self.j = j
         self.sample_budget = sample_budget
+        self.sample_cost_budget = sample_cost_budget
         self.policy = policy
         self.priors = priors
         self.use_final_op_quality = use_final_op_quality

{palimpzest-1.0.0 → palimpzest-1.1.0}/src/palimpzest/query/execution/mab_execution_strategy.py RENAMED Viewed

@@ -680,6 +680,9 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         return max_quality_op
+    def _compute_termination_condition(self, samples_drawn: int, sampling_cost: float) -> bool:
+        return (samples_drawn >= self.sample_budget) if self.sample_cost_budget is None else (sampling_cost >= self.sample_cost_budget)
     def _execute_sentinel_plan(
             self,
             plan: SentinelPlan,
@@ -688,8 +691,8 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
             plan_stats: SentinelPlanStats,
         ) -> SentinelPlanStats:
         # sample records and operators and update the frontiers
-        samples_drawn = 0
-        while samples_drawn < self.sample_budget:
+        samples_drawn, sampling_cost = 0, 0.0
+        while not self._compute_termination_condition(samples_drawn, sampling_cost):
             # pre-compute the set of source indices which will need to be sampled
             source_indices_to_sample = set()
             for op_frontier in op_frontiers.values():
@@ -732,6 +735,9 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
                 }
                 source_indices_to_all_record_sets, val_gen_stats = self._score_quality(validator, source_indices_to_all_record_sets)
+                # update the progress manager with validation cost
+                self.progress_manager.incr_overall_progress_cost(val_gen_stats.cost_per_record)
                 # remove records that were read from the execution cache before adding to record op stats
                 new_record_op_stats = []
                 for _, record_set_tuples in source_indices_to_record_set_tuples.items():
@@ -742,6 +748,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
                 # update plan stats
                 plan_stats.add_record_op_stats(unique_logical_op_id, new_record_op_stats)
                 plan_stats.add_validation_gen_stats(unique_logical_op_id, val_gen_stats)
+                sampling_cost = plan_stats.get_total_cost_so_far()
                 # provide the best record sets as inputs to the next logical operator
                 next_unique_logical_op_id = plan.get_next_unique_logical_op_id(unique_logical_op_id)
@@ -813,7 +820,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
                 op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors)
         # initialize and start the progress manager
-        self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, progress=self.progress)
+        self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, sample_cost_budget=self.sample_cost_budget, progress=self.progress)
         self.progress_manager.start()
         # NOTE: we must handle progress manager outside of _execute_sentinel_plan to ensure that it is shut down correctly;

{palimpzest-1.0.0 → palimpzest-1.1.0}/src/palimpzest/query/generators/generators.py RENAMED Viewed

@@ -338,7 +338,7 @@ class Generator(Generic[ContextType, InputType]):
                     reasoning_effort = "minimal" if self.reasoning_effort is None else self.reasoning_effort
                     completion_kwargs = {"reasoning_effort": reasoning_effort, **completion_kwargs}
             if self.model.is_vllm_model():
-                completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key") **completion_kwargs}
+                completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key"), **completion_kwargs}
             completion = litellm.completion(model=self.model_name, messages=messages, **completion_kwargs)
             end_time = time.time()
             logger.debug(f"Generated completion in {end_time - start_time:.2f} seconds")

{palimpzest-1.0.0 → palimpzest-1.1.0}/src/palimpzest/query/operators/rag.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import time
+from typing import Any
 from numpy import dot
 from numpy.linalg import norm
@@ -153,8 +154,8 @@ class RAGConvert(LLMConvert):
             field = candidate.get_field_type(field_name)
             # skip this field if it is not a string or a list of strings
-            is_string_field = field.annotation in [str, str | None]
-            is_list_string_field = field.annotation in [list[str], list[str] | None]
+            is_string_field = field.annotation in [str, str | None, str | Any]
+            is_list_string_field = field.annotation in [list[str], list[str] | None, list[str] | Any]
             if not (is_string_field or is_list_string_field):
                 continue
@@ -358,8 +359,8 @@ class RAGFilter(LLMFilter):
             field = candidate.get_field_type(field_name)
             # skip this field if it is not a string or a list of strings
-            is_string_field = field.annotation in [str, str | None]
-            is_list_string_field = field.annotation in [list[str], list[str] | None]
+            is_string_field = field.annotation in [str, str | None, str | Any]
+            is_list_string_field = field.annotation in [list[str], list[str] | None, list[str] | Any]
             if not (is_string_field or is_list_string_field):
                 continue

{palimpzest-1.0.0 → palimpzest-1.1.0}/src/palimpzest/query/processor/config.py RENAMED Viewed

@@ -44,6 +44,7 @@ class QueryProcessorConfig(BaseModel):
     k: int = Field(default=6)
     j: int = Field(default=4)
     sample_budget: int = Field(default=100)
+    sample_cost_budget: float | None = Field(default=None)
     seed: int = Field(default=42)
     exp_name: str | None = Field(default=None)
     priors: dict | None = Field(default=None)

{palimpzest-1.0.0 → palimpzest-1.1.0}/src/palimpzest/utils/progress.py RENAMED Viewed

@@ -283,7 +283,7 @@ class PZProgressManager(ProgressManager):
         self.unique_full_op_id_to_stats[unique_full_op_id].memory_usage_mb = get_memory_usage()
 class PZSentinelProgressManager(ProgressManager):
-    def __init__(self, plan: SentinelPlan, sample_budget: int):
+    def __init__(self, plan: SentinelPlan, sample_budget: int | None, sample_cost_budget: float | None):
         # overall progress bar
         self.overall_progress = RichProgress(
             SpinnerColumn(),
@@ -298,7 +298,9 @@ class PZSentinelProgressManager(ProgressManager):
             refresh_per_second=10,
             expand=True,   # Use full width
         )
-        self.overall_task_id = self.overall_progress.add_task("", total=sample_budget, cost=0.0, recent="")
+        self.use_cost_budget = sample_cost_budget is not None
+        total = sample_cost_budget if self.use_cost_budget else sample_budget
+        self.overall_task_id = self.overall_progress.add_task("", total=total, cost=0.0, recent="")
         # logical operator progress bars
         self.op_progress = RichProgress(
@@ -334,6 +336,9 @@ class PZSentinelProgressManager(ProgressManager):
         # initialize start time
         self.start_time = None
+        # initialize validation cost
+        self.validation_cost = 0.0
         # add a task to the progress manager for each operator in the plan
         for topo_idx, (logical_op_id, op_set) in enumerate(plan):
             unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
@@ -387,15 +392,34 @@ class PZSentinelProgressManager(ProgressManager):
         # start progress bars
         self.live_display.start()
+    def incr_overall_progress_cost(self, cost_delta: float):
+        """Advance the overall progress bar by the given cost delta"""
+        self.validation_cost += cost_delta
+        self.overall_progress.update(
+            self.overall_task_id,
+            advance=cost_delta,
+            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
+            refresh=True,
+        )
+        # force the live display to refresh
+        self.live_display.refresh()
     def incr(self, unique_logical_op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
         # TODO: (above) organize progress bars into a Live / Table / Panel or something
         # get the task for the given operation
         task = self.unique_logical_op_id_to_task.get(unique_logical_op_id)
+        # store the cost before updating stats
+        previous_total_cost = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost
         # update statistics with any additional keyword arguments
         if kwargs != {}:
             self.update_stats(unique_logical_op_id, **kwargs)
+        # compute the cost delta
+        cost_delta = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost - previous_total_cost
         # update progress bar and recent text in one update
         if display_text is not None:
             self.unique_logical_op_id_to_stats[unique_logical_op_id].recent_text = display_text
@@ -414,10 +438,11 @@ class PZSentinelProgressManager(ProgressManager):
         )
         # advance the overall progress bar
+        advance = cost_delta if self.use_cost_budget else num_samples
         self.overall_progress.update(
             self.overall_task_id,
-            advance=num_samples,
-            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()),
+            advance=advance,
+            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
             refresh=True,
         )
@@ -451,6 +476,7 @@ def create_progress_manager(
     plan: PhysicalPlan | SentinelPlan,
     num_samples: int | None = None,
     sample_budget: int | None = None,
+    sample_cost_budget: float | None = None,
     progress: bool = True,
 ) -> ProgressManager:
     """Factory function to create appropriate progress manager based on environment"""
@@ -458,7 +484,7 @@ def create_progress_manager(
         return MockProgressManager(plan, num_samples)
     if isinstance(plan, SentinelPlan):
-        assert sample_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
-        return PZSentinelProgressManager(plan, sample_budget)
+        assert sample_budget is not None or sample_cost_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
+        return PZSentinelProgressManager(plan, sample_budget, sample_cost_budget)
     return PZProgressManager(plan, num_samples)

{palimpzest-1.0.0 → palimpzest-1.1.0/src/palimpzest.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.0.0
+Version: 1.1.0
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org