palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
- palimpzest-0.7.1.dist-info/RECORD +96 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.4.dist-info/RECORD +0 -87
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
palimpzest/utils/progress.py CHANGED

```diff
@@ -2,9 +2,13 @@ import time
 from abc import ABC, abstractmethod
 from dataclasses import dataclass

+from chromadb.api.models.Collection import Collection
 from rich.console import Console
+from rich.live import Live
+from rich.panel import Panel
 from rich.progress import (
     BarColumn,
+    MofNCompleteColumn,
     SpinnerColumn,
     TaskProgressColumn,
     TextColumn,
@@ -12,13 +16,16 @@ from rich.progress import (
     TimeRemainingColumn,
 )
 from rich.progress import Progress as RichProgress
+from rich.table import Table
+
+from palimpzest.query.operators.aggregate import AggregateOp
+from palimpzest.query.operators.convert import LLMConvert
+from palimpzest.query.operators.filter import LLMFilter
+from palimpzest.query.operators.limit import LimitScanOp
+from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.operators.retrieve import RetrieveOp
+from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan

-try:
-    import ipywidgets as widgets
-    from IPython.display import display
-    JUPYTER_AVAILABLE = True
-except ImportError:
-    JUPYTER_AVAILABLE = False

 @dataclass
 class ProgressStats:
@@ -31,13 +38,6 @@ class ProgressStats:
     memory_usage_mb: float = 0.0
     recent_text: str = ""

-def in_jupyter_notebook():
-    try:
-        from IPython import get_ipython
-        return 'IPKernelApp' in get_ipython().config
-    except Exception:
-        return False
-
 def get_memory_usage() -> float:
     """Get current memory usage in MB"""
     try:
@@ -47,179 +47,395 @@ def get_memory_usage() -> float:
     except Exception:
         return 0.0

+# NOTE: right now we only need to support single plan execution; in a multi-plan setting, we will
+# need to modify the semantics of the progress manager to support multiple plans
 class ProgressManager(ABC):
-    """Abstract base class for progress managers"""
-
-    def __init__(self):
-
-
+    """Abstract base class for progress managers for plan execution"""
+
+    def __init__(self, plan: PhysicalPlan | SentinelPlan, num_samples: int | None = None):
+        """
+        Initialize the progress manager for the given plan. This function takes in a plan,
+        the number of samples to process (if specified).
+
+        If `num_samples` is None, then the entire DataReader will be scanned.
+
+        For each operator which is not an `AggregateOp` or `LimitScanOp`, we set its task `total`
+        to the number of inputs to be processed by the plan. As intermediate operators process
+        their inputs, the ProgressManager will update the `total` for their downstream operators.
+        """
+        # initialize progress object
+        self.progress = RichProgress(
+            SpinnerColumn(),
+            TextColumn("[bold blue]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            MofNCompleteColumn(),
+            TimeElapsedColumn(),
+            TimeRemainingColumn(),
+            #TextColumn("[green]Success: {task.fields[success]}"),
+            #TextColumn("[red]Failed: {task.fields[failed]}"),
+            #TextColumn("[cyan]Mem: {task.fields[memory]:.1f}MB"),
+            TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
+            TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
+            refresh_per_second=10,
+            expand=True, # Use full width
+        )
+
+        # initialize mapping from op_id --> ProgressStats
+        self.op_id_to_stats: dict[str, ProgressStats] = {}
+
+        # initialize mapping from op_id --> task
+        self.op_id_to_task = {}
+
+        # initialize start time
+        self.start_time = None
+
+        # create mapping from op_id --> next_op
+        self.op_id_to_next_op: dict[str, PhysicalOperator] = {}
+        for op_idx, op in enumerate(plan.operators):
+            op_id = op.get_op_id()
+            next_op = plan.operators[op_idx + 1] if op_idx + 1 < len(plan.operators) else None
+            self.op_id_to_next_op[op_id] = next_op
+
+        # compute the total number of inputs to be processed by the plan
+        datareader_len = len(plan.operators[0].datareader)
+        total = datareader_len if num_samples is None else min(num_samples, datareader_len)
+
+        # add a task to the progress manager for each operator in the plan
+        for op in plan.operators:
+            # get the op id and a short string representation of the op; (str(op) is too long)
+            op_id = op.get_op_id()
+            op_str = f"{op.op_name()} ({op_id})"
+
+            # update the `total` if we encounter an AggregateOp or LimitScanOp
+            if isinstance(op, AggregateOp):
+                total = 1
+            elif isinstance(op, LimitScanOp):
+                total = op.limit
+
+            self.add_task(op_id, op_str, total)
+
+    def get_task_total(self, op_id: str) -> int:
+        """Return the current total value for the given task."""
+        task = self.op_id_to_task[op_id]
+        return self.progress._tasks[task].total
+
+    def get_task_description(self, op_id: str) -> str:
+        """Return the current description for the given task."""
+        task = self.op_id_to_task[op_id]
+        return self.progress._tasks[task].description
+
     @abstractmethod
-    def 
-        """Initialize progress tracking with total items"""
+    def add_task(self, op_id: str, op_str: str, total: int):
+        """Initialize progress tracking for operator execution with total items"""
         pass
-
+
     @abstractmethod
-    def 
-        """
+    def start(self):
+        """Start the progress bar(s)"""
         pass
-
+
+    @abstractmethod
+    def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
+        """
+        Advance the progress bar for the given operator by one. Modify the downstream operators'
+        progress bar `total` to reflect the number of outputs produced by this operator.
+
+        NOTE: The semantics of this function are that every time it is executed we advance the
+        progress bar by 1. This is because the progress bar represents what fraction of the inputs
+        have been processed by the operator. `num_outputs` specifies how many outputs were generated
+        by the operator when processing the input for which `incr()` was called. E.g. a filter which
+        filters an input record will advance its progress bar by 1, but the next operator will now
+        have 1 fewer inputs to process. Alternatively, a convert which generates 3 `num_outputs` will
+        increase the inputs for the next operator by `delta = num_outputs - 1 = 2`.
+        """
+        pass
+
     @abstractmethod
     def finish(self):
         """Clean up and finalize progress tracking"""
         pass

-    def update_stats(self, **kwargs):
+    def update_stats(self, op_id: str, **kwargs):
         """Update progress statistics"""
         for key, value in kwargs.items():
-            if hasattr(self.
-
-
+            if hasattr(self.op_id_to_stats[op_id], key):
+                if key != "total_cost":
+                    setattr(self.op_id_to_stats[op_id], key, value)
+                else:
+                    self.op_id_to_stats[op_id].total_cost += value
+        self.op_id_to_stats[op_id].memory_usage_mb = get_memory_usage()
+
+
+class MockProgressManager(ProgressManager):
+    """Mock progress manager for testing purposes"""
+
+    def __init__(self, plan: PhysicalPlan | SentinelPlan, num_samples: int | None = None):
+        pass
+
+    def add_task(self, op_id: str, op_str: str, total: int):
+        pass
+
+    def start(self):
+        pass

-
+    def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
+        pass
+
+    def finish(self):
+        pass
+
+class PZProgressManager(ProgressManager):
     """Progress manager for command line interface using rich"""

-    def __init__(self):
-        super().__init__()
+    def __init__(self, plan: PhysicalPlan, num_samples: int | None = None):
+        super().__init__(plan, num_samples)
         self.console = Console()
-
-
-
+
+    def add_task(self, op_id: str, op_str: str, total: int):
+        """Add a new task to the progress bar"""
+        task = self.progress.add_task(
+            f"[blue]{op_str}",
+            total=total,
+            cost=0.0,
+            success=0,
+            failed=0,
+            memory=0.0,
+            recent="",
+        )
+
+        # store the mapping of operator ID to task ID
+        self.op_id_to_task[op_id] = task
+
+        # initialize the stats for this operation
+        self.op_id_to_stats[op_id] = ProgressStats(start_time=time.time())
+
+    def start(self):
+        # print a newline before starting to separate from previous output
+        print()
+
+        # set start time
+        self.start_time = time.time()
+
+        # start progress bar
+        self.progress.start()
+
+    def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
+        # get the task for the given operation
+        task = self.op_id_to_task.get(op_id)
+
+        # update statistics with any additional keyword arguments
+        if kwargs != {}:
+            self.update_stats(op_id, **kwargs)
+
+        # update progress bar and recent text in one update
+        if display_text is not None:
+            self.op_id_to_stats[op_id].recent_text = display_text
+
+        # if num_outputs is not 1, update the downstream operators' progress bar total for any
+        # operator which is not an AggregateOp or LimitScanOp
+        delta = num_outputs - 1
+        if delta != 0:
+            next_op = self.op_id_to_next_op[op_id]
+            while next_op is not None:
+                if not isinstance(next_op, (AggregateOp, LimitScanOp)):
+                    next_op_id = next_op.get_op_id()
+                    next_task = self.op_id_to_task[next_op_id]
+                    self.progress.update(next_task, total=self.get_task_total(next_op_id) + delta)
+
+                next_op = self.op_id_to_next_op[next_op_id]
+
+        # advance the progress bar for this task
+        self.progress.update(
+            task,
+            advance=1,
+            description=f"[bold blue]{self.get_task_description(op_id)}",
+            cost=self.op_id_to_stats[op_id].total_cost,
+            success=self.op_id_to_stats[op_id].success_count,
+            failed=self.op_id_to_stats[op_id].failure_count,
+            memory=get_memory_usage(),
+            recent=f"{self.op_id_to_stats[op_id].recent_text}" if display_text is not None else "",
+            refresh=True,
+        )
+
+    def finish(self):
+        self.progress.stop()
+
+        # compute total cost, success, and failure
+        total_cost = sum(stats.total_cost for stats in self.op_id_to_stats.values())
+        # success_count = sum(stats.success_count for stats in self.op_id_to_stats.values())
+        # failure_count = sum(stats.failure_count for stats in self.op_id_to_stats.values())
+
+        # Print final stats on new lines after progress display
+        print(f"Total time: {time.time() - self.start_time:.2f}s")
+        print(f"Total cost: ${total_cost:.4f}")
+        # print(f"Success rate: {success_count}/{success_count + failure_count}")
+
+class PZSentinelProgressManager(ProgressManager):
+    def __init__(self, plan: SentinelPlan, sample_budget: int):
+        # overall progress bar
+        self.overall_progress = RichProgress(
             SpinnerColumn(),
-            TextColumn("
+            TextColumn("{task.description}"), # TODO: fixed string?
             BarColumn(),
             TaskProgressColumn(),
+            MofNCompleteColumn(),
             TimeElapsedColumn(),
             TimeRemainingColumn(),
-
-            #TextColumn("[green]Success: {task.fields[success]}"),
-            #TextColumn("[red]Failed: {task.fields[failed]}"),
-            TextColumn("[cyan]Mem: {task.fields[memory]:.1f}MB"),
+            TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
             TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
             refresh_per_second=10,
             expand=True, # Use full width
         )
-        self.
-
-
-
-
-
-
-
+        self.overall_task_id = self.overall_progress.add_task("", total=sample_budget, cost=0.0, recent="")
+
+        # logical operator progress bars
+        self.op_progress = RichProgress(
+            SpinnerColumn(),
+            "{task.description}",
+            BarColumn(),
+            TaskProgressColumn(),
+            MofNCompleteColumn(),
+            TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
+            TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
+            refresh_per_second=10,
+            expand=True, # Use full width
+        )
+
+        # organize progress bars into nice display
+        self.progress_table = Table.grid()
+        self.progress_table.add_row(
+            Panel.fit(self.op_progress, title="[b]Sample Allocation", border_style="red", padding=(1, 2)),
+        )
+        self.progress_table.add_row(
+            Panel.fit(
+                self.overall_progress, title="Optimization Progress", border_style="green", padding=(2, 2)
+            )
+        )
+        self.live_display = Live(self.progress_table, refresh_per_second=10)
+
+        # initialize mapping from op_id --> ProgressStats
+        self.op_id_to_stats: dict[str, ProgressStats] = {}
+
+        # initialize mapping from op_id --> task
+        self.op_id_to_task = {}
+
+        # initialize start time
+        self.start_time = None
+
+        # add a task to the progress manager for each operator in the plan
+        for logical_op_id, op_set in plan:
+            physical_op = op_set[0]
+            is_llm_convert = isinstance(physical_op, LLMConvert)
+            is_llm_filter = isinstance(physical_op, LLMFilter)
+            op_name = "LLMConvert" if is_llm_convert else "LLMFilter" if is_llm_filter else physical_op.op_name()
+            op_str = f"{op_name} ({logical_op_id})"
+            total = sample_budget if self._is_llm_op(op_set[0]) else 0
+            self.add_task(logical_op_id, op_str, total)
+
+        self.console = Console()
+
+    def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
+        is_llm_convert = isinstance(physical_op, LLMConvert)
+        is_llm_filter = isinstance(physical_op, LLMFilter)
+        is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
+        return is_llm_convert or is_llm_filter or is_llm_retrieve
+
+    def get_task_description(self, op_id: str) -> str:
+        """Return the current description for the given task."""
+        task = self.op_id_to_task[op_id]
+        return self.op_progress._tasks[task].description
+
+    def add_task(self, op_id: str, op_str: str, total: int):
+        """Add a new task to the op progress bars"""
+        task = self.op_progress.add_task(
+            f"[blue]{op_str}",
             total=total,
             cost=0.0,
             success=0,
             failed=0,
             memory=0.0,
-            recent=""
+            recent="",
         )
-
-        # Start progress bar
-        self.progress.start()
-
-    def update(self, current: int, sample: str | None = None, **kwargs):
-        self.update_stats(**kwargs)
-
-        # Update progress bar and recent text in one update
-        if sample:
-            self.stats.recent_text = sample
-
-        self.progress.update(
-            self.task_id,
-            completed=current,
-            description=f"[bold blue]{self.stats.current_operation}",
-            cost=self.stats.total_cost,
-            success=self.stats.success_count,
-            failed=self.stats.failure_count,
-            memory=self.stats.memory_usage_mb,
-            recent=f"Recent: {self.stats.recent_text}"
-        )
-
-    def finish(self):
-        self.progress.stop()
-
-        # Print final stats on new lines after progress display
-        print(f"Total time: {time.time() - self.stats.start_time:.2f}s")
-        #print(f"Total cost: ${self.stats.total_cost:.4f}")
-        print(f"Success rate: {self.stats.success_count}/{self.stats.success_count + self.stats.failure_count}")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-
-
-
+        # store the mapping of operator ID to task ID
+        self.op_id_to_task[op_id] = task
+
+        # initialize the stats for this operation
+        self.op_id_to_stats[op_id] = ProgressStats(start_time=time.time())
+
+    def start(self):
+        # print a newline before starting to separate from previous output
+        print()
+
+        # set start time
+        self.start_time = time.time()
+
+        # start progress bars
+        self.live_display.start()
+
+    def incr(self, op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
+        # TODO: (above) organize progress bars into a Live / Table / Panel or something
+        # get the task for the given operation
+        task = self.op_id_to_task.get(op_id)
+
+        # update statistics with any additional keyword arguments
+        if kwargs != {}:
+            self.update_stats(op_id, **kwargs)
+
+        # update progress bar and recent text in one update
+        if display_text is not None:
+            self.op_id_to_stats[op_id].recent_text = display_text
+
+        # advance the op progress bar for this op_id
+        self.op_progress.update(
+            task,
+            advance=num_samples,
+            description=f"[bold blue]{self.get_task_description(op_id)}",
+            cost=self.op_id_to_stats[op_id].total_cost,
+            success=self.op_id_to_stats[op_id].success_count,
+            failed=self.op_id_to_stats[op_id].failure_count,
+            memory=get_memory_usage(),
+            recent=f"{self.op_id_to_stats[op_id].recent_text}" if display_text is not None else "",
+            refresh=True,
         )
-
-
-
+
+        # advance the overall progress bar
+        self.overall_progress.update(
+            self.overall_task_id,
+            advance=num_samples,
+            cost=sum(stats.total_cost for _, stats in self.op_id_to_stats.items()),
+            refresh=True,
         )
-
-
-
-
-            self.recent_html
-        ])
-
-    def start(self, total: int):
-        self.progress_bar.max = total
-        display(self.container)
-
-    def update(self, current: int, sample: str | None = None, **kwargs):
-        self.update_stats(**kwargs)
-        self.progress_bar.value = current
-
-        # Update stats display
-        # Total Cost: ${self.stats.total_cost:.4f}
-        # Success/Total: {self.stats.success_count}/{self.stats.success_count + self.stats.failure_count}
-
-
-        stats_text = f"""
-        <pre>
-        Operation: {self.stats.current_operation}
-        Time Elapsed: {time.time() - self.stats.start_time:.1f}s
-        Memory Usage: {self.stats.memory_usage_mb:.1f}MB
-        </pre>
-        """
-        self.stats_html.value = stats_text
-
-        # Update recent text
-        if sample:
-            self.stats.recent_text = sample
-            self.recent_html.value = f"<pre>Recent: {self.stats.recent_text}</pre>"
-
+
+        # force the live display to refresh
+        self.live_display.refresh()
+
     def finish(self):
-        self.
-
-        #
-
-
-
-        Completed!
-        Total Time: {time.time() - self.stats.start_time:.1f}s
-        Peak Memory Usage: {self.stats.memory_usage_mb:.1f}MB
-        </pre>
-        """
-        self.stats_html.value = stats_text
-        self.recent_html.value = "<pre>Completed</pre>"
+        self.live_display.stop()
+
+        # compute total cost, success, and failure
+        total_cost = sum(stats.total_cost for stats in self.op_id_to_stats.values())
+        # success_count = sum(stats.success_count for stats in self.op_id_to_stats.values())
+        # failure_count = sum(stats.failure_count for stats in self.op_id_to_stats.values())

-
+        # Print final stats on new lines after progress display
+        print(f"Total opt. time: {time.time() - self.start_time:.2f}s")
+        print(f"Total opt. cost: ${total_cost:.4f}")
+        # print(f"Success rate: {success_count}/{success_count + failure_count}")
+
+def create_progress_manager(
+    plan: PhysicalPlan | SentinelPlan,
+    num_samples: int | None = None,
+    sample_budget: int | None = None,
+    progress: bool = True,
+) -> ProgressManager:
     """Factory function to create appropriate progress manager based on environment"""
-    if
-
-
-
-
-
+    if not progress:
+        return MockProgressManager(plan, num_samples)
+
+    if isinstance(plan, SentinelPlan):
+        assert sample_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
+        return PZSentinelProgressManager(plan, sample_budget)
+
+    return PZProgressManager(plan, num_samples)
```

(Several removed 0.6.4 lines above were truncated by the diff viewer and are reproduced as-is.)
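For a quick feel of the new `incr()` bookkeeping without reading the whole diff, the sketch below mirrors the semantics documented in its docstring: advance the current operator's bar by one per processed input, and shift every downstream operator's `total` by `delta = num_outputs - 1`. This is an illustrative stand-in built directly on `rich`, not Palimpzest's `ProgressManager`; the operator names and the "filter drops every other record" behavior are invented for the demo.

```python
# Illustrative only: a stripped-down mirror of the incr() bookkeeping described in the
# progress.py docstring. This is NOT Palimpzest's ProgressManager.
import time

from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn


class MiniPlanProgress:
    def __init__(self, op_names: list[str], num_inputs: int):
        self.progress = Progress(TextColumn("{task.description}"), BarColumn(), MofNCompleteColumn())
        self.order = op_names
        # every operator starts with the same expected number of inputs
        self.totals = {name: num_inputs for name in op_names}
        self.tasks = {name: self.progress.add_task(name, total=num_inputs) for name in op_names}

    def incr(self, op_name: str, num_outputs: int = 1) -> None:
        # advance this operator's bar by exactly one processed input
        self.progress.update(self.tasks[op_name], advance=1)
        # shift downstream totals: a filtered-out record (num_outputs=0) removes one
        # future input, a one-to-many convert (num_outputs>1) adds inputs
        delta = num_outputs - 1
        if delta != 0:
            for downstream in self.order[self.order.index(op_name) + 1:]:
                self.totals[downstream] += delta
                self.progress.update(self.tasks[downstream], total=self.totals[downstream])


if __name__ == "__main__":
    pm = MiniPlanProgress(["scan", "filter", "convert"], num_inputs=4)
    with pm.progress:  # starts and stops the rich display
        for i in range(4):
            pm.incr("scan")                       # scan always emits one record
            pm.incr("filter", num_outputs=i % 2)  # filter drops every other record
            time.sleep(0.2)
```

In the real class the same adjustment walks `op_id_to_next_op` and skips `AggregateOp` and `LimitScanOp` tasks, whose totals stay pinned at 1 and `op.limit` respectively.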
palimpzest/utils/token_reduction_helpers.py CHANGED

```diff
@@ -1,5 +1,3 @@
-from typing import List
-
 from fuzzywuzzy import fuzz, process


@@ -89,7 +87,7 @@ def get_range_from_hist(file_path, range_budget, resolution=0.001, trim_zeros=Tr
     return start * 1.0 / index_range, end * 1.0 / index_range


-def best_substring_match(query: str, context: str |
+def best_substring_match(query: str, context: str | list[str]):
     # This will extract all substrings of length equal to the query from the string
     candidates = [context[i : i + len(query)] for i in range(len(context) - len(query) + 1)]

```
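Only the candidate-generation line of `best_substring_match` appears in this hunk. The sketch below illustrates the same sliding-window idea with `fuzzywuzzy`; the `fuzz.ratio` scoring and the empty-context guard are assumptions for the illustration, not the package's implementation.

```python
# Sketch of the sliding-window fuzzy match idea behind best_substring_match.
# The scoring step is an assumption; the hunk only shows candidate generation.
from fuzzywuzzy import fuzz


def fuzzy_substring_match(query: str, context: str) -> tuple[str, int]:
    # all windows of the context with the same length as the query
    candidates = [context[i : i + len(query)] for i in range(len(context) - len(query) + 1)]
    if not candidates:
        return "", 0
    # score each window against the query and keep the best one
    best = max(candidates, key=lambda cand: fuzz.ratio(query, cand))
    return best, fuzz.ratio(query, best)


print(fuzzy_substring_match("palimpsest", "The Palimpzest system optimizes AI queries."))
```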
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA CHANGED

````diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.
+Version: 0.7.1
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -16,6 +16,7 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: charset-normalizer>=3.3.2
+Requires-Dist: chromadb>=0.6.3
 Requires-Dist: click>=8.1.7
 Requires-Dist: click-aliases>=1.0.4
 Requires-Dist: colorama>=0.4.6
@@ -54,19 +55,17 @@ Requires-Dist: pypdf>=5.1.0
 Requires-Dist: pytest-mock>=3.14.0
 Requires-Dist: python-Levenshtein>=0.25.1
 Requires-Dist: pyyaml>=6.0.1
+Requires-Dist: ragatouille>=0.0.9
 Requires-Dist: requests>=2.25
-Requires-Dist: requests-html>=0.10.0
 Requires-Dist: ruff>=0.9.0
-Requires-Dist: scikit-learn>=1.5.2
-Requires-Dist: scipy>=1.9.0
 Requires-Dist: setuptools>=70.1.1
 Requires-Dist: tabulate>=0.9.0
-Requires-Dist: tenacity>=8.2.3
 Requires-Dist: together>=1.3.1
 Requires-Dist: tqdm~=4.66.1
-Requires-Dist: transformers
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: transformers<4.50.0,>=4.41.3
+Requires-Dist: rich[jupyter]>=13.9.2
+Requires-Dist: voyager>=2.0.9
+Dynamic: license-file



@@ -163,3 +162,14 @@ Now you can run the simple test program with:
 ```bash
 $ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
 ```
+
+### Citation
+If you would like to cite our work, please use the following citation:
+```
+@inproceedings{palimpzestCIDR,
+  title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
+  author={Liu, Chunwei and Russo, Matthew and Cafarella, Michael and Cao, Lei and Chen, Peter Baile and Chen, Zui and Franklin, Michael and Kraska, Tim and Madden, Samuel and Shahout, Rana and Vitagliano, Gerardo},
+  booktitle = {Proceedings of the {{Conference}} on {{Innovative Database Research}} ({{CIDR}})},
+  date = 2025,
+}
+```
````

(Several removed 0.6.4 metadata lines were truncated by the diff viewer and are reproduced as-is.)