palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.1.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,13 @@ import time
2
2
  from abc import ABC, abstractmethod
3
3
  from dataclasses import dataclass
4
4
 
5
+ from chromadb.api.models.Collection import Collection
5
6
  from rich.console import Console
7
+ from rich.live import Live
8
+ from rich.panel import Panel
6
9
  from rich.progress import (
7
10
  BarColumn,
11
+ MofNCompleteColumn,
8
12
  SpinnerColumn,
9
13
  TaskProgressColumn,
10
14
  TextColumn,
@@ -12,13 +16,16 @@ from rich.progress import (
12
16
  TimeRemainingColumn,
13
17
  )
14
18
  from rich.progress import Progress as RichProgress
19
+ from rich.table import Table
20
+
21
+ from palimpzest.query.operators.aggregate import AggregateOp
22
+ from palimpzest.query.operators.convert import LLMConvert
23
+ from palimpzest.query.operators.filter import LLMFilter
24
+ from palimpzest.query.operators.limit import LimitScanOp
25
+ from palimpzest.query.operators.physical import PhysicalOperator
26
+ from palimpzest.query.operators.retrieve import RetrieveOp
27
+ from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
15
28
 
16
- try:
17
- import ipywidgets as widgets
18
- from IPython.display import display
19
- JUPYTER_AVAILABLE = True
20
- except ImportError:
21
- JUPYTER_AVAILABLE = False
22
29
 
23
30
  @dataclass
24
31
  class ProgressStats:
@@ -31,13 +38,6 @@ class ProgressStats:
31
38
  memory_usage_mb: float = 0.0
32
39
  recent_text: str = ""
33
40
 
34
- def in_jupyter_notebook():
35
- try:
36
- from IPython import get_ipython
37
- return 'IPKernelApp' in get_ipython().config
38
- except Exception:
39
- return False
40
-
41
41
  def get_memory_usage() -> float:
42
42
  """Get current memory usage in MB"""
43
43
  try:
@@ -47,179 +47,395 @@ def get_memory_usage() -> float:
47
47
  except Exception:
48
48
  return 0.0
49
49
 
50
+ # NOTE: right now we only need to support single plan execution; in a multi-plan setting, we will
51
+ # need to modify the semantics of the progress manager to support multiple plans
50
52
  class ProgressManager(ABC):
51
- """Abstract base class for progress managers"""
52
-
53
- def __init__(self):
54
- self.stats = ProgressStats(start_time=time.time())
55
-
53
+ """Abstract base class for progress managers for plan execution"""
54
+
55
+ def __init__(self, plan: PhysicalPlan | SentinelPlan, num_samples: int | None = None):
56
+ """
57
+ Initialize the progress manager for the given plan. This function takes in a plan
58
+ and, optionally, the number of samples to process.
59
+
60
+ If `num_samples` is None, then the entire DataReader will be scanned.
61
+
62
+ For each operator which is not an `AggregateOp` or `LimitScanOp`, we set its task `total`
63
+ to the number of inputs to be processed by the plan. As intermediate operators process
64
+ their inputs, the ProgressManager will update the `total` for their downstream operators.
65
+ """
66
+ # initialize progress object
67
+ self.progress = RichProgress(
68
+ SpinnerColumn(),
69
+ TextColumn("[bold blue]{task.description}"),
70
+ BarColumn(),
71
+ TaskProgressColumn(),
72
+ MofNCompleteColumn(),
73
+ TimeElapsedColumn(),
74
+ TimeRemainingColumn(),
75
+ #TextColumn("[green]Success: {task.fields[success]}"),
76
+ #TextColumn("[red]Failed: {task.fields[failed]}"),
77
+ #TextColumn("[cyan]Mem: {task.fields[memory]:.1f}MB"),
78
+ TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
79
+ TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
80
+ refresh_per_second=10,
81
+ expand=True, # Use full width
82
+ )
83
+
84
+ # initialize mapping from op_id --> ProgressStats
85
+ self.op_id_to_stats: dict[str, ProgressStats] = {}
86
+
87
+ # initialize mapping from op_id --> task
88
+ self.op_id_to_task = {}
89
+
90
+ # initialize start time
91
+ self.start_time = None
92
+
93
+ # create mapping from op_id --> next_op
94
+ self.op_id_to_next_op: dict[str, PhysicalOperator] = {}
95
+ for op_idx, op in enumerate(plan.operators):
96
+ op_id = op.get_op_id()
97
+ next_op = plan.operators[op_idx + 1] if op_idx + 1 < len(plan.operators) else None
98
+ self.op_id_to_next_op[op_id] = next_op
99
+
100
+ # compute the total number of inputs to be processed by the plan
101
+ datareader_len = len(plan.operators[0].datareader)
102
+ total = datareader_len if num_samples is None else min(num_samples, datareader_len)
103
+
104
+ # add a task to the progress manager for each operator in the plan
105
+ for op in plan.operators:
106
+ # get the op id and a short string representation of the op; (str(op) is too long)
107
+ op_id = op.get_op_id()
108
+ op_str = f"{op.op_name()} ({op_id})"
109
+
110
+ # update the `total` if we encounter an AggregateOp or LimitScanOp
111
+ if isinstance(op, AggregateOp):
112
+ total = 1
113
+ elif isinstance(op, LimitScanOp):
114
+ total = op.limit
115
+
116
+ self.add_task(op_id, op_str, total)
117
+
118
+ def get_task_total(self, op_id: str) -> int:
119
+ """Return the current total value for the given task."""
120
+ task = self.op_id_to_task[op_id]
121
+ return self.progress._tasks[task].total
122
+
123
+ def get_task_description(self, op_id: str) -> str:
124
+ """Return the current description for the given task."""
125
+ task = self.op_id_to_task[op_id]
126
+ return self.progress._tasks[task].description
127
+
56
128
  @abstractmethod
57
- def start(self, total: int):
58
- """Initialize progress tracking with total items"""
129
+ def add_task(self, op_id: str, op_str: str, total: int):
130
+ """Initialize progress tracking for operator execution with total items"""
59
131
  pass
60
-
132
+
61
133
  @abstractmethod
62
- def update(self, current: int, sample: str | None = None, **kwargs):
63
- """Update progress with current count and optional sample"""
134
+ def start(self):
135
+ """Start the progress bar(s)"""
64
136
  pass
65
-
137
+
138
+ @abstractmethod
139
+ def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
140
+ """
141
+ Advance the progress bar for the given operator by one. Modify the downstream operators'
142
+ progress bar `total` to reflect the number of outputs produced by this operator.
143
+
144
+ NOTE: The semantics of this function are that every time it is executed we advance the
145
+ progress bar by 1. This is because the progress bar represents what fraction of the inputs
146
+ have been processed by the operator. `num_outputs` specifies how many outputs were generated
147
+ by the operator when processing the input for which `incr()` was called. E.g. a filter which
148
+ filters out an input record will advance its progress bar by 1, but the next operator will now
149
+ have 1 fewer input to process. Alternatively, a convert which generates 3 outputs (`num_outputs=3`)
150
+ will increase the inputs for the next operator by `delta = num_outputs - 1 = 2`.
151
+ """
152
+ pass
153
+
66
154
  @abstractmethod
67
155
  def finish(self):
68
156
  """Clean up and finalize progress tracking"""
69
157
  pass
70
158
 
71
- def update_stats(self, **kwargs):
159
+ def update_stats(self, op_id: str, **kwargs):
72
160
  """Update progress statistics"""
73
161
  for key, value in kwargs.items():
74
- if hasattr(self.stats, key):
75
- setattr(self.stats, key, value)
76
- self.stats.memory_usage_mb = get_memory_usage()
162
+ if hasattr(self.op_id_to_stats[op_id], key):
163
+ if key != "total_cost":
164
+ setattr(self.op_id_to_stats[op_id], key, value)
165
+ else:
166
+ self.op_id_to_stats[op_id].total_cost += value
167
+ self.op_id_to_stats[op_id].memory_usage_mb = get_memory_usage()
168
+
169
+
170
+ class MockProgressManager(ProgressManager):
171
+ """Mock progress manager for testing purposes"""
172
+
173
+ def __init__(self, plan: PhysicalPlan | SentinelPlan, num_samples: int | None = None):
174
+ pass
175
+
176
+ def add_task(self, op_id: str, op_str: str, total: int):
177
+ pass
178
+
179
+ def start(self):
180
+ pass
77
181
 
78
- class CLIProgressManager(ProgressManager):
182
+ def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
183
+ pass
184
+
185
+ def finish(self):
186
+ pass
187
+
188
+ class PZProgressManager(ProgressManager):
79
189
  """Progress manager for command line interface using rich"""
80
190
 
81
- def __init__(self):
82
- super().__init__()
191
+ def __init__(self, plan: PhysicalPlan, num_samples: int | None = None):
192
+ super().__init__(plan, num_samples)
83
193
  self.console = Console()
84
-
85
- # Create single progress bar that includes both progress and recent text
86
- self.progress = RichProgress(
194
+
195
+ def add_task(self, op_id: str, op_str: str, total: int):
196
+ """Add a new task to the progress bar"""
197
+ task = self.progress.add_task(
198
+ f"[blue]{op_str}",
199
+ total=total,
200
+ cost=0.0,
201
+ success=0,
202
+ failed=0,
203
+ memory=0.0,
204
+ recent="",
205
+ )
206
+
207
+ # store the mapping of operator ID to task ID
208
+ self.op_id_to_task[op_id] = task
209
+
210
+ # initialize the stats for this operation
211
+ self.op_id_to_stats[op_id] = ProgressStats(start_time=time.time())
212
+
213
+ def start(self):
214
+ # print a newline before starting to separate from previous output
215
+ print()
216
+
217
+ # set start time
218
+ self.start_time = time.time()
219
+
220
+ # start progress bar
221
+ self.progress.start()
222
+
223
+ def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
224
+ # get the task for the given operation
225
+ task = self.op_id_to_task.get(op_id)
226
+
227
+ # update statistics with any additional keyword arguments
228
+ if kwargs != {}:
229
+ self.update_stats(op_id, **kwargs)
230
+
231
+ # update progress bar and recent text in one update
232
+ if display_text is not None:
233
+ self.op_id_to_stats[op_id].recent_text = display_text
234
+
235
+ # if num_outputs is not 1, update the downstream operators' progress bar total for any
236
+ # operator which is not an AggregateOp or LimitScanOp
237
+ delta = num_outputs - 1
238
+ if delta != 0:
239
+ next_op = self.op_id_to_next_op[op_id]
240
+ while next_op is not None:
241
+ if not isinstance(next_op, (AggregateOp, LimitScanOp)):
242
+ next_op_id = next_op.get_op_id()
243
+ next_task = self.op_id_to_task[next_op_id]
244
+ self.progress.update(next_task, total=self.get_task_total(next_op_id) + delta)
245
+
246
+ next_op = self.op_id_to_next_op[next_op_id]
247
+
248
+ # advance the progress bar for this task
249
+ self.progress.update(
250
+ task,
251
+ advance=1,
252
+ description=f"[bold blue]{self.get_task_description(op_id)}",
253
+ cost=self.op_id_to_stats[op_id].total_cost,
254
+ success=self.op_id_to_stats[op_id].success_count,
255
+ failed=self.op_id_to_stats[op_id].failure_count,
256
+ memory=get_memory_usage(),
257
+ recent=f"{self.op_id_to_stats[op_id].recent_text}" if display_text is not None else "",
258
+ refresh=True,
259
+ )
260
+
261
+ def finish(self):
262
+ self.progress.stop()
263
+
264
+ # compute total cost, success, and failure
265
+ total_cost = sum(stats.total_cost for stats in self.op_id_to_stats.values())
266
+ # success_count = sum(stats.success_count for stats in self.op_id_to_stats.values())
267
+ # failure_count = sum(stats.failure_count for stats in self.op_id_to_stats.values())
268
+
269
+ # Print final stats on new lines after progress display
270
+ print(f"Total time: {time.time() - self.start_time:.2f}s")
271
+ print(f"Total cost: ${total_cost:.4f}")
272
+ # print(f"Success rate: {success_count}/{success_count + failure_count}")
273
+
274
+ class PZSentinelProgressManager(ProgressManager):
275
+ def __init__(self, plan: SentinelPlan, sample_budget: int):
276
+ # overall progress bar
277
+ self.overall_progress = RichProgress(
87
278
  SpinnerColumn(),
88
- TextColumn("[bold blue]{task.description}"),
279
+ TextColumn("{task.description}"), # TODO: fixed string?
89
280
  BarColumn(),
90
281
  TaskProgressColumn(),
282
+ MofNCompleteColumn(),
91
283
  TimeElapsedColumn(),
92
284
  TimeRemainingColumn(),
93
- #TextColumn("[yellow]Cost: ${task.fields[cost]:.4f}"),
94
- #TextColumn("[green]Success: {task.fields[success]}"),
95
- #TextColumn("[red]Failed: {task.fields[failed]}"),
96
- TextColumn("[cyan]Mem: {task.fields[memory]:.1f}MB"),
285
+ TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
97
286
  TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
98
287
  refresh_per_second=10,
99
288
  expand=True, # Use full width
100
289
  )
101
- self.task_id = None
102
-
103
- def start(self, total: int):
104
- # Print a newline before starting to separate from previous output
105
- print()
106
-
107
- self.task_id = self.progress.add_task(
108
- "Processing",
290
+ self.overall_task_id = self.overall_progress.add_task("", total=sample_budget, cost=0.0, recent="")
291
+
292
+ # logical operator progress bars
293
+ self.op_progress = RichProgress(
294
+ SpinnerColumn(),
295
+ "{task.description}",
296
+ BarColumn(),
297
+ TaskProgressColumn(),
298
+ MofNCompleteColumn(),
299
+ TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
300
+ TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
301
+ refresh_per_second=10,
302
+ expand=True, # Use full width
303
+ )
304
+
305
+ # organize progress bars into nice display
306
+ self.progress_table = Table.grid()
307
+ self.progress_table.add_row(
308
+ Panel.fit(self.op_progress, title="[b]Sample Allocation", border_style="red", padding=(1, 2)),
309
+ )
310
+ self.progress_table.add_row(
311
+ Panel.fit(
312
+ self.overall_progress, title="Optimization Progress", border_style="green", padding=(2, 2)
313
+ )
314
+ )
315
+ self.live_display = Live(self.progress_table, refresh_per_second=10)
316
+
317
+ # initialize mapping from op_id --> ProgressStats
318
+ self.op_id_to_stats: dict[str, ProgressStats] = {}
319
+
320
+ # initialize mapping from op_id --> task
321
+ self.op_id_to_task = {}
322
+
323
+ # initialize start time
324
+ self.start_time = None
325
+
326
+ # add a task to the progress manager for each operator in the plan
327
+ for logical_op_id, op_set in plan:
328
+ physical_op = op_set[0]
329
+ is_llm_convert = isinstance(physical_op, LLMConvert)
330
+ is_llm_filter = isinstance(physical_op, LLMFilter)
331
+ op_name = "LLMConvert" if is_llm_convert else "LLMFilter" if is_llm_filter else physical_op.op_name()
332
+ op_str = f"{op_name} ({logical_op_id})"
333
+ total = sample_budget if self._is_llm_op(op_set[0]) else 0
334
+ self.add_task(logical_op_id, op_str, total)
335
+
336
+ self.console = Console()
337
+
338
+ def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
339
+ is_llm_convert = isinstance(physical_op, LLMConvert)
340
+ is_llm_filter = isinstance(physical_op, LLMFilter)
341
+ is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
342
+ return is_llm_convert or is_llm_filter or is_llm_retrieve
343
+
344
+ def get_task_description(self, op_id: str) -> str:
345
+ """Return the current description for the given task."""
346
+ task = self.op_id_to_task[op_id]
347
+ return self.op_progress._tasks[task].description
348
+
349
+ def add_task(self, op_id: str, op_str: str, total: int):
350
+ """Add a new task to the op progress bars"""
351
+ task = self.op_progress.add_task(
352
+ f"[blue]{op_str}",
109
353
  total=total,
110
354
  cost=0.0,
111
355
  success=0,
112
356
  failed=0,
113
357
  memory=0.0,
114
- recent=""
358
+ recent="",
115
359
  )
116
-
117
- # Start progress bar
118
- self.progress.start()
119
-
120
- def update(self, current: int, sample: str | None = None, **kwargs):
121
- self.update_stats(**kwargs)
122
-
123
- # Update progress bar and recent text in one update
124
- if sample:
125
- self.stats.recent_text = sample
126
-
127
- self.progress.update(
128
- self.task_id,
129
- completed=current,
130
- description=f"[bold blue]{self.stats.current_operation}",
131
- cost=self.stats.total_cost,
132
- success=self.stats.success_count,
133
- failed=self.stats.failure_count,
134
- memory=self.stats.memory_usage_mb,
135
- recent=f"Recent: {self.stats.recent_text}"
136
- )
137
-
138
- def finish(self):
139
- self.progress.stop()
140
-
141
- # Print final stats on new lines after progress display
142
- print(f"Total time: {time.time() - self.stats.start_time:.2f}s")
143
- #print(f"Total cost: ${self.stats.total_cost:.4f}")
144
- print(f"Success rate: {self.stats.success_count}/{self.stats.success_count + self.stats.failure_count}")
145
360
 
146
- class NotebookProgressManager(ProgressManager):
147
- """Progress manager for Jupyter notebooks using ipywidgets"""
148
-
149
- def __init__(self):
150
- super().__init__()
151
- if not JUPYTER_AVAILABLE:
152
- raise ImportError("ipywidgets not available. Install with: pip install ipywidgets")
153
-
154
- self.progress_bar = widgets.IntProgress(
155
- value=0,
156
- min=0,
157
- description='Processing:',
158
- bar_style='info',
159
- orientation='horizontal'
160
- )
161
-
162
- self.stats_html = widgets.HTML(
163
- value="<pre>Initializing...</pre>"
361
+ # store the mapping of operator ID to task ID
362
+ self.op_id_to_task[op_id] = task
363
+
364
+ # initialize the stats for this operation
365
+ self.op_id_to_stats[op_id] = ProgressStats(start_time=time.time())
366
+
367
+ def start(self):
368
+ # print a newline before starting to separate from previous output
369
+ print()
370
+
371
+ # set start time
372
+ self.start_time = time.time()
373
+
374
+ # start progress bars
375
+ self.live_display.start()
376
+
377
+ def incr(self, op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
378
+ # TODO: (above) organize progress bars into a Live / Table / Panel or something
379
+ # get the task for the given operation
380
+ task = self.op_id_to_task.get(op_id)
381
+
382
+ # update statistics with any additional keyword arguments
383
+ if kwargs != {}:
384
+ self.update_stats(op_id, **kwargs)
385
+
386
+ # update progress bar and recent text in one update
387
+ if display_text is not None:
388
+ self.op_id_to_stats[op_id].recent_text = display_text
389
+
390
+ # advance the op progress bar for this op_id
391
+ self.op_progress.update(
392
+ task,
393
+ advance=num_samples,
394
+ description=f"[bold blue]{self.get_task_description(op_id)}",
395
+ cost=self.op_id_to_stats[op_id].total_cost,
396
+ success=self.op_id_to_stats[op_id].success_count,
397
+ failed=self.op_id_to_stats[op_id].failure_count,
398
+ memory=get_memory_usage(),
399
+ recent=f"{self.op_id_to_stats[op_id].recent_text}" if display_text is not None else "",
400
+ refresh=True,
164
401
  )
165
-
166
- self.recent_html = widgets.HTML(
167
- value="<pre>Recent: </pre>"
402
+
403
+ # advance the overall progress bar
404
+ self.overall_progress.update(
405
+ self.overall_task_id,
406
+ advance=num_samples,
407
+ cost=sum(stats.total_cost for _, stats in self.op_id_to_stats.items()),
408
+ refresh=True,
168
409
  )
169
-
170
- self.container = widgets.VBox([
171
- self.progress_bar,
172
- self.stats_html,
173
- self.recent_html
174
- ])
175
-
176
- def start(self, total: int):
177
- self.progress_bar.max = total
178
- display(self.container)
179
-
180
- def update(self, current: int, sample: str | None = None, **kwargs):
181
- self.update_stats(**kwargs)
182
- self.progress_bar.value = current
183
-
184
- # Update stats display
185
- # Total Cost: ${self.stats.total_cost:.4f}
186
- # Success/Total: {self.stats.success_count}/{self.stats.success_count + self.stats.failure_count}
187
-
188
-
189
- stats_text = f"""
190
- <pre>
191
- Operation: {self.stats.current_operation}
192
- Time Elapsed: {time.time() - self.stats.start_time:.1f}s
193
- Memory Usage: {self.stats.memory_usage_mb:.1f}MB
194
- </pre>
195
- """
196
- self.stats_html.value = stats_text
197
-
198
- # Update recent text
199
- if sample:
200
- self.stats.recent_text = sample
201
- self.recent_html.value = f"<pre>Recent: {self.stats.recent_text}</pre>"
202
-
410
+
411
+ # force the live display to refresh
412
+ self.live_display.refresh()
413
+
203
414
  def finish(self):
204
- self.progress_bar.bar_style = 'success'
205
- #
206
- # Total Cost: ${self.stats.total_cost:.4f}
207
- # Final Success Rate: {self.stats.success_count}/{self.stats.success_count + self.stats.failure_count}
208
- stats_text = f"""
209
- <pre>
210
- Completed!
211
- Total Time: {time.time() - self.stats.start_time:.1f}s
212
- Peak Memory Usage: {self.stats.memory_usage_mb:.1f}MB
213
- </pre>
214
- """
215
- self.stats_html.value = stats_text
216
- self.recent_html.value = "<pre>Completed</pre>"
415
+ self.live_display.stop()
416
+
417
+ # compute total cost, success, and failure
418
+ total_cost = sum(stats.total_cost for stats in self.op_id_to_stats.values())
419
+ # success_count = sum(stats.success_count for stats in self.op_id_to_stats.values())
420
+ # failure_count = sum(stats.failure_count for stats in self.op_id_to_stats.values())
217
421
 
218
- def create_progress_manager() -> ProgressManager:
422
+ # Print final stats on new lines after progress display
423
+ print(f"Total opt. time: {time.time() - self.start_time:.2f}s")
424
+ print(f"Total opt. cost: ${total_cost:.4f}")
425
+ # print(f"Success rate: {success_count}/{success_count + failure_count}")
426
+
427
+ def create_progress_manager(
428
+ plan: PhysicalPlan | SentinelPlan,
429
+ num_samples: int | None = None,
430
+ sample_budget: int | None = None,
431
+ progress: bool = True,
432
+ ) -> ProgressManager:
219
433
  """Factory function to create appropriate progress manager based on environment"""
220
- if in_jupyter_notebook():
221
- try:
222
- return NotebookProgressManager()
223
- except ImportError:
224
- return CLIProgressManager()
225
- return CLIProgressManager()
434
+ if not progress:
435
+ return MockProgressManager(plan, num_samples)
436
+
437
+ if isinstance(plan, SentinelPlan):
438
+ assert sample_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
439
+ return PZSentinelProgressManager(plan, sample_budget)
440
+
441
+ return PZProgressManager(plan, num_samples)
@@ -1,5 +1,3 @@
1
- from typing import List
2
-
3
1
  from fuzzywuzzy import fuzz, process
4
2
 
5
3
 
@@ -89,7 +87,7 @@ def get_range_from_hist(file_path, range_budget, resolution=0.001, trim_zeros=Tr
89
87
  return start * 1.0 / index_range, end * 1.0 / index_range
90
88
 
91
89
 
92
- def best_substring_match(query: str, context: str | List[str]):
90
+ def best_substring_match(query: str, context: str | list[str]):
93
91
  # This will extract all substrings of length equal to the query from the string
94
92
  candidates = [context[i : i + len(query)] for i in range(len(context) - len(query) + 1)]
95
93
 
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: palimpzest
3
- Version: 0.6.4
3
+ Version: 0.7.1
4
4
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
5
5
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
6
6
  Project-URL: homepage, https://palimpzest.org
@@ -16,6 +16,7 @@ Requires-Python: >=3.8
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: charset-normalizer>=3.3.2
19
+ Requires-Dist: chromadb>=0.6.3
19
20
  Requires-Dist: click>=8.1.7
20
21
  Requires-Dist: click-aliases>=1.0.4
21
22
  Requires-Dist: colorama>=0.4.6
@@ -54,19 +55,17 @@ Requires-Dist: pypdf>=5.1.0
54
55
  Requires-Dist: pytest-mock>=3.14.0
55
56
  Requires-Dist: python-Levenshtein>=0.25.1
56
57
  Requires-Dist: pyyaml>=6.0.1
58
+ Requires-Dist: ragatouille>=0.0.9
57
59
  Requires-Dist: requests>=2.25
58
- Requires-Dist: requests-html>=0.10.0
59
60
  Requires-Dist: ruff>=0.9.0
60
- Requires-Dist: scikit-learn>=1.5.2
61
- Requires-Dist: scipy>=1.9.0
62
61
  Requires-Dist: setuptools>=70.1.1
63
62
  Requires-Dist: tabulate>=0.9.0
64
- Requires-Dist: tenacity>=8.2.3
65
63
  Requires-Dist: together>=1.3.1
66
64
  Requires-Dist: tqdm~=4.66.1
67
- Requires-Dist: transformers>=4.11.3
68
- Requires-Dist: requests-html
69
- Requires-Dist: sphinx>=8.1.3
65
+ Requires-Dist: transformers<4.50.0,>=4.41.3
66
+ Requires-Dist: rich[jupyter]>=13.9.2
67
+ Requires-Dist: voyager>=2.0.9
68
+ Dynamic: license-file
70
69
 
71
70
  ![pz-banner](https://palimpzest-workloads.s3.us-east-1.amazonaws.com/palimpzest-cropped.png)
72
71
 
@@ -163,3 +162,14 @@ Now you can run the simple test program with:
163
162
  ```bash
164
163
  $ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
165
164
  ```
165
+
166
+ ### Citation
167
+ If you would like to cite our work, please use the following citation:
168
+ ```
169
+ @inproceedings{palimpzestCIDR,
170
+ title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
171
+ author={Liu, Chunwei and Russo, Matthew and Cafarella, Michael and Cao, Lei and Chen, Peter Baile and Chen, Zui and Franklin, Michael and Kraska, Tim and Madden, Samuel and Shahout, Rana and Vitagliano, Gerardo},
172
+ booktitle = {Proceedings of the {{Conference}} on {{Innovative Database Research}} ({{CIDR}})},
173
+ date = 2025,
174
+ }
175
+ ```