palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
- palimpzest-0.7.1.dist-info/RECORD +96 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.4.dist-info/RECORD +0 -87
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
palimpzest/utils/progress.py CHANGED

```diff
@@ -2,9 +2,13 @@ import time
 from abc import ABC, abstractmethod
 from dataclasses import dataclass

+from chromadb.api.models.Collection import Collection
 from rich.console import Console
+from rich.live import Live
+from rich.panel import Panel
 from rich.progress import (
     BarColumn,
+    MofNCompleteColumn,
     SpinnerColumn,
     TaskProgressColumn,
     TextColumn,
@@ -12,13 +16,16 @@ from rich.progress import (
     TimeRemainingColumn,
 )
 from rich.progress import Progress as RichProgress
+from rich.table import Table
+
+from palimpzest.query.operators.aggregate import AggregateOp
+from palimpzest.query.operators.convert import LLMConvert
+from palimpzest.query.operators.filter import LLMFilter
+from palimpzest.query.operators.limit import LimitScanOp
+from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.operators.retrieve import RetrieveOp
+from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan

-try:
-    import ipywidgets as widgets
-    from IPython.display import display
-    JUPYTER_AVAILABLE = True
-except ImportError:
-    JUPYTER_AVAILABLE = False

 @dataclass
 class ProgressStats:
@@ -31,13 +38,6 @@ class ProgressStats:
     memory_usage_mb: float = 0.0
     recent_text: str = ""

-def in_jupyter_notebook():
-    try:
-        from IPython import get_ipython
-        return 'IPKernelApp' in get_ipython().config
-    except Exception:
-        return False
-
 def get_memory_usage() -> float:
     """Get current memory usage in MB"""
     try:
@@ -47,179 +47,395 @@ def get_memory_usage() -> float:
     except Exception:
         return 0.0

+# NOTE: right now we only need to support single plan execution; in a multi-plan setting, we will
+# need to modify the semantics of the progress manager to support multiple plans
 class ProgressManager(ABC):
-    """Abstract base class for progress managers"""
-
-    def __init__(self):
-
-
+    """Abstract base class for progress managers for plan execution"""
+
+    def __init__(self, plan: PhysicalPlan | SentinelPlan, num_samples: int | None = None):
+        """
+        Initialize the progress manager for the given plan. This function takes in a plan,
+        the number of samples to process (if specified).
+
+        If `num_samples` is None, then the entire DataReader will be scanned.
+
+        For each operator which is not an `AggregateOp` or `LimitScanOp`, we set its task `total`
+        to the number of inputs to be processed by the plan. As intermediate operators process
+        their inputs, the ProgressManager will update the `total` for their downstream operators.
+        """
+        # initialize progress object
+        self.progress = RichProgress(
+            SpinnerColumn(),
+            TextColumn("[bold blue]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            MofNCompleteColumn(),
+            TimeElapsedColumn(),
+            TimeRemainingColumn(),
+            #TextColumn("[green]Success: {task.fields[success]}"),
+            #TextColumn("[red]Failed: {task.fields[failed]}"),
+            #TextColumn("[cyan]Mem: {task.fields[memory]:.1f}MB"),
+            TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
+            TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
+            refresh_per_second=10,
+            expand=True, # Use full width
+        )
+
+        # initialize mapping from op_id --> ProgressStats
+        self.op_id_to_stats: dict[str, ProgressStats] = {}
+
+        # initialize mapping from op_id --> task
+        self.op_id_to_task = {}
+
+        # initialize start time
+        self.start_time = None
+
+        # create mapping from op_id --> next_op
+        self.op_id_to_next_op: dict[str, PhysicalOperator] = {}
+        for op_idx, op in enumerate(plan.operators):
+            op_id = op.get_op_id()
+            next_op = plan.operators[op_idx + 1] if op_idx + 1 < len(plan.operators) else None
+            self.op_id_to_next_op[op_id] = next_op
+
+        # compute the total number of inputs to be processed by the plan
+        datareader_len = len(plan.operators[0].datareader)
+        total = datareader_len if num_samples is None else min(num_samples, datareader_len)
+
+        # add a task to the progress manager for each operator in the plan
+        for op in plan.operators:
+            # get the op id and a short string representation of the op; (str(op) is too long)
+            op_id = op.get_op_id()
+            op_str = f"{op.op_name()} ({op_id})"
+
+            # update the `total` if we encounter an AggregateOp or LimitScanOp
+            if isinstance(op, AggregateOp):
+                total = 1
+            elif isinstance(op, LimitScanOp):
+                total = op.limit
+
+            self.add_task(op_id, op_str, total)
+
+    def get_task_total(self, op_id: str) -> int:
+        """Return the current total value for the given task."""
+        task = self.op_id_to_task[op_id]
+        return self.progress._tasks[task].total
+
+    def get_task_description(self, op_id: str) -> str:
+        """Return the current description for the given task."""
+        task = self.op_id_to_task[op_id]
+        return self.progress._tasks[task].description
+
     @abstractmethod
-    def 
-        """Initialize progress tracking with total items"""
+    def add_task(self, op_id: str, op_str: str, total: int):
+        """Initialize progress tracking for operator execution with total items"""
         pass
-
+
     @abstractmethod
-    def 
-        """
+    def start(self):
+        """Start the progress bar(s)"""
         pass
-
+
+    @abstractmethod
+    def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
+        """
+        Advance the progress bar for the given operator by one. Modify the downstream operators'
+        progress bar `total` to reflect the number of outputs produced by this operator.
+
+        NOTE: The semantics of this function are that every time it is executed we advance the
+        progress bar by 1. This is because the progress bar represents what fraction of the inputs
+        have been processed by the operator. `num_outputs` specifies how many outputs were generated
+        by the operator when processing the input for which `incr()` was called. E.g. a filter which
+        filters an input record will advance its progress bar by 1, but the next operator will now
+        have 1 fewer inputs to process. Alternatively, a convert which generates 3 `num_outputs` will
+        increase the inputs for the next operator by `delta = num_outputs - 1 = 2`.
+        """
+        pass
+
     @abstractmethod
     def finish(self):
         """Clean up and finalize progress tracking"""
         pass

-    def update_stats(self, **kwargs):
+    def update_stats(self, op_id: str, **kwargs):
         """Update progress statistics"""
         for key, value in kwargs.items():
-            if hasattr(self.
-
-
+            if hasattr(self.op_id_to_stats[op_id], key):
+                if key != "total_cost":
+                    setattr(self.op_id_to_stats[op_id], key, value)
+                else:
+                    self.op_id_to_stats[op_id].total_cost += value
+        self.op_id_to_stats[op_id].memory_usage_mb = get_memory_usage()
+
+
+class MockProgressManager(ProgressManager):
+    """Mock progress manager for testing purposes"""
+
+    def __init__(self, plan: PhysicalPlan | SentinelPlan, num_samples: int | None = None):
+        pass
+
+    def add_task(self, op_id: str, op_str: str, total: int):
+        pass
+
+    def start(self):
+        pass

-
+    def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
+        pass
+
+    def finish(self):
+        pass
+
+class PZProgressManager(ProgressManager):
     """Progress manager for command line interface using rich"""

-    def __init__(self):
-        super().__init__()
+    def __init__(self, plan: PhysicalPlan, num_samples: int | None = None):
+        super().__init__(plan, num_samples)
         self.console = Console()
-
-
-
+
+    def add_task(self, op_id: str, op_str: str, total: int):
+        """Add a new task to the progress bar"""
+        task = self.progress.add_task(
+            f"[blue]{op_str}",
+            total=total,
+            cost=0.0,
+            success=0,
+            failed=0,
+            memory=0.0,
+            recent="",
+        )
+
+        # store the mapping of operator ID to task ID
+        self.op_id_to_task[op_id] = task
+
+        # initialize the stats for this operation
+        self.op_id_to_stats[op_id] = ProgressStats(start_time=time.time())
+
+    def start(self):
+        # print a newline before starting to separate from previous output
+        print()
+
+        # set start time
+        self.start_time = time.time()
+
+        # start progress bar
+        self.progress.start()
+
+    def incr(self, op_id: str, num_outputs: int = 1, display_text: str | None = None, **kwargs):
+        # get the task for the given operation
+        task = self.op_id_to_task.get(op_id)
+
+        # update statistics with any additional keyword arguments
+        if kwargs != {}:
+            self.update_stats(op_id, **kwargs)
+
+        # update progress bar and recent text in one update
+        if display_text is not None:
+            self.op_id_to_stats[op_id].recent_text = display_text
+
+        # if num_outputs is not 1, update the downstream operators' progress bar total for any
+        # operator which is not an AggregateOp or LimitScanOp
+        delta = num_outputs - 1
+        if delta != 0:
+            next_op = self.op_id_to_next_op[op_id]
+            while next_op is not None:
+                if not isinstance(next_op, (AggregateOp, LimitScanOp)):
+                    next_op_id = next_op.get_op_id()
+                    next_task = self.op_id_to_task[next_op_id]
+                    self.progress.update(next_task, total=self.get_task_total(next_op_id) + delta)
+
+                next_op = self.op_id_to_next_op[next_op_id]
+
+        # advance the progress bar for this task
+        self.progress.update(
+            task,
+            advance=1,
+            description=f"[bold blue]{self.get_task_description(op_id)}",
+            cost=self.op_id_to_stats[op_id].total_cost,
+            success=self.op_id_to_stats[op_id].success_count,
+            failed=self.op_id_to_stats[op_id].failure_count,
+            memory=get_memory_usage(),
+            recent=f"{self.op_id_to_stats[op_id].recent_text}" if display_text is not None else "",
+            refresh=True,
+        )
+
+    def finish(self):
+        self.progress.stop()
+
+        # compute total cost, success, and failure
+        total_cost = sum(stats.total_cost for stats in self.op_id_to_stats.values())
+        # success_count = sum(stats.success_count for stats in self.op_id_to_stats.values())
+        # failure_count = sum(stats.failure_count for stats in self.op_id_to_stats.values())
+
+        # Print final stats on new lines after progress display
+        print(f"Total time: {time.time() - self.start_time:.2f}s")
+        print(f"Total cost: ${total_cost:.4f}")
+        # print(f"Success rate: {success_count}/{success_count + failure_count}")
+
+class PZSentinelProgressManager(ProgressManager):
+    def __init__(self, plan: SentinelPlan, sample_budget: int):
+        # overall progress bar
+        self.overall_progress = RichProgress(
             SpinnerColumn(),
-            TextColumn("
+            TextColumn("{task.description}"), # TODO: fixed string?
             BarColumn(),
             TaskProgressColumn(),
+            MofNCompleteColumn(),
             TimeElapsedColumn(),
             TimeRemainingColumn(),
-
-            #TextColumn("[green]Success: {task.fields[success]}"),
-            #TextColumn("[red]Failed: {task.fields[failed]}"),
-            TextColumn("[cyan]Mem: {task.fields[memory]:.1f}MB"),
+            TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
             TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
             refresh_per_second=10,
             expand=True, # Use full width
         )
-        self.
-
-
-
-
-
-
-
+        self.overall_task_id = self.overall_progress.add_task("", total=sample_budget, cost=0.0, recent="")
+
+        # logical operator progress bars
+        self.op_progress = RichProgress(
+            SpinnerColumn(),
+            "{task.description}",
+            BarColumn(),
+            TaskProgressColumn(),
+            MofNCompleteColumn(),
+            TextColumn("[green]Cost: ${task.fields[cost]:.4f}"),
+            TextColumn("\n[white]{task.fields[recent]}"), # Recent text on new line
+            refresh_per_second=10,
+            expand=True, # Use full width
+        )
+
+        # organize progress bars into nice display
+        self.progress_table = Table.grid()
+        self.progress_table.add_row(
+            Panel.fit(self.op_progress, title="[b]Sample Allocation", border_style="red", padding=(1, 2)),
+        )
+        self.progress_table.add_row(
+            Panel.fit(
+                self.overall_progress, title="Optimization Progress", border_style="green", padding=(2, 2)
+            )
+        )
+        self.live_display = Live(self.progress_table, refresh_per_second=10)
+
+        # initialize mapping from op_id --> ProgressStats
+        self.op_id_to_stats: dict[str, ProgressStats] = {}
+
+        # initialize mapping from op_id --> task
+        self.op_id_to_task = {}
+
+        # initialize start time
+        self.start_time = None
+
+        # add a task to the progress manager for each operator in the plan
+        for logical_op_id, op_set in plan:
+            physical_op = op_set[0]
+            is_llm_convert = isinstance(physical_op, LLMConvert)
+            is_llm_filter = isinstance(physical_op, LLMFilter)
+            op_name = "LLMConvert" if is_llm_convert else "LLMFilter" if is_llm_filter else physical_op.op_name()
+            op_str = f"{op_name} ({logical_op_id})"
+            total = sample_budget if self._is_llm_op(op_set[0]) else 0
+            self.add_task(logical_op_id, op_str, total)
+
+        self.console = Console()
+
+    def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
+        is_llm_convert = isinstance(physical_op, LLMConvert)
+        is_llm_filter = isinstance(physical_op, LLMFilter)
+        is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
+        return is_llm_convert or is_llm_filter or is_llm_retrieve
+
+    def get_task_description(self, op_id: str) -> str:
+        """Return the current description for the given task."""
+        task = self.op_id_to_task[op_id]
+        return self.op_progress._tasks[task].description
+
+    def add_task(self, op_id: str, op_str: str, total: int):
+        """Add a new task to the op progress bars"""
+        task = self.op_progress.add_task(
+            f"[blue]{op_str}",
             total=total,
             cost=0.0,
             success=0,
             failed=0,
             memory=0.0,
-            recent=""
+            recent="",
         )
-
-        # Start progress bar
-        self.progress.start()
-
-    def update(self, current: int, sample: str | None = None, **kwargs):
-        self.update_stats(**kwargs)
-
-        # Update progress bar and recent text in one update
-        if sample:
-            self.stats.recent_text = sample
-
-        self.progress.update(
-            self.task_id,
-            completed=current,
-            description=f"[bold blue]{self.stats.current_operation}",
-            cost=self.stats.total_cost,
-            success=self.stats.success_count,
-            failed=self.stats.failure_count,
-            memory=self.stats.memory_usage_mb,
-            recent=f"Recent: {self.stats.recent_text}"
-        )
-
-    def finish(self):
-        self.progress.stop()
-
-        # Print final stats on new lines after progress display
-        print(f"Total time: {time.time() - self.stats.start_time:.2f}s")
-        #print(f"Total cost: ${self.stats.total_cost:.4f}")
-        print(f"Success rate: {self.stats.success_count}/{self.stats.success_count + self.stats.failure_count}")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-
-
-
+        # store the mapping of operator ID to task ID
+        self.op_id_to_task[op_id] = task
+
+        # initialize the stats for this operation
+        self.op_id_to_stats[op_id] = ProgressStats(start_time=time.time())
+
+    def start(self):
+        # print a newline before starting to separate from previous output
+        print()
+
+        # set start time
+        self.start_time = time.time()
+
+        # start progress bars
+        self.live_display.start()
+
+    def incr(self, op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
+        # TODO: (above) organize progress bars into a Live / Table / Panel or something
+        # get the task for the given operation
+        task = self.op_id_to_task.get(op_id)
+
+        # update statistics with any additional keyword arguments
+        if kwargs != {}:
+            self.update_stats(op_id, **kwargs)
+
+        # update progress bar and recent text in one update
+        if display_text is not None:
+            self.op_id_to_stats[op_id].recent_text = display_text
+
+        # advance the op progress bar for this op_id
+        self.op_progress.update(
+            task,
+            advance=num_samples,
+            description=f"[bold blue]{self.get_task_description(op_id)}",
+            cost=self.op_id_to_stats[op_id].total_cost,
+            success=self.op_id_to_stats[op_id].success_count,
+            failed=self.op_id_to_stats[op_id].failure_count,
+            memory=get_memory_usage(),
+            recent=f"{self.op_id_to_stats[op_id].recent_text}" if display_text is not None else "",
+            refresh=True,
         )
-
-
-
+
+        # advance the overall progress bar
+        self.overall_progress.update(
+            self.overall_task_id,
+            advance=num_samples,
+            cost=sum(stats.total_cost for _, stats in self.op_id_to_stats.items()),
+            refresh=True,
         )
-
-
-
-
-            self.recent_html
-        ])
-
-    def start(self, total: int):
-        self.progress_bar.max = total
-        display(self.container)
-
-    def update(self, current: int, sample: str | None = None, **kwargs):
-        self.update_stats(**kwargs)
-        self.progress_bar.value = current
-
-        # Update stats display
-        # Total Cost: ${self.stats.total_cost:.4f}
-        # Success/Total: {self.stats.success_count}/{self.stats.success_count + self.stats.failure_count}
-
-
-        stats_text = f"""
-        <pre>
-        Operation: {self.stats.current_operation}
-        Time Elapsed: {time.time() - self.stats.start_time:.1f}s
-        Memory Usage: {self.stats.memory_usage_mb:.1f}MB
-        </pre>
-        """
-        self.stats_html.value = stats_text
-
-        # Update recent text
-        if sample:
-            self.stats.recent_text = sample
-            self.recent_html.value = f"<pre>Recent: {self.stats.recent_text}</pre>"
-
+
+        # force the live display to refresh
+        self.live_display.refresh()
+
     def finish(self):
-        self.
-
-        #
-
-
-
-        Completed!
-        Total Time: {time.time() - self.stats.start_time:.1f}s
-        Peak Memory Usage: {self.stats.memory_usage_mb:.1f}MB
-        </pre>
-        """
-        self.stats_html.value = stats_text
-        self.recent_html.value = "<pre>Completed</pre>"
+        self.live_display.stop()
+
+        # compute total cost, success, and failure
+        total_cost = sum(stats.total_cost for stats in self.op_id_to_stats.values())
+        # success_count = sum(stats.success_count for stats in self.op_id_to_stats.values())
+        # failure_count = sum(stats.failure_count for stats in self.op_id_to_stats.values())

-
+        # Print final stats on new lines after progress display
+        print(f"Total opt. time: {time.time() - self.start_time:.2f}s")
+        print(f"Total opt. cost: ${total_cost:.4f}")
+        # print(f"Success rate: {success_count}/{success_count + failure_count}")
+
+def create_progress_manager(
+    plan: PhysicalPlan | SentinelPlan,
+    num_samples: int | None = None,
+    sample_budget: int | None = None,
+    progress: bool = True,
+) -> ProgressManager:
     """Factory function to create appropriate progress manager based on environment"""
-    if
-
-
-
-
-
+    if not progress:
+        return MockProgressManager(plan, num_samples)
+
+    if isinstance(plan, SentinelPlan):
+        assert sample_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
+        return PZSentinelProgressManager(plan, sample_budget)
+
+    return PZProgressManager(plan, num_samples)
```

(Several removed 0.6.4 lines above were truncated by the diff viewer and are reproduced as-is.)
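For a quick feel of the new `incr()` bookkeeping without reading the whole diff, the sketch below mirrors the semantics documented in its docstring: advance the current operator's bar by one per processed input, and shift every downstream operator's `total` by `delta = num_outputs - 1`. This is an illustrative stand-in built directly on `rich`, not Palimpzest's `ProgressManager`; the operator names and the "filter drops every other record" behavior are invented for the demo.

```python
# Illustrative only: a stripped-down mirror of the incr() bookkeeping described in the
# progress.py docstring. This is NOT Palimpzest's ProgressManager.
import time

from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn


class MiniPlanProgress:
    def __init__(self, op_names: list[str], num_inputs: int):
        self.progress = Progress(TextColumn("{task.description}"), BarColumn(), MofNCompleteColumn())
        self.order = op_names
        # every operator starts with the same expected number of inputs
        self.totals = {name: num_inputs for name in op_names}
        self.tasks = {name: self.progress.add_task(name, total=num_inputs) for name in op_names}

    def incr(self, op_name: str, num_outputs: int = 1) -> None:
        # advance this operator's bar by exactly one processed input
        self.progress.update(self.tasks[op_name], advance=1)
        # shift downstream totals: a filtered-out record (num_outputs=0) removes one
        # future input, a one-to-many convert (num_outputs>1) adds inputs
        delta = num_outputs - 1
        if delta != 0:
            for downstream in self.order[self.order.index(op_name) + 1:]:
                self.totals[downstream] += delta
                self.progress.update(self.tasks[downstream], total=self.totals[downstream])


if __name__ == "__main__":
    pm = MiniPlanProgress(["scan", "filter", "convert"], num_inputs=4)
    with pm.progress:  # starts and stops the rich display
        for i in range(4):
            pm.incr("scan")                       # scan always emits one record
            pm.incr("filter", num_outputs=i % 2)  # filter drops every other record
            time.sleep(0.2)
```

In the real class the same adjustment walks `op_id_to_next_op` and skips `AggregateOp` and `LimitScanOp` tasks, whose totals stay pinned at 1 and `op.limit` respectively.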
palimpzest/utils/token_reduction_helpers.py CHANGED

```diff
@@ -1,5 +1,3 @@
-from typing import List
-
 from fuzzywuzzy import fuzz, process


@@ -89,7 +87,7 @@ def get_range_from_hist(file_path, range_budget, resolution=0.001, trim_zeros=Tr
     return start * 1.0 / index_range, end * 1.0 / index_range


-def best_substring_match(query: str, context: str |
+def best_substring_match(query: str, context: str | list[str]):
     # This will extract all substrings of length equal to the query from the string
     candidates = [context[i : i + len(query)] for i in range(len(context) - len(query) + 1)]

```
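Only the candidate-generation line of `best_substring_match` appears in this hunk. The sketch below illustrates the same sliding-window idea with `fuzzywuzzy`; the `fuzz.ratio` scoring and the empty-context guard are assumptions for the illustration, not the package's implementation.

```python
# Sketch of the sliding-window fuzzy match idea behind best_substring_match.
# The scoring step is an assumption; the hunk only shows candidate generation.
from fuzzywuzzy import fuzz


def fuzzy_substring_match(query: str, context: str) -> tuple[str, int]:
    # all windows of the context with the same length as the query
    candidates = [context[i : i + len(query)] for i in range(len(context) - len(query) + 1)]
    if not candidates:
        return "", 0
    # score each window against the query and keep the best one
    best = max(candidates, key=lambda cand: fuzz.ratio(query, cand))
    return best, fuzz.ratio(query, best)


print(fuzzy_substring_match("palimpsest", "The Palimpzest system optimizes AI queries."))
```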
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA CHANGED

````diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.
+Version: 0.7.1
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -16,6 +16,7 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: charset-normalizer>=3.3.2
+Requires-Dist: chromadb>=0.6.3
 Requires-Dist: click>=8.1.7
 Requires-Dist: click-aliases>=1.0.4
 Requires-Dist: colorama>=0.4.6
@@ -54,19 +55,17 @@ Requires-Dist: pypdf>=5.1.0
 Requires-Dist: pytest-mock>=3.14.0
 Requires-Dist: python-Levenshtein>=0.25.1
 Requires-Dist: pyyaml>=6.0.1
+Requires-Dist: ragatouille>=0.0.9
 Requires-Dist: requests>=2.25
-Requires-Dist: requests-html>=0.10.0
 Requires-Dist: ruff>=0.9.0
-Requires-Dist: scikit-learn>=1.5.2
-Requires-Dist: scipy>=1.9.0
 Requires-Dist: setuptools>=70.1.1
 Requires-Dist: tabulate>=0.9.0
-Requires-Dist: tenacity>=8.2.3
 Requires-Dist: together>=1.3.1
 Requires-Dist: tqdm~=4.66.1
-Requires-Dist: transformers
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: transformers<4.50.0,>=4.41.3
+Requires-Dist: rich[jupyter]>=13.9.2
+Requires-Dist: voyager>=2.0.9
+Dynamic: license-file



@@ -163,3 +162,14 @@ Now you can run the simple test program with:
 ```bash
 $ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
 ```
+
+### Citation
+If you would like to cite our work, please use the following citation:
+```
+@inproceedings{palimpzestCIDR,
+  title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
+  author={Liu, Chunwei and Russo, Matthew and Cafarella, Michael and Cao, Lei and Chen, Peter Baile and Chen, Zui and Franklin, Michael and Kraska, Tim and Madden, Samuel and Shahout, Rana and Vitagliano, Gerardo},
+  booktitle = {Proceedings of the {{Conference}} on {{Innovative Database Research}} ({{CIDR}})},
+  date = 2025,
+}
+```
````

(Several removed 0.6.4 metadata lines were truncated by the diff viewer and are reproduced as-is.)