palimpzest-0.8.1-py3-none-any.whl → palimpzest-0.8.2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -595,7 +595,7 @@ class Dataset:
 
         return QueryProcessorFactory.create_and_run_processor(self, config)
 
-    def optimize_and_run(self, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, config: QueryProcessorConfig | None = None, **kwargs):
+    def optimize_and_run(self, config: QueryProcessorConfig | None = None, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, **kwargs):
         """Optimize the PZ program using the train_dataset and validator before running the optimized plan."""
         # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
         from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory
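The only substantive change in the hunk above is the parameter order of `Dataset.optimize_and_run`: `config` now comes first, so callers that passed `train_dataset` positionally should switch to keyword arguments. A minimal sketch of such a call follows; `dataset`, `train_data`, and `my_validator` are placeholders assumed to be constructed elsewhere, and the `QueryProcessorConfig` import path is inferred from the RECORD listing further down.

    from palimpzest.query.processor.config import QueryProcessorConfig

    # keyword arguments are unaffected by the positional reordering in 0.8.2
    output = dataset.optimize_and_run(
        config=QueryProcessorConfig(),   # optional; omit to use the defaults
        train_dataset=train_data,        # dict[str, Dataset] | Dataset | None
        validator=my_validator,          # Validator | None
    )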
@@ -2,16 +2,19 @@
 import logging
 
 import numpy as np
+from chromadb.api.models.Collection import Collection
 
 from palimpzest.core.data.dataset import Dataset
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
-from palimpzest.core.models import OperatorStats, RecordOpStats, SentinelPlanStats
+from palimpzest.core.models import OperatorCostEstimates, OperatorStats, RecordOpStats, SentinelPlanStats
 from palimpzest.policy import Policy
 from palimpzest.query.execution.execution_strategy import SentinelExecutionStrategy
 from palimpzest.query.operators.aggregate import AggregateOp
-from palimpzest.query.operators.filter import FilterOp
+from palimpzest.query.operators.convert import LLMConvert
+from palimpzest.query.operators.filter import FilterOp, LLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import SentinelPlan
 from palimpzest.utils.progress import create_progress_manager
@@ -55,6 +58,17 @@ class OpFrontier:
         # store the prior beliefs on operator performance (if provided)
         self.priors = priors
 
+        # boolean indication of the type of operator in this OpFrontier
+        sample_op = op_set[0]
+        self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
+        self.is_filter_op = isinstance(sample_op, FilterOp)
+        self.is_aggregate_op = isinstance(sample_op, AggregateOp)
+        self.is_llm_join = isinstance(sample_op, JoinOp)
+        is_llm_convert = isinstance(sample_op, LLMConvert)
+        is_llm_filter = isinstance(sample_op, LLMFilter)
+        is_llm_retrieve = isinstance(sample_op, RetrieveOp) and isinstance(sample_op.index, Collection)
+        self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_retrieve or self.is_llm_join
+
         # get order in which we will sample physical operators for this logical operator
         sample_op_indices = self._get_op_index_order(op_set, seed)
 
@@ -68,13 +82,6 @@ class OpFrontier:
         self.full_op_id_to_sources_not_processed = {op.get_full_op_id(): source_indices for op in op_set}
         self.max_inputs = len(source_indices)
 
-        # boolean indication of the type of operator in this OpFrontier
-        sample_op = op_set[0]
-        self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
-        self.is_filter_op = isinstance(sample_op, FilterOp)
-        self.is_aggregate_op = isinstance(sample_op, AggregateOp)
-        self.is_llm_join = isinstance(sample_op, JoinOp)
-
         # set the initial inputs for this logical operator; we maintain a mapping from source_unique_logical_op_id --> source_indices --> input;
         # for each unique source and (tuple of) source indices, we store its output, which is an input to this operator
         # for scan operators, we use the default name "source" since these operators have no source
@@ -149,16 +156,44 @@ class OpFrontier:
 
         return op_id_to_pareto_distance
 
+    def _compute_naive_priors(self, op_set: list[PhysicalOperator]) -> dict[str, dict[str, float]]:
+        naive_priors = {}
+        for op in op_set:
+            # use naive cost estimates with dummy source estimates to compute priors
+            source_op_estimates = OperatorCostEstimates(quality=1.0, cost_per_record=0.0, time_per_record=0.0, cardinality=100)
+            op_estimates = (
+                op.naive_cost_estimates(source_op_estimates, source_op_estimates)
+                if self.is_llm_join
+                else op.naive_cost_estimates(source_op_estimates)
+            )
+
+            # get op_id for this operator
+            op_id = op.get_op_id()
+
+            # set the naive quality, cost, and time priors for this operator
+            naive_priors[op_id] = {
+                "quality": op_estimates.quality,
+                "cost": op_estimates.cost_per_record,
+                "time": op_estimates.time_per_record,
+            }
+
+        return naive_priors
+
     def _get_op_index_order(self, op_set: list[PhysicalOperator], seed: int) -> list[int]:
         """
         Returns a list of indices for the operators in the op_set.
         """
-        if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
+        # if this is not an llm-operator, we simply return the indices in random order
+        if not self.is_llm_op:
            rng = np.random.default_rng(seed=seed)
            op_indices = np.arange(len(op_set))
            rng.shuffle(op_indices)
            return op_indices
 
+        # if this is an llm-operator, but we do not have priors, we first compute naive priors
+        if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
+            self.priors = self._compute_naive_priors(op_set)
+
         # NOTE: self.priors is a dictionary with format:
         # {op_id: {"quality": quality, "cost": cost, "time": time}}
 
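As the `NOTE` in the hunk above indicates, `_get_op_index_order` now expects `self.priors` to map each operator ID to quality, cost, and time estimates, and `_compute_naive_priors` fills this in from `naive_cost_estimates` whenever user-supplied priors do not cover the op set. Purely for illustration, a priors dictionary with that shape might look as follows; the operator IDs and numbers are invented, not taken from the package.

    # invented example of the {op_id: {"quality", "cost", "time"}} prior format
    priors = {
        "convert-gpt-4o-mini-1a2b": {"quality": 0.85, "cost": 0.0004, "time": 1.2},
        "convert-llama-3-70b-3c4d": {"quality": 0.80, "cost": 0.0001, "time": 0.9},
    }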
@@ -215,7 +250,7 @@ class OpFrontier:
         op_source_indices_pairs = []
 
         # if this operator is not being optimized: we don't request inputs, but simply process what we are given / told to (in the case of scans)
-        if not self.is_llm_join and len(self.frontier_ops) == 1:
+        if not self.is_llm_op and len(self.frontier_ops) == 1:
             return [(self.frontier_ops[0], None)]
 
         # otherwise, sample (operator, source_indices) pairs
@@ -255,16 +290,6 @@ class OpFrontier:
                all_inputs.extend(inputs)
            return [(op, tuple(), all_inputs)]
 
-        # if this is an un-optimized (non-scan, non-join) operator, flatten inputs and run on each one
-        elif not self.is_scan_op and not self.is_llm_join and len(self.frontier_ops) == 1:
-            op_inputs = []
-            op = self.frontier_ops[0]
-            for _, source_indices_to_inputs in self.source_indices_to_inputs.items():
-                for source_indices, inputs in source_indices_to_inputs.items():
-                    for input in inputs:
-                        op_inputs.append((op, source_indices, input))
-            return op_inputs
-
         ### for optimized operators
         # get the list of (op, source_indices) pairs which this operator needs to execute
         op_source_indices_pairs = self._get_op_source_indices_pairs()
@@ -40,8 +40,8 @@ class QueryProcessorConfig(BaseModel):
     use_final_op_quality: bool = Field(default=False)
 
     # sentinel optimization flags
-    k: int = Field(default=5)
-    j: int = Field(default=5)
+    k: int = Field(default=6)
+    j: int = Field(default=4)
     sample_budget: int = Field(default=100)
     seed: int = Field(default=42)
     exp_name: str | None = Field(default=None)
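The sentinel sampling defaults change from `k=5, j=5` to `k=6, j=4`. A short sketch of pinning these knobs explicitly so behavior does not drift with version-specific defaults; the field names come from the hunk above, while the import path is inferred from the RECORD listing below.

    from palimpzest.query.processor.config import QueryProcessorConfig

    # pin the sentinel optimization knobs instead of relying on defaults
    config = QueryProcessorConfig(
        k=6,                 # default in 0.8.2 (was 5 in 0.8.1)
        j=4,                 # default in 0.8.2 (was 5 in 0.8.1)
        sample_budget=100,
        seed=42,
    )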
@@ -114,8 +114,8 @@ class QueryProcessor:
         execution_stats = ExecutionStats(execution_id=self.execution_id())
         execution_stats.start()
 
-        # if the user provides a train_dataset or validator, we perform optimization
-        if self.train_dataset is not None or self.validator is not None:
+        # if the user provides a validator, we perform optimization
+        if self.validator is not None:
             # create sentinel plan
             sentinel_plan = self._create_sentinel_plan(self.train_dataset)
 
@@ -62,13 +62,17 @@ class QueryProcessorFactory:
            print("WARNING: Both `progress` and `verbose` are set to True, but only one can be True at a time; defaulting to `progress=True`")
            config.verbose = False
 
+        # if the user provides a training dataset, but no validator, create a default validator
+        if train_dataset is not None and validator is None:
+            validator = Validator()
+            logger.info("No validator provided; using default Validator")
+
         # boolean flag for whether we're performing optimization or not
-        optimization = train_dataset is not None or validator is not None
-        val_based_opt = train_dataset is None and validator is not None
+        optimization = validator is not None
 
         # handle "auto" default for sentinel execution strategies
         if config.sentinel_execution_strategy == "auto":
-            config.sentinel_execution_strategy = ("validator" if val_based_opt else "mab") if optimization else None
+            config.sentinel_execution_strategy = "mab" if optimization else None
 
         # convert the config values for processing, execution, and optimization strategies to enums
         config = cls._normalize_strategies(config)
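With the two hunks above, optimization is keyed solely on the presence of a `validator`: supplying only a `train_dataset` still triggers optimization because the factory now creates a default `Validator`, and an `"auto"` sentinel execution strategy always resolves to `"mab"` when optimizing. The sketch below shows the equivalent explicit call; `dataset` and `train_data` are placeholders, and the `Validator` import path is inferred from the RECORD listing.

    from palimpzest.validator.validator import Validator

    # passing a validator explicitly mirrors what the factory now does
    # when only a train_dataset is provided
    output = dataset.optimize_and_run(
        train_dataset=train_data,   # placeholder Dataset built elsewhere
        validator=Validator(),      # same default the factory would construct
    )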
@@ -87,7 +91,7 @@ class QueryProcessorFactory:
         # set the final set of available models in the config
         config.available_models = available_models
 
-        return config
+        return config, validator
 
     @classmethod
     def _create_optimizer(cls, config: QueryProcessorConfig) -> Optimizer:
@@ -143,7 +147,7 @@ class QueryProcessorFactory:
            config = QueryProcessorConfig()
 
         # apply any additional keyword arguments to the config and validate its contents
-        config = cls._config_validation_and_normalization(config, train_dataset, validator)
+        config, validator = cls._config_validation_and_normalization(config, train_dataset, validator)
 
         # create the optimizer, execution strateg(ies), and processor
         optimizer = cls._create_optimizer(config)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.8.1
+Version: 0.8.2
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -9,7 +9,7 @@ palimpzest/core/models.py,sha256=fLO4T7x0njNeEbUpbhJm9cdnBva0y0Zw5WGBGdzdS_I,424
 palimpzest/core/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 palimpzest/core/data/context.py,sha256=x1xYyu9qW65dvtK_XayIfv_CgsCEPW6Qe0DTiSf9sjU,16207
 palimpzest/core/data/context_manager.py,sha256=8hAKWD2jhFZgghTu7AYgjkvKDsJUPVxq8g4nG0HWvfo,6150
-palimpzest/core/data/dataset.py,sha256=ShK8KTfghCfuS98oTeAh6UFuvBAllQf1XbGJv91dw1w,28178
+palimpzest/core/data/dataset.py,sha256=M7SxPXzHsfj-ljy_P3ckcJNqGf4RwNxtZI02q_tmL2M,28178
 palimpzest/core/data/index_dataset.py,sha256=adO67DgzHhA4lBME0-h4SjXfdz9UcNMSDGXTpUdKbgE,1929
 palimpzest/core/data/iter_dataset.py,sha256=u7eZNWWT84rH_D8LNIuq0NAnm2roX81ifKTYp-hwY7g,20512
 palimpzest/core/elements/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,7 +37,7 @@ palimpzest/query/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 palimpzest/query/execution/all_sample_execution_strategy.py,sha256=3n2hl8m-WFWIu-a8DiSVsGkz4ej3yB7mSdFR0jsiwAU,14366
 palimpzest/query/execution/execution_strategy.py,sha256=KwBJbWOBOOPBiWRm3ypHcAQiWbCsvtW6UnVU4tHkYz8,18905
 palimpzest/query/execution/execution_strategy_type.py,sha256=vRQBPCQN5_aoyD3TLIeW3VPo15mqF-5RBvEXkENz9FE,987
-palimpzest/query/execution/mab_execution_strategy.py,sha256=LY1JlbYMsnJHCtYjaJ6iklojBqXc2B4KS62lobPFNz0,42341
+palimpzest/query/execution/mab_execution_strategy.py,sha256=paVfB8lqNyUuISqfhkTd6RqOZqpyVty1EAN1sZz7erA,43554
 palimpzest/query/execution/parallel_execution_strategy.py,sha256=Gn5hB5XddX2jCkxx6d7O-DmitK6fbuwBFnnyKhnGYEw,15706
 palimpzest/query/execution/single_threaded_execution_strategy.py,sha256=1eo-Z9G3u92_PjoSX8HmO3D3phYgA8f0Actbgd1-oKY,16247
 palimpzest/query/generators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -70,9 +70,9 @@ palimpzest/query/optimizer/primitives.py,sha256=jMMVq37y1tWiPU1lSSKQP9OP-mzkpSxS
 palimpzest/query/optimizer/rules.py,sha256=9AsuVjhiZUc0snQPNhIqeyKpmqFsSv7e-v6BEbp9CDw,43315
 palimpzest/query/optimizer/tasks.py,sha256=DJcKDNbVJox61rnTW0HgT1PtxGx2P_NiLvNroXie-Lg,29509
 palimpzest/query/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-palimpzest/query/processor/config.py,sha256=w2nkb6UXHw5HOvRZO1KBgb3y_tE2On9gmo_G7YGvnf8,2366
-palimpzest/query/processor/query_processor.py,sha256=W01-2FocN1Jsv58gmEo5ALTIcpLt7D0dmI8kghSCdBk,6291
-palimpzest/query/processor/query_processor_factory.py,sha256=WTp58KrtVzkAaO_cLgnlkr6o7BJk76cvmGcOmsTy_Ww,7896
+palimpzest/query/processor/config.py,sha256=vHVsgeBnKigacO0QA7bLf5q8pJhFWA2j9-p_no2bmYo,2366
+palimpzest/query/processor/query_processor.py,sha256=T4ffPbnOX23G8FDITzmM7Iw7DUEDWIHnwl8XLYllgjg,6240
+palimpzest/query/processor/query_processor_factory.py,sha256=6w9R1Y8AOV22X8MUf7g2G5Qb15BGEZAXQKbCQJafWJ0,8048
 palimpzest/schemabuilder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 palimpzest/schemabuilder/schema_builder.py,sha256=QraGp66dcD-ej6Y2mER40o86G9JqlBkL7swkJzjUAIY,7968
 palimpzest/tools/README.md,sha256=56_6LPG80uc0CLVhTBP6I1wgIffNv9cyTr0TmVZqmrM,483
@@ -88,8 +88,8 @@ palimpzest/utils/progress.py,sha256=7gucyZr82udMDZitrrkAOSKHZVljE3R2wv9nf5gA5TM,
 palimpzest/utils/udfs.py,sha256=LjHic54B1az-rKgNLur0wOpaz2ko_UodjLEJrazkxvY,1854
 palimpzest/validator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 palimpzest/validator/validator.py,sha256=J2tGvJqfg6v5lOQDYYaqAa9d37uVHBrqkNs-a8d1Ic0,16365
-palimpzest-0.8.1.dist-info/licenses/LICENSE,sha256=5GUlHy9lr-Py9kvV38FF1m3yy3NqM18fefuE9wkWumo,1079
-palimpzest-0.8.1.dist-info/METADATA,sha256=iA31ZJnmE0bVBJHjE3scrTM7xeYnhAbfu8FatcgPYzU,7286
-palimpzest-0.8.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-palimpzest-0.8.1.dist-info/top_level.txt,sha256=raV06dJUgohefUn3ZyJS2uqp_Y76EOLA9Y2e_fxt8Ew,11
-palimpzest-0.8.1.dist-info/RECORD,,
+palimpzest-0.8.2.dist-info/licenses/LICENSE,sha256=5GUlHy9lr-Py9kvV38FF1m3yy3NqM18fefuE9wkWumo,1079
+palimpzest-0.8.2.dist-info/METADATA,sha256=bDa2zFfJr_v4Ef6fzq3SCALSoXoXc0uPnefnmVbAzTA,7286
+palimpzest-0.8.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+palimpzest-0.8.2.dist-info/top_level.txt,sha256=raV06dJUgohefUn3ZyJS2uqp_Y76EOLA9Y2e_fxt8Ew,11
+palimpzest-0.8.2.dist-info/RECORD,,