palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
  57. palimpzest-0.7.0.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.3.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
palimpzest/query/processor/query_processor.py

@@ -1,19 +1,17 @@
+import logging
 from abc import abstractmethod
-from concurrent.futures import ThreadPoolExecutor
 
-from palimpzest.core.data.dataclasses import PlanStats, RecordOpStats
+from palimpzest.core.data.dataclasses import PlanStats
 from palimpzest.core.data.datareaders import DataReader
 from palimpzest.core.elements.records import DataRecord, DataRecordCollection
 from palimpzest.policy import Policy
-from palimpzest.query.optimizer.cost_model import CostModel
+from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
 from palimpzest.query.optimizer.optimizer import Optimizer
-from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
-from palimpzest.query.optimizer.plan import PhysicalPlan
-from palimpzest.query.processor.config import QueryProcessorConfig
-from palimpzest.sets import Dataset, Set
+from palimpzest.sets import Dataset
 from palimpzest.utils.hash_helpers import hash_for_id
 from palimpzest.utils.model_helpers import get_models
 
+logger = logging.getLogger(__name__)
 
 class QueryProcessor:
     """
@@ -25,9 +23,18 @@ class QueryProcessor:
     def __init__(
         self,
         dataset: Dataset,
-        optimizer: Optimizer = None,
-        config: QueryProcessorConfig = None,
-        *args,
+        optimizer: Optimizer,
+        execution_strategy: ExecutionStrategy,
+        sentinel_execution_strategy: SentinelExecutionStrategy | None,
+        num_samples: int | None = None,
+        val_datasource: DataReader | None = None,
+        scan_start_idx: int = 0,
+        cache: bool = False,
+        verbose: bool = False,
+        progress: bool = True,
+        max_workers: int | None = None,
+        policy: Policy | None = None,
+        available_models: list[str] | None = None,
         **kwargs,
     ):
         """
@@ -35,48 +42,32 @@
 
        Args:
            dataset: Dataset to process
-           optimizer: Custom optimizer (optional)
-           execution_engine: Custom execution engine (optional)
-           config: Configuration dictionary for default components
+           TODO
        """
-        assert config is not None, "QueryProcessorConfig is required for QueryProcessor"
-
-        self.config = config or QueryProcessorConfig()
         self.dataset = dataset
-        self.datareader = self._get_datareader(self.dataset)
-        self.num_samples = self.config.num_samples
-        self.val_datasource = self.config.val_datasource
-        self.scan_start_idx = self.config.scan_start_idx
-        self.nocache = self.config.nocache
-        self.verbose = self.config.verbose
-        self.max_workers = self.config.max_workers
-        self.num_workers_per_plan = self.config.num_workers_per_plan
-        self.min_plans = self.config.min_plans
+        self.optimizer = optimizer
+        self.execution_strategy = execution_strategy
+        self.sentinel_execution_strategy = sentinel_execution_strategy
 
-        self.policy = self.config.policy
+        self.num_samples = num_samples
+        self.val_datasource = val_datasource
+        self.scan_start_idx = scan_start_idx
+        self.cache = cache
+        self.verbose = verbose
+        self.progress = progress
+        self.max_workers = max_workers
 
-        self.available_models = self.config.available_models
+        self.policy = policy
+
+        self.available_models = available_models
         if self.available_models is None or len(self.available_models) == 0:
             self.available_models = get_models(include_vision=True)
 
         if self.verbose:
             print("Available models: ", self.available_models)
 
-        # Initialize optimizer and execution engine
-        # TODO: config currently has optimizer field which is string.
-        # In this case, we only use the initialized optimizer. Later after we split the config to multiple configs, there won't be such confusion.
-        assert optimizer is not None, "Optimizer is required. Please use QueryProcessorFactory.create_processor() to initialize a QueryProcessor."
-        self.optimizer = optimizer
-
-    def _get_datareader(self, dataset: Set | DataReader) -> DataReader:
-        """
-        Gets the DataReader for the given dataset.
-        """
-        # iterate until we reach DataReader
-        while isinstance(dataset, Set):
-            dataset = dataset._source
-
-        return dataset
+        logger.info(f"Initialized QueryProcessor {self.__class__.__name__}")
+        logger.debug(f"QueryProcessor initialized with config: {self.__dict__}")
 
     def execution_id(self) -> str:
         """
@@ -89,177 +80,18 @@
 
         return hash_for_id(id_str)
 
-    def get_max_quality_plan_id(self, plans: list[PhysicalPlan]) -> str:
-        """
-        Return the plan_id for the plan with the highest quality in the list of plans.
-        """
-        max_quality_plan_id, max_quality = None, -1
-        for plan in plans:
-            if plan.quality > max_quality or max_quality_plan_id is None:
-                max_quality_plan_id = plan.plan_id
-                max_quality = plan.quality
-
-        return max_quality_plan_id
-
-    def aggregate_plan_stats(self, plan_stats: list[PlanStats]) -> dict[str, PlanStats]:
-        """
-        Aggregate a list of plan stats into a dictionary mapping plan_id --> cumulative plan stats.
-
-        NOTE: we make the assumption that the same plan cannot be run more than once in parallel,
-        i.e. each plan stats object for an individual plan comes from two different (sequential)
-        periods in time. Thus, PlanStats' total_plan_time(s) can be summed.
-        """
-        agg_plan_stats = {}
-        for ps in plan_stats:
-            if ps.plan_id in agg_plan_stats:
-                agg_plan_stats[ps.plan_id] += ps
-            else:
-                agg_plan_stats[ps.plan_id] = ps
-
-        return agg_plan_stats
-
-    def execute_plans(
-        self, plans: list[PhysicalPlan], max_quality_plan_id: str, num_samples: int | float = float("inf")
-    ):
-        """
-        Execute a given list of plans for num_samples records each. Plans are executed in parallel.
-        If any workers are unused, then additional workers are distributed evenly among plans.
-        """
-        # compute number of plans
-        num_plans = len(plans)
-
-        # set plan_parallel_workers and workers_per_plan;
-        # - plan_parallel_workers controls how many plans are executed in parallel
-        # - workers_per_plan controls how many threads are assigned to executing each plan
-        plan_parallel_workers, workers_per_plan = None, None
-        if self.max_workers <= num_plans:
-            plan_parallel_workers = self.max_workers
-            workers_per_plan = [1 for _ in range(num_plans)]
-        else:
-            plan_parallel_workers = num_plans
-            workers_per_plan = [(self.max_workers // num_plans) for _ in range(num_plans)]
-            idx = 0
-            while sum(workers_per_plan) < self.max_workers:
-                workers_per_plan[idx] += 1
-                idx += 1
-
-        with ThreadPoolExecutor(max_workers=plan_parallel_workers) as executor:
-            results = list(executor.map(lambda x: self.execute_plan(**x),
-                    [{"plan": plan,
-                      "num_samples": num_samples,
-                      "plan_workers": plan_workers}
-                     for plan, plan_workers in zip(plans, workers_per_plan)],
-                )
-            )
-        # results = list(map(lambda x: self.execute_plan(**x),
-        #         [{"plan": plan,
-        #           "num_samples": num_samples,
-        #           "plan_workers": 1}
-        #          for plan in plans],
-        #     )
-        # )
-        # split results into per-plan records and plan stats
-        all_records, all_plan_stats = zip(*results)
-
-        # process results to get sample execution data and sentinel plan stats
-        all_sample_execution_data, return_records = [], []
-        for records, plan_stats, plan in zip(all_records, all_plan_stats, plans):
-            # aggregate sentinel est. data
-            for operator_stats in plan_stats.operator_stats.values():
-                all_sample_execution_data.extend(operator_stats.record_op_stats_lst)
-
-            # if this is the max quality plan for this set of plans, return its results for these records
-            if plan.plan_id == max_quality_plan_id:
-                return_records = records
-
-        return all_sample_execution_data, return_records, all_plan_stats
-
-    def _execute_best_plan(
-        self,
-        dataset: Dataset,
-        policy: Policy,
-        optimizer: Optimizer,
-        execution_data: list[RecordOpStats] | None = None,
-    ) -> tuple[list[DataRecord], list[PlanStats]]:
+    def _execute_best_plan(self, dataset: Dataset, optimizer: Optimizer) -> tuple[list[DataRecord], list[PlanStats]]:
         # get the optimal plan according to the optimizer
-        plans = optimizer.optimize(dataset, policy)
+        plans = optimizer.optimize(dataset)
         final_plan = plans[0]
+
         # execute the plan
-        # TODO: for some reason this is not picking up change to self.max_workers from PipelinedParallelPlanExecutor.__init__()
-        records, plan_stats = self.execute_plan(
-            plan=final_plan,
-            plan_workers=self.max_workers,
-        )
+        records, plan_stats = self.execution_strategy.execute_plan(plan=final_plan)
 
         # return the output records and plan stats
         return records, [plan_stats]
-
-    def _execute_with_strategy(
-        self,
-        dataset: Dataset,
-        policy: Policy,
-        optimizer: Optimizer,
-        execution_data: list[RecordOpStats] | None = None,
-    ) -> tuple[list[DataRecord], list[PlanStats]]:
-        records, plan_stats = [], []
-        if optimizer.optimization_strategy_type == OptimizationStrategyType.CONFIDENCE_INTERVAL:
-            records, plan_stats = self._execute_confidence_interval_strategy(dataset, policy, optimizer, execution_data)
-        else:
-            records, plan_stats = self._execute_best_plan(dataset, policy, optimizer, execution_data)
-        return records, plan_stats
-
-
-    def _execute_confidence_interval_strategy(
-        self,
-        dataset: Dataset,
-        policy: Policy,
-        optimizer: Optimizer,
-        execution_data: list[RecordOpStats] | None = None,
-    ) -> tuple[list[DataRecord], list[PlanStats]]:
-        # initialize output records and plan stats
-        if execution_data is None:
-            execution_data = []
-        records, plan_stats = [], []
-
-        # get the initial set of optimal plans according to the optimizer
-        plans = optimizer.optimize(dataset, policy)
-        while len(plans) > 1 and self.scan_start_idx < len(self.datareader):
-            # identify the plan with the highest quality in the set
-            max_quality_plan_id = self.get_max_quality_plan_id(plans)
-
-            # execute the set of plans for a fixed number of samples
-            new_execution_data, new_records, new_plan_stats = self.execute_plans(
-                list(plans), max_quality_plan_id, self.num_samples
-            )
-            records.extend(new_records)
-            plan_stats.extend(new_plan_stats)
-
-            if self.scan_start_idx + self.num_samples < len(self.datareader):
-                # update cost model and optimizer
-                execution_data.extend(new_execution_data)
-                cost_model = CostModel(sample_execution_data=execution_data)
-                optimizer.update_cost_model(cost_model)
-
-                # get new set of plans
-                plans = optimizer.optimize(dataset, policy)
-
-            # update scan start idx
-            self.scan_start_idx += self.num_samples
-
-        if self.scan_start_idx < len(self.datareader):
-            # execute final plan until end
-            final_plan = plans[0]
-            new_records, new_plan_stats = self.execute_plan(
-                plan=final_plan,
-                plan_workers=self.max_workers,
-            )
-            records.extend(new_records)
-            plan_stats.append(new_plan_stats)
-
-        # return the final set of records and plan stats
-        return records, plan_stats
 
     # TODO: consider to support dry_run.
     @abstractmethod
     def execute(self) -> DataRecordCollection:
-        raise NotImplementedError("Abstract method to be overwritten by sub-classes")
+        raise NotImplementedError("Abstract method to be overwritten by sub-classes")
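
In 0.7.0, `QueryProcessor.__init__` takes the `Optimizer` and execution strategies directly instead of a `QueryProcessorConfig`, and `_execute_best_plan` delegates plan execution to the injected `ExecutionStrategy`. A minimal sketch of a concrete subclass under the new interface (the `MinimalProcessor` name is hypothetical, and constructing `DataRecordCollection` without execution stats is an assumption; in practice processors are instantiated via `QueryProcessorFactory`, not directly):

    from palimpzest.core.elements.records import DataRecordCollection
    from palimpzest.query.processor.query_processor import QueryProcessor

    class MinimalProcessor(QueryProcessor):
        """Hypothetical subclass illustrating the abstract execute() hook."""

        def execute(self) -> DataRecordCollection:
            # optimize the dataset, then run the single best plan through the
            # ExecutionStrategy that the factory injected at construction time
            records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)
            return DataRecordCollection(records)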

palimpzest/query/processor/query_processor_factory.py

@@ -1,69 +1,123 @@
+import logging
 from enum import Enum
 
 from palimpzest.core.elements.records import DataRecordCollection
-from palimpzest.query.execution.execution_strategy import ExecutionStrategyType
+from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
+from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType, SentinelExecutionStrategyType
 from palimpzest.query.optimizer.cost_model import CostModel
 from palimpzest.query.optimizer.optimizer import Optimizer
-from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
+from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
 from palimpzest.query.processor.config import QueryProcessorConfig
-from palimpzest.query.processor.mab_sentinel_processor import (
-    MABSentinelPipelinedParallelProcessor,
-    MABSentinelSequentialSingleThreadProcessor,
-)
-from palimpzest.query.processor.nosentinel_processor import (
-    NoSentinelPipelinedParallelProcessor,
-    NoSentinelPipelinedSingleThreadProcessor,
-    NoSentinelSequentialSingleThreadProcessor,
-)
+from palimpzest.query.processor.processing_strategy_type import ProcessingStrategyType
 from palimpzest.query.processor.query_processor import QueryProcessor
-from palimpzest.query.processor.random_sampling_sentinel_processor import (
-    RandomSamplingSentinelPipelinedParallelProcessor,
-    RandomSamplingSentinelSequentialSingleThreadProcessor,
-)
-from palimpzest.query.processor.streaming_processor import StreamingQueryProcessor
 from palimpzest.sets import Dataset, Set
 from palimpzest.utils.model_helpers import get_models
 
+logger = logging.getLogger(__name__)
 
-class ProcessingStrategyType(Enum):
-    """How to generate and optimize query plans"""
-    MAB_SENTINEL = "mab_sentinel"
-    NO_SENTINEL = "nosentinel"
-    RANDOM_SAMPLING = "random_sampling"
-    STREAMING = "streaming"
-    AUTO = "auto"
 
-def convert_to_enum(enum_type: type[Enum], value: str) -> Enum:
-    if value == "pipelined":
-        value = "pipelined_single_thread"
-    value = value.upper().replace('-', '_')
-    try:
-        return enum_type[value]
-    except KeyError as e:
-        raise ValueError(f"Unsupported {enum_type.__name__}: {value}") from e
+class QueryProcessorFactory:
 
+    @classmethod
+    def _convert_to_enum(cls, enum_type: type[Enum], value: str) -> Enum:
+        value = value.upper().replace('-', '_')
+        try:
+            return enum_type[value]
+        except KeyError as e:
+            raise ValueError(f"Unsupported {enum_type.__name__}: {value}") from e
 
-class QueryProcessorFactory:
-    PROCESSOR_MAPPING = {
-        (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.SEQUENTIAL):
-            NoSentinelSequentialSingleThreadProcessor,
-        (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.PIPELINED_SINGLE_THREAD):
-            NoSentinelPipelinedSingleThreadProcessor,
-        (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.PIPELINED_PARALLEL):
-            NoSentinelPipelinedParallelProcessor,
-        (ProcessingStrategyType.MAB_SENTINEL, ExecutionStrategyType.SEQUENTIAL):
-            MABSentinelSequentialSingleThreadProcessor,
-        (ProcessingStrategyType.MAB_SENTINEL, ExecutionStrategyType.PIPELINED_PARALLEL):
-            MABSentinelPipelinedParallelProcessor,
-        (ProcessingStrategyType.STREAMING, ExecutionStrategyType.SEQUENTIAL):
-            StreamingQueryProcessor,
-        (ProcessingStrategyType.STREAMING, ExecutionStrategyType.PIPELINED_PARALLEL):
-            StreamingQueryProcessor,
-        (ProcessingStrategyType.RANDOM_SAMPLING, ExecutionStrategyType.SEQUENTIAL):
-            RandomSamplingSentinelSequentialSingleThreadProcessor,
-        (ProcessingStrategyType.RANDOM_SAMPLING, ExecutionStrategyType.PIPELINED_PARALLEL):
-            RandomSamplingSentinelPipelinedParallelProcessor,
-    }
+    @classmethod
+    def _normalize_strategies(cls, config: QueryProcessorConfig):
+        """
+        Convert the string representation of each strategy into its Enum equivalent and throw
+        an exception if the conversion fails.
+        """
+        strategy_types = {
+            "processing_strategy": ProcessingStrategyType,
+            "execution_strategy": ExecutionStrategyType,
+            "sentinel_execution_strategy": SentinelExecutionStrategyType,
+            "optimizer_strategy": OptimizationStrategyType,
+        }
+        for strategy in ["processing_strategy", "execution_strategy", "sentinel_execution_strategy", "optimizer_strategy"]:
+            strategy_str = getattr(config, strategy)
+            strategy_type = strategy_types[strategy]
+            strategy_enum = None
+            if strategy_str is not None:
+                try:
+                    strategy_enum = cls._convert_to_enum(strategy_type, strategy_str)
+                except ValueError as e:
+                    raise ValueError(f"""Unsupported {strategy}: {strategy_str}.
+                    The supported strategies are: {strategy_type.__members__.keys()}""") from e
+            setattr(config, strategy, strategy_enum)
+            logger.debug(f"Normalized {strategy}: {strategy_enum}")
+
+        return config
+
+    @classmethod
+    def _config_validation_and_normalization(cls, config: QueryProcessorConfig):
+        if config.policy is None:
+            raise ValueError("Policy is required for optimizer")
+
+        if config.cache:
+            raise ValueError("cache=True is not supported yet")
+
+        # only one of progress or verbose can be set; we will default to progress=True
+        if config.progress and config.verbose:
+            print("WARNING: Both `progress` and `verbose` are set to True, but only one can be True at a time; defaulting to `progress=True`")
+            config.verbose = False
+
+        # handle "auto" defaults for processing and sentinel execution strategies
+        if config.processing_strategy == "auto":
+            config.processing_strategy = "no_sentinel" if config.val_datasource is None else "sentinel"
+
+        if config.sentinel_execution_strategy == "auto":
+            config.sentinel_execution_strategy = None if config.val_datasource is None else "mab"
+
+        # convert the config values for processing, execution, and optimization strategies to enums
+        config = cls._normalize_strategies(config)
+
+        # check that processor uses a supported execution strategy
+        if config.execution_strategy not in config.processing_strategy.valid_execution_strategies():
+            raise ValueError(f"Unsupported `execution_strategy` {config.execution_strategy} for `processing_strategy` {config.processing_strategy}.")
+
+        # check that validation data is provided for sentinel execution
+        if config.val_datasource is None and config.processing_strategy.is_sentinel_strategy():
+            raise ValueError("`val_datasource` is required for SENTINEL processing strategies")
+
+        # check that sentinel execution is provided for sentinel processor
+        if config.sentinel_execution_strategy is None and config.processing_strategy.is_sentinel_strategy():
+            raise ValueError("`sentinel_execution_strategy` is required for SENTINEL processing strategies")
+
+        # get available models
+        available_models = getattr(config, 'available_models', [])
+        if available_models is None or len(available_models) == 0:
+            available_models = get_models(include_vision=True)
+        config.available_models = available_models
+
+        return config
+
+    @classmethod
+    def _create_optimizer(cls, config: QueryProcessorConfig) -> Optimizer:
+        return Optimizer(cost_model=CostModel(), **config.to_dict())
+
+    @classmethod
+    def _create_execution_strategy(cls, config: QueryProcessorConfig) -> ExecutionStrategy:
+        """
+        Creates an execution strategy based on the configuration.
+        """
+        execution_strategy_cls = config.execution_strategy.value
+        return execution_strategy_cls(**config.to_dict())
+
+    @classmethod
+    def _create_sentinel_execution_strategy(cls, config: QueryProcessorConfig) -> SentinelExecutionStrategy:
+        """
+        Creates an execution strategy based on the configuration.
+        """
+        if config.sentinel_execution_strategy is None:
+            return None
+
+        sentinel_execution_strategy_cls = config.sentinel_execution_strategy.value
+        return sentinel_execution_strategy_cls(**config.to_dict())
 
     @classmethod
     def create_processor(
@@ -83,91 +137,24 @@ class QueryProcessorFactory:
         if config is None:
             config = QueryProcessorConfig()
 
-        # apply any additional keyword arguments to the config
+        # apply any additional keyword arguments to the config and validate its contents
         config.update(**kwargs)
-
         config = cls._config_validation_and_normalization(config)
-        processing_strategy, execution_strategy, optimizer_strategy = cls._normalize_strategies(config)
-        optimizer = cls._create_optimizer(optimizer_strategy, config)
 
-        processor_key = (processing_strategy, execution_strategy)
-        processor_cls = cls.PROCESSOR_MAPPING.get(processor_key)
+        # create the optimizer, execution strateg(ies), and processor
+        optimizer = cls._create_optimizer(config)
+        config.execution_strategy = cls._create_execution_strategy(config)
+        config.sentinel_execution_strategy = cls._create_sentinel_execution_strategy(config)
+        processor_cls = config.processing_strategy.value
+        processor = processor_cls(dataset, optimizer, **config.to_dict())
 
-        if processor_cls is None:
-            raise ValueError(f"Unsupported combination of processing strategy {processing_strategy} "
-                             f"and execution strategy {execution_strategy}")
-
-        return processor_cls(dataset=dataset, optimizer=optimizer, config=config, **kwargs)
+        return processor
 
     @classmethod
     def create_and_run_processor(cls, dataset: Dataset, config: QueryProcessorConfig | None = None, **kwargs) -> DataRecordCollection:
         # TODO(Jun): Consider to use cache here.
+        logger.info(f"Creating processor for dataset: {dataset}")
         processor = cls.create_processor(dataset=dataset, config=config, **kwargs)
-        return processor.execute()
+        logger.info(f"Created processor: {processor}")
 
-    #TODO(Jun): The all avaliable plans could be generated earlier and outside Optimizer.
-    @classmethod
-    def _create_optimizer(cls, optimizer_strategy: OptimizationStrategyType, config: QueryProcessorConfig) -> Optimizer:
-        available_models = getattr(config, 'available_models', []) or get_models(include_vision=True)
-
-        if config.policy is None:
-            raise ValueError("Policy is required for optimizer")
-
-        return Optimizer(
-            policy=config.policy,
-            cost_model=CostModel(),
-            no_cache=config.nocache,
-            verbose=config.verbose,
-            available_models=available_models,
-            allow_bonded_query=config.allow_bonded_query,
-            allow_conventional_query=config.allow_conventional_query,
-            allow_code_synth=config.allow_code_synth,
-            allow_token_reduction=config.allow_token_reduction,
-            allow_rag_reduction=config.allow_rag_reduction,
-            allow_mixtures=config.allow_mixtures,
-            allow_critic=config.allow_critic,
-            optimization_strategy_type=optimizer_strategy,
-            use_final_op_quality=config.use_final_op_quality
-        )
-
-    @classmethod
-    def _normalize_strategies(cls, config: QueryProcessorConfig):
-        processing_strategy, execution_strategy, optimizer_strategy = config.processing_strategy, config.execution_strategy, config.optimizer_strategy
-
-        if isinstance(processing_strategy, str):
-            try:
-                processing_strategy = convert_to_enum(ProcessingStrategyType, processing_strategy)
-            except ValueError as e:
-                raise ValueError(f"""Unsupported processing strategy: {processing_strategy}.
-                The supported strategies are: {ProcessingStrategyType.__members__.keys()}""") from e
-        if isinstance(execution_strategy, str):
-            try:
-                execution_strategy = convert_to_enum(ExecutionStrategyType, execution_strategy)
-            except ValueError as e:
-                raise ValueError(f"""Unsupported execution strategy: {execution_strategy}.
-                The supported strategies are: {ExecutionStrategyType.__members__.keys()}""") from e
-        if isinstance(optimizer_strategy, str):
-            try:
-                optimizer_strategy = convert_to_enum(OptimizationStrategyType, optimizer_strategy)
-            except ValueError as e:
-                raise ValueError(f"""Unsupported optimizer strategy: {optimizer_strategy}.
-                The supported strategies are: {OptimizationStrategyType.__members__.keys()}""") from e
-        return processing_strategy, execution_strategy, optimizer_strategy
-
-    @classmethod
-    def _config_validation_and_normalization(cls, config: QueryProcessorConfig):
-        if config.policy is None:
-            raise ValueError("Policy is required for optimizer")
-
-        if not config.nocache:
-            raise ValueError("nocache=False is not supported yet")
-
-        if config.val_datasource is None and config.processing_strategy in [ProcessingStrategyType.MAB_SENTINEL, ProcessingStrategyType.RANDOM_SAMPLING]:
-            raise ValueError("val_datasource is required for MAB_SENTINEL and RANDOM_SAMPLING processing strategies")
-
-        available_models = getattr(config, 'available_models', [])
-        if available_models is None or len(available_models) == 0:
-            available_models = get_models(include_vision=True)
-        config.available_models = available_models
-
-        return config
+        return processor.execute()
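
The new factory path is: apply kwargs to the config, validate and normalize it (resolving "auto" defaults and converting strategy strings to enums), build the optimizer and execution strateg(ies), then instantiate the processor class stored on the `ProcessingStrategyType` enum value. A hedged usage sketch of the public entrypoint (the `MaxQuality` policy class is an assumption about palimpzest's policy module, and `dataset` is a palimpzest Dataset defined elsewhere; the `"no_sentinel"` string is taken from the "auto" default above):

    from palimpzest.policy import MaxQuality  # assumed concrete Policy
    from palimpzest.query.processor.config import QueryProcessorConfig
    from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory

    config = QueryProcessorConfig(
        policy=MaxQuality(),                # required; validation raises without a policy
        processing_strategy="no_sentinel",  # upper-cased and matched against ProcessingStrategyType
        progress=True,                      # progress and verbose cannot both be True
    )
    result = QueryProcessorFactory.create_and_run_processor(dataset, config=config)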

palimpzest/query/processor/sentinel_processor.py

@@ -0,0 +1,90 @@
+import logging
+
+from palimpzest.core.data.dataclasses import ExecutionStats, SentinelPlanStats
+from palimpzest.core.elements.records import DataRecordCollection
+from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
+from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
+from palimpzest.query.optimizer.plan import SentinelPlan
+from palimpzest.query.processor.query_processor import QueryProcessor
+
+logger = logging.getLogger(__name__)
+
+class SentinelQueryProcessor(QueryProcessor):
+
+    def _generate_sample_observations(self, sentinel_plan: SentinelPlan) -> SentinelPlanStats:
+        """
+        This function is responsible for generating sample observation data which can be
+        consumed by the CostModel.
+
+        To accomplish this, we construct a special sentinel plan using the Optimizer which is
+        capable of executing any valid physical implementation of a Filter or Convert operator
+        on each record.
+        """
+        # if we're using validation data, get the set of expected output records
+        expected_outputs = {}
+        for source_idx in range(len(self.val_datasource)):
+            expected_output = self.val_datasource[source_idx]
+            expected_outputs[source_idx] = expected_output
+
+        # execute sentinel plan; returns sentinel_plan_stats
+        return self.sentinel_execution_strategy.execute_sentinel_plan(sentinel_plan, expected_outputs)
+
+    def _create_sentinel_plan(self) -> SentinelPlan:
+        """
+        Generates and returns a SentinelPlan for the given dataset.
+        """
+        # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up
+
+        # create a new optimizer and update its strategy to SENTINEL
+        optimizer = self.optimizer.deepcopy_clean()
+        optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
+
+        # create copy of dataset, but change its data source to the validation data source
+        dataset = self.dataset.copy()
+        dataset._set_data_source(self.val_datasource)
+
+        # get the sentinel plan for the given dataset
+        sentinel_plans = optimizer.optimize(dataset)
+        sentinel_plan = sentinel_plans[0]
+
+        return sentinel_plan
+
+    def execute(self) -> DataRecordCollection:
+        # for now, enforce that we are using validation data; we can relax this after paper submission
+        if self.val_datasource is None:
+            raise Exception("Make sure you are using validation data with SentinelQueryProcessor")
+        logger.info(f"Executing {self.__class__.__name__}")
+
+        # create execution stats
+        execution_stats = ExecutionStats(execution_id=self.execution_id())
+        execution_stats.start()
+
+        # create sentinel plan
+        sentinel_plan = self._create_sentinel_plan()
+
+        # generate sample execution data
+        sentinel_plan_stats = self._generate_sample_observations(sentinel_plan)
+
+        # update the execution stats to account for the work done in optimization
+        execution_stats.add_plan_stats(sentinel_plan_stats)
+        execution_stats.finish_optimization()
+
+        # (re-)initialize the optimizer
+        optimizer = self.optimizer.deepcopy_clean()
+
+        # construct the CostModel with any sample execution data we've gathered
+        cost_model = SampleBasedCostModel(sentinel_plan_stats, self.verbose)
+        optimizer.update_cost_model(cost_model)
+
+        # execute plan(s) according to the optimization strategy
+        records, plan_stats = self._execute_best_plan(self.dataset, optimizer)
+
+        # update the execution stats to account for the work to execute the final plan
+        execution_stats.add_plan_stats(plan_stats)
+        execution_stats.finish()
+
+        # construct and return the DataRecordCollection
+        result = DataRecordCollection(records, execution_stats=execution_stats)
+        logger.info("Done executing SentinelQueryProcessor")
+
+        return result
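
`SentinelQueryProcessor.execute()` therefore splits the total work into an optimization phase (run the `SentinelPlan` on the validation data, fit a `SampleBasedCostModel`) and an execution phase (re-optimize with the fitted cost model, run the best plan). A sketch of opting into this path through the factory, using the strings the "auto" defaults resolve to (`"sentinel"`, `"mab"`); `val_reader` and the `MaxQuality` policy class are assumptions:

    from palimpzest.policy import MaxQuality  # assumed concrete Policy
    from palimpzest.query.processor.config import QueryProcessorConfig
    from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory

    config = QueryProcessorConfig(
        policy=MaxQuality(),                # assumed, as above
        val_datasource=val_reader,          # DataReader holding expected outputs per source_idx
        processing_strategy="sentinel",     # resolves to SentinelQueryProcessor
        sentinel_execution_strategy="mab",  # multi-armed-bandit sampling strategy
    )
    result = QueryProcessorFactory.create_and_run_processor(dataset, config=config)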