palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. palimpzest/constants.py +13 -4
  2. palimpzest/core/data/dataset.py +75 -5
  3. palimpzest/core/elements/groupbysig.py +5 -1
  4. palimpzest/core/elements/records.py +16 -7
  5. palimpzest/core/lib/schemas.py +26 -3
  6. palimpzest/core/models.py +4 -4
  7. palimpzest/prompts/aggregate_prompts.py +99 -0
  8. palimpzest/prompts/prompt_factory.py +162 -75
  9. palimpzest/prompts/utils.py +38 -1
  10. palimpzest/prompts/validator.py +24 -24
  11. palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  12. palimpzest/query/execution/execution_strategy.py +8 -8
  13. palimpzest/query/execution/mab_execution_strategy.py +30 -11
  14. palimpzest/query/execution/parallel_execution_strategy.py +31 -7
  15. palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
  16. palimpzest/query/generators/generators.py +9 -7
  17. palimpzest/query/operators/__init__.py +10 -6
  18. palimpzest/query/operators/aggregate.py +394 -10
  19. palimpzest/query/operators/convert.py +1 -1
  20. palimpzest/query/operators/join.py +279 -23
  21. palimpzest/query/operators/logical.py +36 -11
  22. palimpzest/query/operators/mixture_of_agents.py +3 -1
  23. palimpzest/query/operators/physical.py +5 -2
  24. palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
  25. palimpzest/query/optimizer/__init__.py +11 -3
  26. palimpzest/query/optimizer/cost_model.py +5 -5
  27. palimpzest/query/optimizer/optimizer.py +3 -2
  28. palimpzest/query/optimizer/plan.py +2 -3
  29. palimpzest/query/optimizer/rules.py +73 -13
  30. palimpzest/query/optimizer/tasks.py +4 -4
  31. palimpzest/utils/progress.py +19 -17
  32. palimpzest/validator/validator.py +7 -7
  33. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
  34. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/RECORD +37 -36
  35. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
  36. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0
palimpzest/constants.py CHANGED
@@ -136,13 +136,17 @@ class PromptStrategy(str, Enum):
136
136
  performing some task with a specified Model.
137
137
  """
138
138
 
139
+ # aggregation prompt strategies
140
+ AGG = "aggregation"
141
+ AGG_NO_REASONING = "aggregation-no-reasoning"
142
+
139
143
  # filter prompt strategies
140
144
  FILTER = "filter"
141
145
  FILTER_NO_REASONING = "filter-no-reasoning"
142
146
  FILTER_CRITIC = "filter-critic"
143
147
  FILTER_REFINE = "filter-refine"
144
148
  FILTER_MOA_PROPOSER = "filter-mixture-of-agents-proposer"
145
- FILTER_MOA_AGG = "filter-mixture-of-agents-aggregation"
149
+ FILTER_MOA_AGG = "filter-mixture-of-agents-aggregator"
146
150
  FILTER_SPLIT_PROPOSER = "filter-split-proposer"
147
151
  FILTER_SPLIT_MERGER = "filter-split-merger"
148
152
 
@@ -156,10 +160,13 @@ class PromptStrategy(str, Enum):
156
160
  MAP_CRITIC = "map-critic"
157
161
  MAP_REFINE = "map-refine"
158
162
  MAP_MOA_PROPOSER = "map-mixture-of-agents-proposer"
159
- MAP_MOA_AGG = "map-mixture-of-agents-aggregation"
163
+ MAP_MOA_AGG = "map-mixture-of-agents-aggregator"
160
164
  MAP_SPLIT_PROPOSER = "map-split-proposer"
161
165
  MAP_SPLIT_MERGER = "map-split-merger"
162
166
 
167
+ def is_agg_prompt(self):
168
+ return "aggregation" in self.value
169
+
163
170
  def is_filter_prompt(self):
164
171
  return "filter" in self.value
165
172
 
@@ -179,7 +186,7 @@ class PromptStrategy(str, Enum):
179
186
  return "mixture-of-agents-proposer" in self.value
180
187
 
181
188
  def is_moa_aggregator_prompt(self):
182
- return "mixture-of-agents-aggregation" in self.value
189
+ return "mixture-of-agents-aggregator" in self.value
183
190
 
184
191
  def is_split_proposer_prompt(self):
185
192
  return "split-proposer" in self.value
@@ -200,7 +207,9 @@ class Modality(str, Enum):
200
207
  class AggFunc(str, Enum):
201
208
  COUNT = "count"
202
209
  AVERAGE = "average"
203
-
210
+ SUM = "sum"
211
+ MIN = "min"
212
+ MAX = "max"
204
213
 
205
214
  class Cardinality(str, Enum):
206
215
  ONE_TO_ONE = "one-to-one"
palimpzest/core/data/dataset.py CHANGED
@@ -22,7 +22,7 @@ from palimpzest.query.operators.logical import (
22
22
  LimitScan,
23
23
  LogicalOperator,
24
24
  Project,
25
- RetrieveScan,
25
+ TopKScan,
26
26
  )
27
27
  from palimpzest.query.processor.config import QueryProcessorConfig
28
28
  from palimpzest.utils.hash_helpers import hash_for_serialized_dict
@@ -243,7 +243,30 @@ class Dataset:
243
243
  id=self.id,
244
244
  )
245
245
 
246
- def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
246
+ def join(self, other: Dataset, on: str | list[str], how: str = "inner") -> Dataset:
247
+ """
248
+ Perform the specified join on the specified (list of) column(s)
249
+ """
250
+ # enforce type for on
251
+ if isinstance(on, str):
252
+ on = [on]
253
+
254
+ # construct new output schema
255
+ combined_schema = union_schemas([self.schema, other.schema], join=True, on=on)
256
+
257
+ # construct logical operator
258
+ operator = JoinOp(
259
+ input_schema=combined_schema,
260
+ output_schema=combined_schema,
261
+ condition="",
262
+ on=on,
263
+ how=how,
264
+ depends_on=on,
265
+ )
266
+
267
+ return Dataset(sources=[self, other], operator=operator, schema=combined_schema)
268
+
269
+ def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None, how: str = "inner") -> Dataset:
247
270
  """
248
271
  Perform a semantic (inner) join on the specified join predicate
249
272
  """
@@ -259,6 +282,7 @@ class Dataset:
259
282
  input_schema=combined_schema,
260
283
  output_schema=combined_schema,
261
284
  condition=condition,
285
+ how=how,
262
286
  desc=desc,
263
287
  depends_on=depends_on,
264
288
  )
@@ -346,7 +370,6 @@ class Dataset:
346
370
 
347
371
  return Dataset(sources=[self], operator=operator, schema=new_output_schema)
348
372
 
349
-
350
373
  def sem_add_columns(self, cols: list[dict] | type[BaseModel],
351
374
  cardinality: Cardinality = Cardinality.ONE_TO_ONE,
352
375
  desc: str | None = None,
@@ -534,12 +557,59 @@ class Dataset:
534
557
  operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.AVERAGE)
535
558
  return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
536
559
 
560
+ def sum(self) -> Dataset:
561
+ """Apply a summation to this set"""
562
+ operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.SUM)
563
+ return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
564
+
565
+ def min(self) -> Dataset:
566
+ """Apply an min operator to this set"""
567
+ operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MIN)
568
+ return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
569
+
570
+ def max(self) -> Dataset:
571
+ """Apply an max operator to this set"""
572
+ operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MAX)
573
+ return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
574
+
537
575
  def groupby(self, groupby: GroupBySig) -> Dataset:
538
576
  output_schema = groupby.output_schema()
539
577
  operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, group_by_sig=groupby)
540
578
  return Dataset(sources=[self], operator=operator, schema=output_schema)
541
579
 
542
- def retrieve(
580
+ def sem_agg(self, col: dict | type[BaseModel], agg: str, depends_on: str | list[str] | None = None) -> Dataset:
581
+ """
582
+ Apply a semantic aggregation to this set. The `agg` string will be applied using an LLM
583
+ over the entire set of inputs' fields specified in `depends_on` to generate the output `col`.
584
+
585
+ Example:
586
+ sem_agg(
587
+ col={'name': 'overall_sentiment', 'desc': 'The overall sentiment of the reviews', 'type': str},
588
+ agg="Compute the overall sentiment of the reviews as POSITIVE or NEGATIVE.",
589
+ depends_on="review_text",
590
+ )
591
+ """
592
+ # construct new output schema
593
+ new_output_schema = None
594
+ if isinstance(col, dict):
595
+ col_schema = create_schema_from_fields([col])
596
+ new_output_schema = union_schemas([self.schema, col_schema])
597
+ elif issubclass(col, BaseModel):
598
+ assert len(col.model_fields) == 1, "For semantic aggregation, when passing a BaseModel to `col` it must have exactly one field."
599
+ new_output_schema = union_schemas([self.schema, col])
600
+ else:
601
+ raise ValueError("`col` must be a dictionary or a single-field BaseModel.")
602
+
603
+ # enforce type for depends_on
604
+ if isinstance(depends_on, str):
605
+ depends_on = [depends_on]
606
+
607
+ # construct logical operator
608
+ operator = Aggregate(input_schema=self.schema, output_schema=new_output_schema, agg_str=agg, depends_on=depends_on)
609
+
610
+ return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
611
+
612
+ def sem_topk(
543
613
  self,
544
614
  index: Collection,
545
615
  search_attr: str,
@@ -566,7 +636,7 @@ class Dataset:
566
636
  # index = index_factory(index)
567
637
 
568
638
  # construct logical operator
569
- operator = RetrieveScan(
639
+ operator = TopKScan(
570
640
  input_schema=self.schema,
571
641
  output_schema=new_output_schema,
572
642
  index=index,
palimpzest/core/elements/groupbysig.py CHANGED
@@ -6,8 +6,11 @@ from pydantic import BaseModel
6
6
 
7
7
  from palimpzest.core.lib.schemas import create_schema_from_fields
8
8
 
9
+ # TODO:
10
+ # - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator
11
+ # - construct the correct output schema using the input schema and the group by and aggregation fields
12
+ # - remove/update all other references to GroupBySig in the codebase
9
13
 
10
- # TODO: need to rethink how group bys work
11
14
  # signature for a group by aggregate that applies
12
15
  # group and aggregation to an input tuple
13
16
  class GroupBySig:
@@ -50,6 +53,7 @@ class GroupBySig:
50
53
  ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")")
51
54
  return ops
52
55
 
56
+ # TODO: output schema needs to account for input schema types and create new output schema types
53
57
  def output_schema(self) -> type[BaseModel]:
54
58
  # the output class varies depending on the group by, so here
55
59
  # we dynamically construct this output
palimpzest/core/elements/records.py CHANGED
@@ -140,7 +140,7 @@ class DataRecord:
140
140
  def schema(self) -> type[BaseModel]:
141
141
  return type(self._data_item)
142
142
 
143
- def copy(self):
143
+ def copy(self) -> DataRecord:
144
144
  # get the set of fields to copy from the parent record
145
145
  copy_field_names = [field.split(".")[-1] for field in self.get_field_names()]
146
146
 
@@ -228,18 +228,18 @@ class DataRecord:
228
228
  @staticmethod
229
229
  def from_join_parents(
230
230
  schema: type[BaseModel],
231
- left_parent_record: DataRecord,
232
- right_parent_record: DataRecord,
231
+ left_parent_record: DataRecord | None,
232
+ right_parent_record: DataRecord | None,
233
233
  project_cols: list[str] | None = None,
234
234
  cardinality_idx: int = None,
235
235
  ) -> DataRecord:
236
236
  # get the set of fields and field descriptions to copy from the parent record(s)
237
- left_copy_field_names = (
237
+ left_copy_field_names = [] if left_parent_record is None else (
238
238
  left_parent_record.get_field_names()
239
239
  if project_cols is None
240
240
  else [col for col in project_cols if col in left_parent_record.get_field_names()]
241
241
  )
242
- right_copy_field_names = (
242
+ right_copy_field_names = [] if right_parent_record is None else (
243
243
  right_parent_record.get_field_names()
244
244
  if project_cols is None
245
245
  else [col for col in project_cols if col in right_parent_record.get_field_names()]
@@ -255,11 +255,20 @@ class DataRecord:
255
255
  new_field_name = f"{field_name}_right"
256
256
  data_item[new_field_name] = right_parent_record[field_name]
257
257
 
258
+ # for any missing fields in the schema, set them to None
259
+ for field_name in schema.model_fields:
260
+ if field_name not in data_item:
261
+ data_item[field_name] = None
262
+
258
263
  # make new record which has left and right parent record as its parents
264
+ left_parent_source_indices = [] if left_parent_record is None else list(left_parent_record._source_indices)
265
+ right_parent_source_indices = [] if right_parent_record is None else list(right_parent_record._source_indices)
266
+ left_parent_record_id = [] if left_parent_record is None else [left_parent_record._id]
267
+ right_parent_record_id = [] if right_parent_record is None else [right_parent_record._id]
259
268
  new_dr = DataRecord(
260
269
  schema(**data_item),
261
- source_indices=list(left_parent_record._source_indices) + list(right_parent_record._source_indices),
262
- parent_ids=[left_parent_record._id, right_parent_record._id],
270
+ source_indices=left_parent_source_indices + right_parent_source_indices,
271
+ parent_ids=left_parent_record_id + right_parent_record_id,
263
272
  cardinality_idx=cardinality_idx,
264
273
  )
265
274
 
palimpzest/core/lib/schemas.py CHANGED
@@ -142,16 +142,30 @@ def create_schema_from_df(df: pd.DataFrame) -> type[BaseModel]:
142
142
  return _create_pickleable_model(fields)
143
143
 
144
144
 
145
- def union_schemas(models: list[type[BaseModel]], join: bool = False) -> type[BaseModel]:
145
+ def union_schemas(models: list[type[BaseModel]], join: bool = False, on: list[str] | None = None) -> type[BaseModel]:
146
146
  """Union multiple Pydantic models into a single model."""
147
+ # convert on to empty list if None
148
+ if on is None:
149
+ on = []
150
+
151
+ # build up the fields for the new schema
147
152
  fields = {}
148
153
  for model in models:
149
154
  for field_name, field in model.model_fields.items():
150
- if field_name in fields and not join:
155
+ # for non-join unions, make sure duplicate fields have the same type
156
+ if not join and field_name in fields:
151
157
  assert fields[field_name][0] == field.annotation, f"Field {field_name} has different types in different models"
152
- elif field_name in fields and join:
158
+
159
+ # for joins with "on" specified, no need to rename fields in "on"
160
+ elif join and field_name in on and field_name in fields:
161
+ continue
162
+
163
+ # otherwise, rename duplicate fields by appending _right
164
+ elif join and field_name in fields:
153
165
  while field_name in fields:
154
166
  field_name = f"{field_name}_right"
167
+
168
+ # add the field to the new schema
155
169
  fields[field_name] = (field.annotation, field)
156
170
 
157
171
  # create and return the new schema
@@ -194,6 +208,15 @@ class Average(BaseModel):
194
208
  class Count(BaseModel):
195
209
  count: int = Field(description="The count of items in the dataset")
196
210
 
211
+ class Sum(BaseModel):
212
+ sum: int = Field(description="The summation of items in the dataset")
213
+
214
+ class Min(BaseModel):
215
+ min: int | float = Field(description="The minimum value of some items in the dataset")
216
+
217
+ class Max(BaseModel):
218
+ max: int | float = Field(description="The maximum value of some items in the dataset")
219
+
197
220
  class OperatorDerivedSchema(BaseModel):
198
221
  """Schema defined by an operator, e.g., a join or a group by"""
199
222
 
palimpzest/core/models.py CHANGED
@@ -51,10 +51,10 @@ class GenerationStats(BaseModel):
51
51
  fn_call_duration_secs: float = 0.0
52
52
 
53
53
  # (if applicable) the total number of LLM calls made by this operator
54
- total_llm_calls: int = 0
54
+ total_llm_calls: float = 0
55
55
 
56
56
  # (if applicable) the total number of embedding LLM calls made by this operator
57
- total_embedding_llm_calls: int = 0
57
+ total_embedding_llm_calls: float = 0
58
58
 
59
59
  def __iadd__(self, other: GenerationStats) -> GenerationStats:
60
60
  # self.raw_answers.extend(other.raw_answers)
@@ -243,10 +243,10 @@ class RecordOpStats(BaseModel):
243
243
  fn_call_duration_secs: float = 0.0
244
244
 
245
245
  # (if applicable) the total number of LLM calls made by this operator
246
- total_llm_calls: int = 0
246
+ total_llm_calls: float = 0
247
247
 
248
248
  # (if applicable) the total number of embedding LLM calls made by this operator
249
- total_embedding_llm_calls: int = 0
249
+ total_embedding_llm_calls: float = 0
250
250
 
251
251
  # (if applicable) a boolean indicating whether this is the statistics captured from a failed convert operation
252
252
  failed_convert: bool | None = None
palimpzest/prompts/aggregate_prompts.py ADDED
@@ -0,0 +1,99 @@
1
+ """This file contains prompts for aggregation operations."""
2
+
3
+ ### BASE PROMPTS ###
4
+ AGG_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
5
+ You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
6
+ You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
7
+
8
+ {output_format_instruction} Finish your response with a newline character followed by ---
9
+
10
+ An example is shown below:
11
+ ---
12
+ INPUT FIELDS:
13
+ {example_input_fields}
14
+
15
+ OUTPUT FIELDS:
16
+ {example_output_fields}
17
+
18
+ CONTEXT:
19
+ {{{example_context}}}
20
+ {{{second_example_context}}}
21
+ {{{third_example_context}}}{image_disclaimer}{audio_disclaimer}
22
+
23
+ AGGREGATION INSTRUCTION: {example_agg_instruction}
24
+
25
+ Let's think step-by-step in order to answer the question.
26
+
27
+ REASONING: {example_reasoning}
28
+
29
+ ANSWER:
30
+ {{{example_answer}}}
31
+ ---
32
+ """
33
+
34
+ AGG_NO_REASONING_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
35
+ You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
36
+ You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
37
+
38
+ {output_format_instruction} Finish your response with a newline character followed by ---
39
+
40
+ An example is shown below:
41
+ ---
42
+ INPUT FIELDS:
43
+ {example_input_fields}
44
+
45
+ OUTPUT FIELDS:
46
+ {example_output_fields}
47
+
48
+ CONTEXT:
49
+ {{{example_context}}}
50
+ {{{second_example_context}}}
51
+ {{{third_example_context}}}{image_disclaimer}{audio_disclaimer}
52
+
53
+ AGGREGATION INSTRUCTION: {example_agg_instruction}
54
+
55
+ ANSWER:
56
+ {{{example_answer}}}
57
+ ---
58
+ """
59
+
60
+
61
+ AGG_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
62
+ You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
63
+ You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
64
+ {desc_section}
65
+ {output_format_instruction} Finish your response with a newline character followed by ---
66
+ ---
67
+ INPUT FIELDS:
68
+ {input_fields_desc}
69
+
70
+ OUTPUT FIELDS:
71
+ {output_fields_desc}
72
+
73
+ CONTEXT:
74
+ {context}<<image-audio-placeholder>>
75
+
76
+ AGGREGATION INSTRUCTION: {agg_instruction}
77
+
78
+ Let's think step-by-step in order to answer the question.
79
+
80
+ REASONING: """
81
+
82
+ AGG_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
83
+ You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
84
+ You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
85
+ {desc_section}
86
+ {output_format_instruction} Finish your response with a newline character followed by ---
87
+ ---
88
+ INPUT FIELDS:
89
+ {input_fields_desc}
90
+
91
+ OUTPUT FIELDS:
92
+ {output_fields_desc}
93
+
94
+ CONTEXT:
95
+ {context}<<image-audio-placeholder>>
96
+
97
+ AGGREGATION INSTRUCTION: {agg_instruction}
98
+
99
+ ANSWER: """