palimpzest 0.8.6__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/constants.py +12 -4
- palimpzest/core/data/dataset.py +42 -0
- palimpzest/core/elements/records.py +5 -1
- palimpzest/core/lib/schemas.py +13 -0
- palimpzest/prompts/aggregate_prompts.py +99 -0
- palimpzest/prompts/prompt_factory.py +163 -75
- palimpzest/prompts/utils.py +38 -1
- palimpzest/prompts/validator.py +24 -24
- palimpzest/query/generators/generators.py +9 -7
- palimpzest/query/operators/__init__.py +4 -1
- palimpzest/query/operators/aggregate.py +285 -6
- palimpzest/query/operators/logical.py +17 -4
- palimpzest/query/optimizer/__init__.py +4 -0
- palimpzest/query/optimizer/rules.py +42 -2
- palimpzest/validator/validator.py +7 -7
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/METADATA +1 -1
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/RECORD +20 -19
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/top_level.txt +0 -0
palimpzest/constants.py
CHANGED
|
@@ -136,13 +136,17 @@ class PromptStrategy(str, Enum):
|
|
|
136
136
|
performing some task with a specified Model.
|
|
137
137
|
"""
|
|
138
138
|
|
|
139
|
+
# aggregation prompt strategies
|
|
140
|
+
AGG = "aggregation"
|
|
141
|
+
AGG_NO_REASONING = "aggregation-no-reasoning"
|
|
142
|
+
|
|
139
143
|
# filter prompt strategies
|
|
140
144
|
FILTER = "filter"
|
|
141
145
|
FILTER_NO_REASONING = "filter-no-reasoning"
|
|
142
146
|
FILTER_CRITIC = "filter-critic"
|
|
143
147
|
FILTER_REFINE = "filter-refine"
|
|
144
148
|
FILTER_MOA_PROPOSER = "filter-mixture-of-agents-proposer"
|
|
145
|
-
FILTER_MOA_AGG = "filter-mixture-of-agents-
|
|
149
|
+
FILTER_MOA_AGG = "filter-mixture-of-agents-aggregator"
|
|
146
150
|
FILTER_SPLIT_PROPOSER = "filter-split-proposer"
|
|
147
151
|
FILTER_SPLIT_MERGER = "filter-split-merger"
|
|
148
152
|
|
|
@@ -156,10 +160,13 @@ class PromptStrategy(str, Enum):
|
|
|
156
160
|
MAP_CRITIC = "map-critic"
|
|
157
161
|
MAP_REFINE = "map-refine"
|
|
158
162
|
MAP_MOA_PROPOSER = "map-mixture-of-agents-proposer"
|
|
159
|
-
MAP_MOA_AGG = "map-mixture-of-agents-
|
|
163
|
+
MAP_MOA_AGG = "map-mixture-of-agents-aggregator"
|
|
160
164
|
MAP_SPLIT_PROPOSER = "map-split-proposer"
|
|
161
165
|
MAP_SPLIT_MERGER = "map-split-merger"
|
|
162
166
|
|
|
167
|
+
def is_agg_prompt(self):
|
|
168
|
+
return "aggregation" in self.value
|
|
169
|
+
|
|
163
170
|
def is_filter_prompt(self):
|
|
164
171
|
return "filter" in self.value
|
|
165
172
|
|
|
@@ -179,7 +186,7 @@ class PromptStrategy(str, Enum):
|
|
|
179
186
|
return "mixture-of-agents-proposer" in self.value
|
|
180
187
|
|
|
181
188
|
def is_moa_aggregator_prompt(self):
|
|
182
|
-
return "mixture-of-agents-
|
|
189
|
+
return "mixture-of-agents-aggregator" in self.value
|
|
183
190
|
|
|
184
191
|
def is_split_proposer_prompt(self):
|
|
185
192
|
return "split-proposer" in self.value
|
|
@@ -200,7 +207,8 @@ class Modality(str, Enum):
|
|
|
200
207
|
class AggFunc(str, Enum):
|
|
201
208
|
COUNT = "count"
|
|
202
209
|
AVERAGE = "average"
|
|
203
|
-
|
|
210
|
+
MIN = "min"
|
|
211
|
+
MAX = "max"
|
|
204
212
|
|
|
205
213
|
class Cardinality(str, Enum):
|
|
206
214
|
ONE_TO_ONE = "one-to-one"
|
palimpzest/core/data/dataset.py
CHANGED
|
@@ -534,11 +534,53 @@ class Dataset:
|
|
|
534
534
|
operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.AVERAGE)
|
|
535
535
|
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
536
536
|
|
|
537
|
+
def min(self) -> Dataset:
|
|
538
|
+
"""Apply a min operator to this set"""
|
|
539
|
+
operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MIN)
|
|
540
|
+
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
541
|
+
|
|
542
|
+
def max(self) -> Dataset:
|
|
543
|
+
"""Apply a max operator to this set"""
|
|
544
|
+
operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MAX)
|
|
545
|
+
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
546
|
+
|
|
537
547
|
def groupby(self, groupby: GroupBySig) -> Dataset:
|
|
538
548
|
output_schema = groupby.output_schema()
|
|
539
549
|
operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, group_by_sig=groupby)
|
|
540
550
|
return Dataset(sources=[self], operator=operator, schema=output_schema)
|
|
541
551
|
|
|
552
|
+
def sem_agg(self, col: dict | type[BaseModel], agg: str, depends_on: str | list[str] | None = None) -> Dataset:
|
|
553
|
+
"""
|
|
554
|
+
Apply a semantic aggregation to this set. The `agg` string will be applied using an LLM
|
|
555
|
+
over the entire set of inputs' fields specified in `depends_on` to generate the output `col`.
|
|
556
|
+
|
|
557
|
+
Example:
|
|
558
|
+
sem_agg(
|
|
559
|
+
col={'name': 'overall_sentiment', 'desc': 'The overall sentiment of the reviews', 'type': str},
|
|
560
|
+
agg="Compute the overall sentiment of the reviews as POSITIVE or NEGATIVE.",
|
|
561
|
+
depends_on="review_text",
|
|
562
|
+
)
|
|
563
|
+
"""
|
|
564
|
+
# construct new output schema
|
|
565
|
+
new_output_schema = None
|
|
566
|
+
if isinstance(col, dict):
|
|
567
|
+
col_schema = create_schema_from_fields([col])
|
|
568
|
+
new_output_schema = union_schemas([self.schema, col_schema])
|
|
569
|
+
elif issubclass(col, BaseModel):
|
|
570
|
+
assert len(col.model_fields) == 1, "For semantic aggregation, when passing a BaseModel to `col` it must have exactly one field."
|
|
571
|
+
new_output_schema = union_schemas([self.schema, col])
|
|
572
|
+
else:
|
|
573
|
+
raise ValueError("`col` must be a dictionary or a single-field BaseModel.")
|
|
574
|
+
|
|
575
|
+
# enforce type for depends_on
|
|
576
|
+
if isinstance(depends_on, str):
|
|
577
|
+
depends_on = [depends_on]
|
|
578
|
+
|
|
579
|
+
# construct logical operator
|
|
580
|
+
operator = Aggregate(input_schema=self.schema, output_schema=new_output_schema, agg_str=agg, depends_on=depends_on)
|
|
581
|
+
|
|
582
|
+
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
583
|
+
|
|
542
584
|
def retrieve(
|
|
543
585
|
self,
|
|
544
586
|
index: Collection,
|
|
palimpzest/core/elements/records.py
CHANGED
|
@@ -11,6 +11,8 @@ from pydantic.fields import FieldInfo
|
|
|
11
11
|
|
|
12
12
|
from palimpzest.core.data import context
|
|
13
13
|
from palimpzest.core.lib.schemas import (
|
|
14
|
+
AUDIO_FIELD_TYPES,
|
|
15
|
+
IMAGE_FIELD_TYPES,
|
|
14
16
|
AudioBase64,
|
|
15
17
|
AudioFilepath,
|
|
16
18
|
ImageBase64,
|
|
@@ -303,9 +305,11 @@ class DataRecord:
|
|
|
303
305
|
dct = {k: v for k, v in dct.items() if k in project_field_names}
|
|
304
306
|
|
|
305
307
|
if not include_bytes:
|
|
308
|
+
bytes_field_types = [bytes, list[bytes], bytes | None, list[bytes] | None, bytes | Any, list[bytes] | Any]
|
|
309
|
+
bytes_field_types += AUDIO_FIELD_TYPES + IMAGE_FIELD_TYPES
|
|
306
310
|
for k in dct:
|
|
307
311
|
field_type = self.get_field_type(k)
|
|
308
|
-
if field_type.annotation in
|
|
312
|
+
if field_type.annotation in bytes_field_types:
|
|
309
313
|
dct[k] = "<bytes>"
|
|
310
314
|
|
|
311
315
|
if bytes_to_str:
|
palimpzest/core/lib/schemas.py
CHANGED
|
@@ -33,20 +33,27 @@ IMAGE_LIST_FIELD_TYPES = [
|
|
|
33
33
|
list[ImageBase64] | None,
|
|
34
34
|
list[ImageFilepath] | None,
|
|
35
35
|
list[ImageURL] | None,
|
|
36
|
+
list[ImageBase64] | Any,
|
|
37
|
+
list[ImageFilepath] | Any,
|
|
38
|
+
list[ImageURL] | Any,
|
|
36
39
|
]
|
|
37
40
|
IMAGE_FIELD_TYPES = IMAGE_LIST_FIELD_TYPES + [
|
|
38
41
|
ImageBase64, ImageFilepath, ImageURL,
|
|
39
42
|
ImageBase64 | None, ImageFilepath | None, ImageURL | None,
|
|
43
|
+
ImageBase64 | Any, ImageFilepath | Any, ImageURL | Any,
|
|
40
44
|
]
|
|
41
45
|
AUDIO_LIST_FIELD_TYPES = [
|
|
42
46
|
list[AudioBase64],
|
|
43
47
|
list[AudioFilepath],
|
|
44
48
|
list[AudioBase64] | None,
|
|
45
49
|
list[AudioFilepath] | None,
|
|
50
|
+
list[AudioBase64] | Any,
|
|
51
|
+
list[AudioFilepath] | Any,
|
|
46
52
|
]
|
|
47
53
|
AUDIO_FIELD_TYPES = AUDIO_LIST_FIELD_TYPES + [
|
|
48
54
|
AudioBase64, AudioFilepath,
|
|
49
55
|
AudioBase64 | None, AudioFilepath | None,
|
|
56
|
+
AudioBase64 | Any, AudioFilepath | Any,
|
|
50
57
|
]
|
|
51
58
|
|
|
52
59
|
|
|
@@ -187,6 +194,12 @@ class Average(BaseModel):
|
|
|
187
194
|
class Count(BaseModel):
|
|
188
195
|
count: int = Field(description="The count of items in the dataset")
|
|
189
196
|
|
|
197
|
+
class Min(BaseModel):
|
|
198
|
+
min: int | float = Field(description="The minimum value of some items in the dataset")
|
|
199
|
+
|
|
200
|
+
class Max(BaseModel):
|
|
201
|
+
max: int | float = Field(description="The maximum value of some items in the dataset")
|
|
202
|
+
|
|
190
203
|
class OperatorDerivedSchema(BaseModel):
|
|
191
204
|
"""Schema defined by an operator, e.g., a join or a group by"""
|
|
192
205
|
|
|
palimpzest/prompts/aggregate_prompts.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""This file contains prompts for aggregation operations."""
|
|
2
|
+
|
|
3
|
+
### BASE PROMPTS ###
|
|
4
|
+
AGG_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
|
|
5
|
+
You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
|
|
6
|
+
You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
|
|
7
|
+
|
|
8
|
+
{output_format_instruction} Finish your response with a newline character followed by ---
|
|
9
|
+
|
|
10
|
+
An example is shown below:
|
|
11
|
+
---
|
|
12
|
+
INPUT FIELDS:
|
|
13
|
+
{example_input_fields}
|
|
14
|
+
|
|
15
|
+
OUTPUT FIELDS:
|
|
16
|
+
{example_output_fields}
|
|
17
|
+
|
|
18
|
+
CONTEXT:
|
|
19
|
+
{{{example_context}}}
|
|
20
|
+
{{{second_example_context}}}
|
|
21
|
+
{{{third_example_context}}}{image_disclaimer}{audio_disclaimer}
|
|
22
|
+
|
|
23
|
+
AGGREGATION INSTRUCTION: {example_agg_instruction}
|
|
24
|
+
|
|
25
|
+
Let's think step-by-step in order to answer the question.
|
|
26
|
+
|
|
27
|
+
REASONING: {example_reasoning}
|
|
28
|
+
|
|
29
|
+
ANSWER:
|
|
30
|
+
{{{example_answer}}}
|
|
31
|
+
---
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
AGG_NO_REASONING_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
|
|
35
|
+
You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
|
|
36
|
+
You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
|
|
37
|
+
|
|
38
|
+
{output_format_instruction} Finish your response with a newline character followed by ---
|
|
39
|
+
|
|
40
|
+
An example is shown below:
|
|
41
|
+
---
|
|
42
|
+
INPUT FIELDS:
|
|
43
|
+
{example_input_fields}
|
|
44
|
+
|
|
45
|
+
OUTPUT FIELDS:
|
|
46
|
+
{example_output_fields}
|
|
47
|
+
|
|
48
|
+
CONTEXT:
|
|
49
|
+
{{{example_context}}}
|
|
50
|
+
{{{second_example_context}}}
|
|
51
|
+
{{{third_example_context}}}{image_disclaimer}{audio_disclaimer}
|
|
52
|
+
|
|
53
|
+
AGGREGATION INSTRUCTION: {example_agg_instruction}
|
|
54
|
+
|
|
55
|
+
ANSWER:
|
|
56
|
+
{{{example_answer}}}
|
|
57
|
+
---
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
AGG_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
|
|
62
|
+
You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
|
|
63
|
+
You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
|
|
64
|
+
{desc_section}
|
|
65
|
+
{output_format_instruction} Finish your response with a newline character followed by ---
|
|
66
|
+
---
|
|
67
|
+
INPUT FIELDS:
|
|
68
|
+
{input_fields_desc}
|
|
69
|
+
|
|
70
|
+
OUTPUT FIELDS:
|
|
71
|
+
{output_fields_desc}
|
|
72
|
+
|
|
73
|
+
CONTEXT:
|
|
74
|
+
{context}<<image-audio-placeholder>>
|
|
75
|
+
|
|
76
|
+
AGGREGATION INSTRUCTION: {agg_instruction}
|
|
77
|
+
|
|
78
|
+
Let's think step-by-step in order to answer the question.
|
|
79
|
+
|
|
80
|
+
REASONING: """
|
|
81
|
+
|
|
82
|
+
AGG_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
|
|
83
|
+
You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
|
|
84
|
+
You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
|
|
85
|
+
{desc_section}
|
|
86
|
+
{output_format_instruction} Finish your response with a newline character followed by ---
|
|
87
|
+
---
|
|
88
|
+
INPUT FIELDS:
|
|
89
|
+
{input_fields_desc}
|
|
90
|
+
|
|
91
|
+
OUTPUT FIELDS:
|
|
92
|
+
{output_fields_desc}
|
|
93
|
+
|
|
94
|
+
CONTEXT:
|
|
95
|
+
{context}<<image-audio-placeholder>>
|
|
96
|
+
|
|
97
|
+
AGGREGATION INSTRUCTION: {agg_instruction}
|
|
98
|
+
|
|
99
|
+
ANSWER: """
|