palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
- palimpzest-0.7.1.dist-info/RECORD +96 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.4.dist-info/RECORD +0 -87
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
palimpzest/core/elements/records.py CHANGED
@@ -107,7 +107,7 @@ class DataRecord:
     def __hash__(self):
-        return hash(self.to_json_str())
+        return hash(self.to_json_str(bytes_to_str=True))

     def __iter__(self):
@@ -131,6 +131,9 @@ class DataRecord:
             cardinality_idx=self.cardinality_idx,
         )

+        # copy the passed_operator attribute
+        new_dr.passed_operator = self.passed_operator
+
         # get the set of fields to copy from the parent record
         copy_field_names = project_cols if project_cols is not None else self.get_field_names()
         copy_field_names = [field.split(".")[-1] for field in copy_field_names]
@@ -255,16 +258,16 @@ class DataRecord:
             for record in records
         ])

-    def to_json_str(self, include_bytes: bool = True, project_cols: list[str] | None = None):
+    def to_json_str(self, include_bytes: bool = True, bytes_to_str: bool = False, project_cols: list[str] | None = None):
         """Return a JSON representation of this DataRecord"""
-        record_dict = self.to_dict(include_bytes, project_cols)
+        record_dict = self.to_dict(include_bytes, bytes_to_str, project_cols)
         record_dict = {
             field_name: self.schema.field_to_json(field_name, field_value)
             for field_name, field_value in record_dict.items()
         }
         return json.dumps(record_dict, indent=2)

-    def to_dict(self, include_bytes: bool = True, project_cols: list[str] | None = None):
+    def to_dict(self, include_bytes: bool = True, bytes_to_str: bool = False, project_cols: list[str] | None = None):
         """Return a dictionary representation of this DataRecord"""
         # TODO(chjun): In case of numpy types, the json.dumps will fail. Convert to native types.
         # Better ways to handle this.
@@ -276,9 +279,16 @@ class DataRecord:

         if not include_bytes:
             for k, v in dct.items():
-                if isinstance(v, bytes) or (isinstance(v, list) and len(v) > 0 and
+                if isinstance(v, bytes) or (isinstance(v, list) and len(v) > 0 and any([isinstance(elt, bytes) for elt in v])):
                     dct[k] = "<bytes>"

+        if bytes_to_str:
+            for k, v in dct.items():
+                if isinstance(v, bytes):
+                    dct[k] = v.decode("utf-8")
+                elif isinstance(v, list) and len(v) > 0 and any([isinstance(elt, bytes) for elt in v]):
+                    dct[k] = [elt.decode("utf-8") if isinstance(elt, bytes) else elt for elt in v]
+
         return dct

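A minimal usage sketch of the new bytes_to_str flag, assuming record is an existing DataRecord whose contents field holds UTF-8 encoded bytes (the variable names are illustrative):

# include_bytes=False masks bytes fields with the "<bytes>" placeholder
masked = record.to_dict(include_bytes=False)

# the new bytes_to_str flag decodes bytes values (and bytes inside lists) to str
decoded = record.to_dict(bytes_to_str=True)

# to_json_str passes the flag through to to_dict; the updated __hash__ hashes this form
json_str = record.to_json_str(bytes_to_str=True)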
@@ -290,7 +300,12 @@ class DataRecordSet:

    The record_op_stats could be empty if the DataRecordSet is not from executing an operator.
    """
-    def __init__(
+    def __init__(
+        self,
+        data_records: list[DataRecord],
+        record_op_stats: list[RecordOpStats],
+        field_to_score_fn: dict[str, str | callable] | None = None,
+    ):
         # check that all data_records are derived from the same parent record
         if len(data_records) > 0:
             parent_id = data_records[0].parent_id
@@ -302,20 +317,27 @@ class DataRecordSet:
         self.data_records = data_records
         self.parent_id = data_records[0].parent_id if len(data_records) > 0 else None
         self.source_idx = data_records[0].source_idx if len(data_records) > 0 else None
+        self.schema = data_records[0].schema if len(data_records) > 0 else None

         # set statistics for generating these records
         self.record_op_stats = record_op_stats

+        # assign field_to_score_fn if provided
+        self.field_to_score_fn = {} if field_to_score_fn is None else field_to_score_fn

-    def
-        return self.
+    def get_total_cost(self) -> float:
+        return sum([record_op_stats.cost_per_record for record_op_stats in self.record_op_stats])

+    def get_field_to_score_fn(self) -> dict[str, str | callable]:
+        return self.field_to_score_fn

-    def
-        return
+    def __getitem__(self, slice) -> DataRecord | list[DataRecord]:
+        return self.data_records[slice]

+    def __len__(self) -> int:
+        return len(self.data_records)

-    def __iter__(self):
+    def __iter__(self) -> Generator[DataRecord]:
         yield from self.data_records

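A sketch of the expanded DataRecordSet interface, assuming records and stats are existing lists of DataRecord and RecordOpStats objects; the field_to_score_fn value shown is a hypothetical placeholder:

record_set = DataRecordSet(records, stats, field_to_score_fn={"title": "exact"})

total_cost = record_set.get_total_cost()        # sums cost_per_record over record_op_stats
score_fns = record_set.get_field_to_score_fn()  # defaults to {} when no mapping is provided

first = record_set[0]        # __getitem__ supports indexing and slicing
count = len(record_set)      # __len__
for record in record_set:    # __iter__ yields DataRecord objects
    print(record.passed_operator)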
palimpzest/core/lib/fields.py CHANGED

palimpzest/core/lib/schemas.py CHANGED

@@ -348,6 +348,10 @@ class File(Schema):
     filename = StringField(desc="The UNIX-style name of the file")
     contents = BytesField(desc="The contents of the file")

+class TextFile(Schema):
+    """A text file is a File that contains only text. No binary data."""
+    filename = StringField(desc="The UNIX-style name of the file")
+    contents = StringField(desc="The contents of the file")

 class Number(Schema):
     """Just a number. Often used for aggregates"""
@@ -418,9 +422,6 @@ class PDFFile(File):
     text_contents = StringField(desc="The text-only contents of the PDF")


-class TextFile(File):
-    """A text file is a File that contains only text. No binary data."""
-
     list_of_numbers = ListField(NumericField)
 class XLSFile(File):
     """An XLS file is a File that contains one or more Excel spreadsheets."""
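For context, a hypothetical custom schema written in the same style as the relocated TextFile, which is now a standalone Schema with string contents rather than a File subclass. The EmailMessage class and its fields are illustrative, and the imports assume Schema and StringField are exported by the modules listed above:

from palimpzest.core.lib.fields import StringField
from palimpzest.core.lib.schemas import Schema


class EmailMessage(Schema):
    """A plain-text email message."""
    filename = StringField(desc="The UNIX-style name of the file")
    subject = StringField(desc="The subject line of the email")
    contents = StringField(desc="The plain-text contents of the email")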
palimpzest/prompts/moa_proposer_convert_prompts.py CHANGED

@@ -7,8 +7,6 @@ Be sure to cite information from the context as evidence of why your answers are

 You will be provided with a description of each input field and each output field.

-{output_format_instruction} Finish your response with a newline character followed by ---
-
 An example is shown below:
 ---
 INPUT FIELDS:
@@ -31,8 +29,6 @@ You will be presented with a context and a set of output fields to generate. You
 Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.

 You will be provided with a description of each input field and each output field.
-
-{output_format_instruction} Finish your response with a newline character followed by ---
 ---
 INPUT FIELDS:
 {input_fields_desc}
palimpzest/prompts/prompt_factory.py CHANGED

@@ -75,6 +75,19 @@ from palimpzest.prompts.moa_proposer_convert_prompts import (
     COT_MOA_PROPOSER_IMAGE_JOB_INSTRUCTION,
     COT_MOA_PROPOSER_JOB_INSTRUCTION,
 )
+from palimpzest.prompts.split_merge_prompts import (
+    COT_SPLIT_MERGER_BASE_SYSTEM_PROMPT,
+    COT_SPLIT_MERGER_BASE_USER_PROMPT,
+)
+from palimpzest.prompts.split_proposer_prompts import (
+    COT_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT,
+    COT_SPLIT_PROPOSER_BASE_USER_PROMPT,
+    SPLIT_PROPOSER_EXAMPLE_ANSWER,
+    SPLIT_PROPOSER_EXAMPLE_CONTEXT,
+    SPLIT_PROPOSER_EXAMPLE_INPUT_FIELDS,
+    SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS,
+    SPLIT_PROPOSER_JOB_INSTRUCTION,
+)
 from palimpzest.prompts.util_phrases import (
     ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION,
     ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION,
@@ -96,6 +109,8 @@ class PromptFactory:
         PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_BASE_SYSTEM_PROMPT,
         PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_BASE_SYSTEM_PROMPT,
         PromptStrategy.COT_MOA_AGG: COT_MOA_AGG_BASE_SYSTEM_PROMPT,
+        PromptStrategy.SPLIT_PROPOSER: COT_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT,
+        PromptStrategy.SPLIT_MERGER: COT_SPLIT_MERGER_BASE_SYSTEM_PROMPT,
     }
     BASE_USER_PROMPT_MAP = {
         PromptStrategy.COT_BOOL: COT_BOOL_BASE_USER_PROMPT,
@@ -109,6 +124,8 @@ class PromptFactory:
         PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_BASE_USER_PROMPT,
         PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_BASE_USER_PROMPT,
         PromptStrategy.COT_MOA_AGG: COT_MOA_AGG_BASE_USER_PROMPT,
+        PromptStrategy.SPLIT_PROPOSER: COT_SPLIT_PROPOSER_BASE_USER_PROMPT,
+        PromptStrategy.SPLIT_MERGER: COT_SPLIT_MERGER_BASE_USER_PROMPT,
     }

     def __init__(self, prompt_strategy: PromptStrategy, model: Model, cardinality: Cardinality) -> None:
@@ -145,7 +162,7 @@ class PromptFactory:
         # NOTE: MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT is a rough estimate which leaves room for the rest of the prompt text
         while total_context_len * TOKENS_PER_CHARACTER > MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT:
             # sort fields by length
-            field_lengths = [(field, len(value)) for field, value in context.items()]
+            field_lengths = [(field, len(value) if value is not None else 0) for field, value in context.items()]
             sorted_fields = sorted(field_lengths, key=lambda item: item[1], reverse=True)

             # get field with longest context
@@ -205,11 +222,7 @@ class PromptFactory:
         """
         output_fields_desc = ""
         output_schema: Schema = kwargs.get("output_schema")
-        if (
-            self.prompt_strategy.is_cot_qa_prompt()
-            or self.prompt_strategy.is_moa_proposer_prompt()
-            or self.prompt_strategy.is_moa_aggregator_prompt()
-        ):
+        if self.prompt_strategy.is_convert_prompt():
             assert output_schema is not None, "Output schema must be provided for convert prompts."

             field_desc_map = output_schema.field_desc_map()
@@ -227,7 +240,7 @@ class PromptFactory:
             str | None: The filter condition (if applicable).
         """
         filter_condition = kwargs.get("filter_condition")
-        if self.prompt_strategy.
+        if self.prompt_strategy.is_bool_prompt():
             assert filter_condition is not None, "Filter condition must be provided for filter operations."

         return filter_condition
@@ -284,6 +297,24 @@ class PromptFactory:

         return model_responses

+    def _get_chunk_outputs(self, **kwargs) -> str | None:
+        """
+        Returns the chunk outputs for the split-convert.
+
+        Args:
+            kwargs: The keyword arguments provided by the user.
+
+        Returns:
+            str | None: The chunk outputs.
+        """
+        chunk_outputs = None
+        if self.prompt_strategy.is_split_merger_prompt():
+            chunk_outputs = ""
+            for idx, chunk_output in enumerate(kwargs.get("chunk_outputs")):
+                chunk_outputs += f"CHUNK OUTPUT {idx + 1}: {chunk_output}\n"
+
+        return chunk_outputs
+
     def _get_output_format_instruction(self) -> str:
         """
         Returns the output format instruction based on the cardinality.
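A small sketch of the string _get_chunk_outputs assembles for a split-merger prompt; the chunk outputs below are hypothetical:

chunk_outputs = [
    "the text mentions the scientist Ada Lovelace ...",
    "the text mentions the scientist Charles Babbage ...",
]

formatted = ""
for idx, chunk_output in enumerate(chunk_outputs):
    formatted += f"CHUNK OUTPUT {idx + 1}: {chunk_output}\n"

# formatted is what fills the {chunk_outputs} slot of COT_SPLIT_MERGER_BASE_USER_PROMPT
# (see the new split_merge_prompts.py below)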
@@ -311,6 +342,7 @@ class PromptFactory:
             PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_JOB_INSTRUCTION,
             PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_JOB_INSTRUCTION,
             PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_JOB_INSTRUCTION,
+            PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_JOB_INSTRUCTION,
         }
         return prompt_strategy_to_job_instruction.get(self.prompt_strategy)

@@ -375,6 +407,7 @@ class PromptFactory:
             PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_EXAMPLE_INPUT_FIELDS,
             PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_EXAMPLE_INPUT_FIELDS,
             PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_EXAMPLE_INPUT_FIELDS,
+            PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_EXAMPLE_INPUT_FIELDS,
         }

         return prompt_strategy_to_example_input_fields.get(self.prompt_strategy)
@@ -391,6 +424,7 @@ class PromptFactory:
             PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_EXAMPLE_OUTPUT_FIELDS,
             PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_EXAMPLE_OUTPUT_FIELDS,
             PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_EXAMPLE_OUTPUT_FIELDS,
+            PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS,
         }

         return prompt_strategy_to_example_output_fields.get(self.prompt_strategy)
@@ -409,6 +443,7 @@ class PromptFactory:
             PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_EXAMPLE_CONTEXT,
             PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_EXAMPLE_CONTEXT,
             PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_EXAMPLE_CONTEXT,
+            PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_EXAMPLE_CONTEXT,
         }

         return prompt_strategy_to_example_context.get(self.prompt_strategy)
@@ -471,6 +506,7 @@ class PromptFactory:
             PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_EXAMPLE_ANSWER,
             PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_EXAMPLE_ANSWER,
             PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_EXAMPLE_ANSWER,
+            PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_EXAMPLE_ANSWER,
         }

         return prompt_strategy_to_example_answer.get(self.prompt_strategy)
@@ -499,6 +535,7 @@ class PromptFactory:
             "original_output": self._get_original_output(**kwargs),
             "critique_output": self._get_critique_output(**kwargs),
             "model_responses": self._get_model_responses(**kwargs),
+            "chunk_outputs": self._get_chunk_outputs(**kwargs),
         }

         # get format kwargs which depend on the prompt strategy
palimpzest/prompts/split_merge_prompts.py ADDED

@@ -0,0 +1,56 @@
+"""This file contains prompts for SplitConvert aggregator operations."""
+
+### SYSTEM PROMPTS ###
+COT_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
+You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
+It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
+
+You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the model responses.
+
+{output_format_instruction} Finish your response with a newline character followed by ---
+
+An example is shown below:
+---
+CHUNK 1 OUTPUT: the text mentions the scientists "Augusta Ada King, Countess of Lovelace" and "Charles Babbage". It states that King was an English mathematician who worked on Babbage's Analytical Engine.
+
+CHUNK 2 OUTPUT: the text passage mentions the scientist "Charles Babbage", who was a mathematician. Therefore, the name output should be ["Charles Babbage"] and the field_of_study output should be ["Mathematician"].
+
+INPUT FIELDS:
+- text: a text passage describing scientists
+
+OUTPUT FIELDS:
+- name: the list of names for each scientist mentioned in the text
+- field_of_study: a list with the field of study for each scientist
+
+Let's think step-by-step in order to answer the question.
+
+REASONING: Looking at both chunk outputs, they specify that the scientists' formal names are "Augusta Ada King" and "Charles Babbage". Chunk Output 2 indicates that Charles Babbage was a Mathematician and Chunk Output 1 says that Augusta Ada King was an English mathematician. Therefore, the name output should be ["Augusta Ada King", "Charles Babbage"] and the field_of_study output should be ["Mathematician", "Mathematician"].
+
+ANSWER:
+{{
+    "name": ["Augusta Ada King", "Charles Babbage"],
+    "field_of_study": ["Mathematician", "Mathematician"]
+}}
+---
+"""
+
+### USER / INSTANCE-SPECIFIC PROMPTS ###
+COT_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
+You will be presented with one or more outputs produced by a set of models. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
+It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
+
+You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the model responses.
+
+{output_format_instruction} Finish your response with a newline character followed by ---
+---
+{chunk_outputs}
+
+INPUT FIELDS:
+{input_fields_desc}
+
+OUTPUT FIELDS:
+{output_fields_desc}
+
+Let's think step-by-step in order to answer the question.
+
+REASONING: """
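A minimal sketch of filling the new merger user prompt by hand; in the library this substitution is performed by PromptFactory, and the output-format instruction and field descriptions below are illustrative stand-ins:

user_prompt = COT_SPLIT_MERGER_BASE_USER_PROMPT.format(
    output_format_instruction="Answer with a single JSON object.",
    chunk_outputs="CHUNK OUTPUT 1: ...\nCHUNK OUTPUT 2: ...\n",
    input_fields_desc="- text: a text passage describing scientists",
    output_fields_desc="- name: the list of names for each scientist mentioned in the text",
)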
palimpzest/prompts/split_proposer_prompts.py ADDED

@@ -0,0 +1,55 @@
+"""This file contains prompts for SplitConvert operations on text inputs."""
+
+### BASE PROMPTS ###
+COT_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
+Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
+
+You will be provided with a description of each input field and each output field.
+
+An example is shown below:
+---
+INPUT FIELDS:
+{example_input_fields}
+
+OUTPUT FIELDS:
+{example_output_fields}
+
+CONTEXT:
+{example_context}
+
+Let's think step-by-step in order to answer the question.
+
+ANSWER: {example_answer}
+---
+"""
+
+COT_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
+Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
+
+You will be provided with a description of each input field and each output field.
+---
+INPUT FIELDS:
+{input_fields_desc}
+
+OUTPUT FIELDS:
+{output_fields_desc}
+
+CONTEXT:
+{context}
+
+Let's think step-by-step in order to answer the question.
+
+ANSWER: """
+
+
+### TEMPLATE INPUTS ###
+SPLIT_PROPOSER_JOB_INSTRUCTION = "produce an answer to a question"
+SPLIT_PROPOSER_EXAMPLE_INPUT_FIELDS = """- text: a text passage describing scientists"""
+SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS = """- name: the list of names for each scientist mentioned in the text
+- field_of_study: a list with the field of study for each scientist"""
+SPLIT_PROPOSER_EXAMPLE_CONTEXT = """{{
+    "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, born December 10, 1815 was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation."
+}}"""
+SPLIT_PROPOSER_EXAMPLE_ANSWER = """the text passage mentions the scientists "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and "Charles Babbage", both of whom were mathematicians. Therefore, the name output should be ["Augusta Ada King", "Charles Babbage"] and the field_of_study output should be ["Mathematician", "Mathematician"]."""