palimpzest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.0.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
@@ -107,7 +107,7 @@ class DataRecord:
107
107
 
108
108
 
109
109
  def __hash__(self):
110
- return hash(self.to_json_str())
110
+ return hash(self.to_json_str(bytes_to_str=True))
111
111
 
112
112
 
113
113
  def __iter__(self):
@@ -131,6 +131,9 @@ class DataRecord:
131
131
  cardinality_idx=self.cardinality_idx,
132
132
  )
133
133
 
134
+ # copy the passed_operator attribute
135
+ new_dr.passed_operator = self.passed_operator
136
+
134
137
  # get the set of fields to copy from the parent record
135
138
  copy_field_names = project_cols if project_cols is not None else self.get_field_names()
136
139
  copy_field_names = [field.split(".")[-1] for field in copy_field_names]
@@ -255,16 +258,16 @@ class DataRecord:
255
258
  for record in records
256
259
  ])
257
260
 
258
- def to_json_str(self, include_bytes: bool = True, project_cols: list[str] | None = None):
261
+ def to_json_str(self, include_bytes: bool = True, bytes_to_str: bool = False, project_cols: list[str] | None = None):
259
262
  """Return a JSON representation of this DataRecord"""
260
- record_dict = self.to_dict(include_bytes, project_cols)
263
+ record_dict = self.to_dict(include_bytes, bytes_to_str, project_cols)
261
264
  record_dict = {
262
265
  field_name: self.schema.field_to_json(field_name, field_value)
263
266
  for field_name, field_value in record_dict.items()
264
267
  }
265
268
  return json.dumps(record_dict, indent=2)
266
269
 
267
- def to_dict(self, include_bytes: bool = True, project_cols: list[str] | None = None):
270
+ def to_dict(self, include_bytes: bool = True, bytes_to_str: bool = False, project_cols: list[str] | None = None):
268
271
  """Return a dictionary representation of this DataRecord"""
269
272
  # TODO(chjun): In case of numpy types, the json.dumps will fail. Convert to native types.
270
273
  # Better ways to handle this.
@@ -276,9 +279,16 @@ class DataRecord:
276
279
 
277
280
  if not include_bytes:
278
281
  for k, v in dct.items():
279
- if isinstance(v, bytes) or (isinstance(v, list) and len(v) > 0 and isinstance(v[0], bytes)):
282
+ if isinstance(v, bytes) or (isinstance(v, list) and len(v) > 0 and any([isinstance(elt, bytes) for elt in v])):
280
283
  dct[k] = "<bytes>"
281
284
 
285
+ if bytes_to_str:
286
+ for k, v in dct.items():
287
+ if isinstance(v, bytes):
288
+ dct[k] = v.decode("utf-8")
289
+ elif isinstance(v, list) and len(v) > 0 and any([isinstance(elt, bytes) for elt in v]):
290
+ dct[k] = [elt.decode("utf-8") if isinstance(elt, bytes) else elt for elt in v]
291
+
282
292
  return dct
283
293
 
284
294
 
@@ -290,7 +300,12 @@ class DataRecordSet:
290
300
 
291
301
  The record_op_stats could be empty if the DataRecordSet is not from executing an operator.
292
302
  """
293
- def __init__(self, data_records: list[DataRecord], record_op_stats: list[RecordOpStats]):
303
+ def __init__(
304
+ self,
305
+ data_records: list[DataRecord],
306
+ record_op_stats: list[RecordOpStats],
307
+ field_to_score_fn: dict[str, str | callable] | None = None,
308
+ ):
294
309
  # check that all data_records are derived from the same parent record
295
310
  if len(data_records) > 0:
296
311
  parent_id = data_records[0].parent_id
@@ -302,20 +317,27 @@ class DataRecordSet:
302
317
  self.data_records = data_records
303
318
  self.parent_id = data_records[0].parent_id if len(data_records) > 0 else None
304
319
  self.source_idx = data_records[0].source_idx if len(data_records) > 0 else None
320
+ self.schema = data_records[0].schema if len(data_records) > 0 else None
305
321
 
306
322
  # set statistics for generating these records
307
323
  self.record_op_stats = record_op_stats
308
324
 
325
+ # assign field_to_score_fn if provided
326
+ self.field_to_score_fn = {} if field_to_score_fn is None else field_to_score_fn
309
327
 
310
- def __getitem__(self, slice):
311
- return self.data_records[slice]
328
+ def get_total_cost(self) -> float:
329
+ return sum([record_op_stats.cost_per_record for record_op_stats in self.record_op_stats])
312
330
 
331
+ def get_field_to_score_fn(self) -> dict[str, str | callable]:
332
+ return self.field_to_score_fn
313
333
 
314
- def __len__(self):
315
- return len(self.data_records)
334
+ def __getitem__(self, slice) -> DataRecord | list[DataRecord]:
335
+ return self.data_records[slice]
316
336
 
337
+ def __len__(self) -> int:
338
+ return len(self.data_records)
317
339
 
318
- def __iter__(self):
340
+ def __iter__(self) -> Generator[DataRecord]:
319
341
  yield from self.data_records
320
342
 
321
343
 
@@ -135,6 +135,7 @@ class ListField(Field):
135
135
  "element_type": element_type,
136
136
  "is_image_field": element_type.is_image_field,
137
137
  "type": list,
138
+ "_desc": desc,
138
139
  }
139
140
 
140
141
  return type(f"List[{element_type.__name__}]", (Field,), attrs)
@@ -348,6 +348,10 @@ class File(Schema):
348
348
  filename = StringField(desc="The UNIX-style name of the file")
349
349
  contents = BytesField(desc="The contents of the file")
350
350
 
351
+ class TextFile(Schema):
352
+ """A text file is a File that contains only text. No binary data."""
353
+ filename = StringField(desc="The UNIX-style name of the file")
354
+ contents = StringField(desc="The contents of the file")
351
355
 
352
356
  class Number(Schema):
353
357
  """Just a number. Often used for aggregates"""
@@ -418,9 +422,6 @@ class PDFFile(File):
418
422
  text_contents = StringField(desc="The text-only contents of the PDF")
419
423
 
420
424
 
421
- class TextFile(File):
422
- """A text file is a File that contains only text. No binary data."""
423
-
424
425
  list_of_numbers = ListField(NumericField)
425
426
  class XLSFile(File):
426
427
  """An XLS file is a File that contains one or more Excel spreadsheets."""
@@ -7,8 +7,6 @@ Be sure to cite information from the context as evidence of why your answers are
7
7
 
8
8
  You will be provided with a description of each input field and each output field.
9
9
 
10
- {output_format_instruction} Finish your response with a newline character followed by ---
11
-
12
10
  An example is shown below:
13
11
  ---
14
12
  INPUT FIELDS:
@@ -31,8 +29,6 @@ You will be presented with a context and a set of output fields to generate. You
31
29
  Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
32
30
 
33
31
  You will be provided with a description of each input field and each output field.
34
-
35
- {output_format_instruction} Finish your response with a newline character followed by ---
36
32
  ---
37
33
  INPUT FIELDS:
38
34
  {input_fields_desc}
@@ -75,6 +75,19 @@ from palimpzest.prompts.moa_proposer_convert_prompts import (
75
75
  COT_MOA_PROPOSER_IMAGE_JOB_INSTRUCTION,
76
76
  COT_MOA_PROPOSER_JOB_INSTRUCTION,
77
77
  )
78
+ from palimpzest.prompts.split_merge_prompts import (
79
+ COT_SPLIT_MERGER_BASE_SYSTEM_PROMPT,
80
+ COT_SPLIT_MERGER_BASE_USER_PROMPT,
81
+ )
82
+ from palimpzest.prompts.split_proposer_prompts import (
83
+ COT_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT,
84
+ COT_SPLIT_PROPOSER_BASE_USER_PROMPT,
85
+ SPLIT_PROPOSER_EXAMPLE_ANSWER,
86
+ SPLIT_PROPOSER_EXAMPLE_CONTEXT,
87
+ SPLIT_PROPOSER_EXAMPLE_INPUT_FIELDS,
88
+ SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS,
89
+ SPLIT_PROPOSER_JOB_INSTRUCTION,
90
+ )
78
91
  from palimpzest.prompts.util_phrases import (
79
92
  ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION,
80
93
  ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION,
@@ -96,6 +109,8 @@ class PromptFactory:
96
109
  PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_BASE_SYSTEM_PROMPT,
97
110
  PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_BASE_SYSTEM_PROMPT,
98
111
  PromptStrategy.COT_MOA_AGG: COT_MOA_AGG_BASE_SYSTEM_PROMPT,
112
+ PromptStrategy.SPLIT_PROPOSER: COT_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT,
113
+ PromptStrategy.SPLIT_MERGER: COT_SPLIT_MERGER_BASE_SYSTEM_PROMPT,
99
114
  }
100
115
  BASE_USER_PROMPT_MAP = {
101
116
  PromptStrategy.COT_BOOL: COT_BOOL_BASE_USER_PROMPT,
@@ -109,6 +124,8 @@ class PromptFactory:
109
124
  PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_BASE_USER_PROMPT,
110
125
  PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_BASE_USER_PROMPT,
111
126
  PromptStrategy.COT_MOA_AGG: COT_MOA_AGG_BASE_USER_PROMPT,
127
+ PromptStrategy.SPLIT_PROPOSER: COT_SPLIT_PROPOSER_BASE_USER_PROMPT,
128
+ PromptStrategy.SPLIT_MERGER: COT_SPLIT_MERGER_BASE_USER_PROMPT,
112
129
  }
113
130
 
114
131
  def __init__(self, prompt_strategy: PromptStrategy, model: Model, cardinality: Cardinality) -> None:
@@ -145,7 +162,7 @@ class PromptFactory:
145
162
  # NOTE: MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT is a rough estimate which leaves room for the rest of the prompt text
146
163
  while total_context_len * TOKENS_PER_CHARACTER > MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT:
147
164
  # sort fields by length
148
- field_lengths = [(field, len(value)) for field, value in context.items()]
165
+ field_lengths = [(field, len(value) if value is not None else 0) for field, value in context.items()]
149
166
  sorted_fields = sorted(field_lengths, key=lambda item: item[1], reverse=True)
150
167
 
151
168
  # get field with longest context
@@ -205,11 +222,7 @@ class PromptFactory:
205
222
  """
206
223
  output_fields_desc = ""
207
224
  output_schema: Schema = kwargs.get("output_schema")
208
- if (
209
- self.prompt_strategy.is_cot_qa_prompt()
210
- or self.prompt_strategy.is_moa_proposer_prompt()
211
- or self.prompt_strategy.is_moa_aggregator_prompt()
212
- ):
225
+ if self.prompt_strategy.is_convert_prompt():
213
226
  assert output_schema is not None, "Output schema must be provided for convert prompts."
214
227
 
215
228
  field_desc_map = output_schema.field_desc_map()
@@ -227,7 +240,7 @@ class PromptFactory:
227
240
  str | None: The filter condition (if applicable).
228
241
  """
229
242
  filter_condition = kwargs.get("filter_condition")
230
- if self.prompt_strategy.is_cot_bool_prompt():
243
+ if self.prompt_strategy.is_bool_prompt():
231
244
  assert filter_condition is not None, "Filter condition must be provided for filter operations."
232
245
 
233
246
  return filter_condition
@@ -284,6 +297,24 @@ class PromptFactory:
284
297
 
285
298
  return model_responses
286
299
 
300
+ def _get_chunk_outputs(self, **kwargs) -> str | None:
301
+ """
302
+ Returns the chunk outputs for the split-convert.
303
+
304
+ Args:
305
+ kwargs: The keyword arguments provided by the user.
306
+
307
+ Returns:
308
+ str | None: The chunk outputs.
309
+ """
310
+ chunk_outputs = None
311
+ if self.prompt_strategy.is_split_merger_prompt():
312
+ chunk_outputs = ""
313
+ for idx, chunk_output in enumerate(kwargs.get("chunk_outputs")):
314
+ chunk_outputs += f"CHUNK OUTPUT {idx + 1}: {chunk_output}\n"
315
+
316
+ return chunk_outputs
317
+
287
318
  def _get_output_format_instruction(self) -> str:
288
319
  """
289
320
  Returns the output format instruction based on the cardinality.
@@ -311,6 +342,7 @@ class PromptFactory:
311
342
  PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_JOB_INSTRUCTION,
312
343
  PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_JOB_INSTRUCTION,
313
344
  PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_JOB_INSTRUCTION,
345
+ PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_JOB_INSTRUCTION,
314
346
  }
315
347
  return prompt_strategy_to_job_instruction.get(self.prompt_strategy)
316
348
 
@@ -375,6 +407,7 @@ class PromptFactory:
375
407
  PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_EXAMPLE_INPUT_FIELDS,
376
408
  PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_EXAMPLE_INPUT_FIELDS,
377
409
  PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_EXAMPLE_INPUT_FIELDS,
410
+ PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_EXAMPLE_INPUT_FIELDS,
378
411
  }
379
412
 
380
413
  return prompt_strategy_to_example_input_fields.get(self.prompt_strategy)
@@ -391,6 +424,7 @@ class PromptFactory:
391
424
  PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_EXAMPLE_OUTPUT_FIELDS,
392
425
  PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_EXAMPLE_OUTPUT_FIELDS,
393
426
  PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_EXAMPLE_OUTPUT_FIELDS,
427
+ PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS,
394
428
  }
395
429
 
396
430
  return prompt_strategy_to_example_output_fields.get(self.prompt_strategy)
@@ -409,6 +443,7 @@ class PromptFactory:
409
443
  PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_EXAMPLE_CONTEXT,
410
444
  PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_EXAMPLE_CONTEXT,
411
445
  PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_EXAMPLE_CONTEXT,
446
+ PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_EXAMPLE_CONTEXT,
412
447
  }
413
448
 
414
449
  return prompt_strategy_to_example_context.get(self.prompt_strategy)
@@ -471,6 +506,7 @@ class PromptFactory:
471
506
  PromptStrategy.COT_QA_IMAGE: COT_QA_IMAGE_EXAMPLE_ANSWER,
472
507
  PromptStrategy.COT_MOA_PROPOSER: COT_MOA_PROPOSER_EXAMPLE_ANSWER,
473
508
  PromptStrategy.COT_MOA_PROPOSER_IMAGE: COT_MOA_PROPOSER_IMAGE_EXAMPLE_ANSWER,
509
+ PromptStrategy.SPLIT_PROPOSER: SPLIT_PROPOSER_EXAMPLE_ANSWER,
474
510
  }
475
511
 
476
512
  return prompt_strategy_to_example_answer.get(self.prompt_strategy)
@@ -499,6 +535,7 @@ class PromptFactory:
499
535
  "original_output": self._get_original_output(**kwargs),
500
536
  "critique_output": self._get_critique_output(**kwargs),
501
537
  "model_responses": self._get_model_responses(**kwargs),
538
+ "chunk_outputs": self._get_chunk_outputs(**kwargs),
502
539
  }
503
540
 
504
541
  # get format kwargs which depend on the prompt strategy
@@ -0,0 +1,56 @@
1
+ """This file contains prompts for SplitConvert aggregator operations."""
2
+
3
+ ### SYSTEM PROMPTS ###
4
+ COT_SPLIT_MERGER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
5
+ You will be presented with one or more outputs produced by a set of models operating on chunks of an input. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
6
+ It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
7
+
8
+ You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the model responses.
9
+
10
+ {output_format_instruction} Finish your response with a newline character followed by ---
11
+
12
+ An example is shown below:
13
+ ---
14
+ CHUNK 1 OUTPUT: the text mentions the scientists "Augusta Ada King, Countess of Lovelace" and "Charles Babbage". It states that King was an English mathematician who worked on Babbage's Analytical Engine.
15
+
16
+ CHUNK 2 OUTPUT: the text passage mentions the scientist "Charles Babbage", who was a mathematician. Therefore, the name output should be ["Charles Babbage"] and the field_of_study output should be ["Mathematician"].
17
+
18
+ INPUT FIELDS:
19
+ - text: a text passage describing scientists
20
+
21
+ OUTPUT FIELDS:
22
+ - name: the list of names for each scientist mentioned in the text
23
+ - field_of_study: a list with the field of study for each scientist
24
+
25
+ Let's think step-by-step in order to answer the question.
26
+
27
+ REASONING: Looking at both chunk outputs, they specify that the scientists' formal names are "Augusta Ada King" and "Charles Babbage". Chunk Output 2 indicates that Charles Babbage was a Mathematician and Chunk Output 1 says that Augusta Ada King was an English mathematician. Therefore, the name output should be ["Augusta Ada King", "Charles Babbage"] and the field_of_study output should be ["Mathematician", "Mathematician"].
28
+
29
+ ANSWER:
30
+ {{
31
+ "name": ["Augusta Ada King", "Charles Babbage"],
32
+ "field_of_study": ["Mathematician", "Mathematician"]
33
+ }}
34
+ ---
35
+ """
36
+
37
+ ### USER / INSTANCE-SPECIFIC PROMPTS ###
38
+ COT_SPLIT_MERGER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
39
+ You will be presented with one or more outputs produced by a set of models. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
40
+ It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased, incorrect, or contain duplicates.
41
+
42
+ You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the model responses.
43
+
44
+ {output_format_instruction} Finish your response with a newline character followed by ---
45
+ ---
46
+ {chunk_outputs}
47
+
48
+ INPUT FIELDS:
49
+ {input_fields_desc}
50
+
51
+ OUTPUT FIELDS:
52
+ {output_fields_desc}
53
+
54
+ Let's think step-by-step in order to answer the question.
55
+
56
+ REASONING: """
@@ -0,0 +1,55 @@
1
+ """This file contains prompts for SplitConvert operations on text inputs."""
2
+
3
+ ### BASE PROMPTS ###
4
+ COT_SPLIT_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
5
+ You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
6
+ Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
7
+
8
+ You will be provided with a description of each input field and each output field.
9
+
10
+ An example is shown below:
11
+ ---
12
+ INPUT FIELDS:
13
+ {example_input_fields}
14
+
15
+ OUTPUT FIELDS:
16
+ {example_output_fields}
17
+
18
+ CONTEXT:
19
+ {example_context}
20
+
21
+ Let's think step-by-step in order to answer the question.
22
+
23
+ ANSWER: {example_answer}
24
+ ---
25
+ """
26
+
27
+ COT_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
28
+ You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
29
+ Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
30
+
31
+ You will be provided with a description of each input field and each output field.
32
+ ---
33
+ INPUT FIELDS:
34
+ {input_fields_desc}
35
+
36
+ OUTPUT FIELDS:
37
+ {output_fields_desc}
38
+
39
+ CONTEXT:
40
+ {context}
41
+
42
+ Let's think step-by-step in order to answer the question.
43
+
44
+ ANSWER: """
45
+
46
+
47
+ ### TEMPLATE INPUTS ###
48
+ SPLIT_PROPOSER_JOB_INSTRUCTION = "produce an answer to a question"
49
+ SPLIT_PROPOSER_EXAMPLE_INPUT_FIELDS = """- text: a text passage describing scientists"""
50
+ SPLIT_PROPOSER_EXAMPLE_OUTPUT_FIELDS = """- name: the list of names for each scientist mentioned in the text
51
+ - field_of_study: a list with the field of study for each scientist"""
52
+ SPLIT_PROPOSER_EXAMPLE_CONTEXT = """{{
53
+ "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, born December 10, 1815 was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation."
54
+ }}"""
55
+ SPLIT_PROPOSER_EXAMPLE_ANSWER = """the text passage mentions the scientists "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and "Charles Babbage", both of whom were mathematicians. Therefore, the name output should be ["Augusta Ada King", "Charles Babbage"] and the field_of_study output should be ["Mathematician", "Mathematician"]."""