palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.21.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,306 @@
1
+ import json
2
+ import time
3
+
4
+ import litellm
5
+ from colorama import Fore, Style
6
+
7
+ from palimpzest.constants import MODEL_CARDS, Cardinality, Model, PromptStrategy
8
+ from palimpzest.core.elements.records import DataRecord
9
+ from palimpzest.core.models import GenerationStats
10
+ from palimpzest.prompts import (
11
+ FLAT_MAP_IMAGE_VALIDATOR_PROMPT,
12
+ FLAT_MAP_VALIDATOR_PROMPT,
13
+ MAP_IMAGE_VALIDATOR_PROMPT,
14
+ MAP_VALIDATOR_PROMPT,
15
+ RETRIEVE_VALIDATOR_PROMPT,
16
+ PromptFactory,
17
+ )
18
+ from palimpzest.query.generators.generators import get_json_from_answer
19
+ from palimpzest.query.operators.convert import LLMConvert
20
+ from palimpzest.query.operators.filter import LLMFilter
21
+ from palimpzest.query.operators.join import JoinOp
22
+ from palimpzest.query.operators.retrieve import RetrieveOp
23
+
24
+
25
class Validator:
    """
    The Validator is used during optimization to score the output of physical operator(s) and physical plan(s).

    Each operator type has a public hook (`map_score_fn`, `flat_map_score_fn`, `filter_score_fn`,
    `join_score_fn`, `retrieve_score_fn`). In this base class every hook raises NotImplementedError.
    The private `_score_*` methods call the hook first and fall back to the matching
    `_default_*_score_fn` when the hook raises NotImplementedError:

      - map / flat map / retrieve defaults ask an LLM judge (via litellm) to grade the output;
      - filter / join defaults re-run a copy of the operator with the judge model and compare labels.

    TODO: support end-to-end labels; will likely require a different SentinelExecutionStrategy which
          executes the full input to produce an output, evaluates the output, and then updates
          intermediate operator(s) based on the evaluation.
    """
    def __init__(self, model: Model = Model.o4_MINI):
        # NOTE(review): `self.model` is only used for the cost lookup in MODEL_CARDS and for the
        # `model_name` reported in GenerationStats. The default scorers below still hard-code
        # Model.o4_MINI / "openai/o4-mini" for the judge itself -- confirm whether they should
        # follow `self.model` instead.
        self.model = model
        # label caches for the default filter / join scorers, keyed by a hash of (predicate, input record(s))
        self.filter_cache = {}
        self.join_cache = {}

    # ------------------------------------------------------------------
    # public hooks; a hook may return either a score or a (score, GenerationStats) tuple
    # ------------------------------------------------------------------
    def map_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
        """Score a map output for `fields` given `input_record`; override to provide custom scoring."""
        raise NotImplementedError("Validator.map_score_fn not implemented.")

    def flat_map_score_fn(self, fields: list[str], input_record: dict, output: list[dict]) -> float | None:
        """Score the list of flat-map outputs for `fields` given `input_record`; override to customize."""
        raise NotImplementedError("Validator.flat_map_score_fn not implemented.")

    def filter_score_fn(self, filter_str: str, input_record: dict, output: bool) -> float | None:
        """Score a filter decision `output` for `filter_str` on `input_record`; override to customize."""
        raise NotImplementedError("Validator.filter_score_fn not implemented.")

    def join_score_fn(self, condition: str, left_input_record: dict, right_input_record: dict, output: bool) -> float | None:
        """Score a join decision `output` for `condition` on the two input records; override to customize."""
        raise NotImplementedError("Validator.join_score_fn not implemented.")

    def retrieve_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
        """Score a retrieve output for `fields` given `input_record`; override to provide custom scoring."""
        # the original message named map_score_fn here (copy-paste slip)
        raise NotImplementedError("Validator.retrieve_score_fn not implemented.")

    # ------------------------------------------------------------------
    # private helpers shared by the default scorers
    # ------------------------------------------------------------------
    @staticmethod
    def _logger():
        """Return this module's logger."""
        # imported locally so the class does not rely on a file-level `import logging`
        import logging

        return logging.getLogger(__name__)

    def _log_scoring_failure(self, scorer: str) -> None:
        """Record (with traceback) that a default scorer failed; the scorer still returns score=None."""
        self._logger().warning("Validator.%s failed; returning score=None", scorer, exc_info=True)

    @staticmethod
    def _mean_quality(qualities) -> float | None:
        """Return the arithmetic mean of `qualities`, or None when there is nothing to average."""
        qualities = list(qualities)
        return sum(qualities) / len(qualities) if qualities else None

    @staticmethod
    def _split_hook_result(out) -> tuple:
        """Normalize a hook's return value to `(score, GenerationStats)`."""
        return out if isinstance(out, tuple) else (out, GenerationStats())

    def _get_gen_stats_from_completion(self, completion, start_time: float) -> GenerationStats:
        """
        Extract generation stats from the given completion response.

        `start_time` is the `time.time()` value taken just before the request was issued.
        """
        usage = completion.usage.model_dump()

        # get cost per input/output token for the model and parse number of input and output tokens
        model_card = MODEL_CARDS[self.model.value]
        input_tokens = usage["prompt_tokens"]
        output_tokens = usage["completion_tokens"]
        input_cost = input_tokens * model_card["usd_per_input_token"]
        output_cost = output_tokens * model_card["usd_per_output_token"]

        return GenerationStats(
            model_name=self.model.value,
            llm_call_duration_secs=time.time() - start_time,
            fn_call_duration_secs=0.0,
            total_input_tokens=input_tokens,
            total_output_tokens=output_tokens,
            total_input_cost=input_cost,
            total_output_cost=output_cost,
            cost_per_record=input_cost + output_cost,
            total_llm_calls=1,
        )

    def _gen_stats_from_record_op_stats(self, record_op_stats) -> GenerationStats:
        """Copy the token / cost / timing counters of one operator execution into a GenerationStats."""
        return GenerationStats(
            model_name=self.model.value,
            total_input_tokens=record_op_stats.total_input_tokens,
            total_output_tokens=record_op_stats.total_output_tokens,
            total_input_cost=record_op_stats.total_input_cost,
            total_output_cost=record_op_stats.total_output_cost,
            cost_per_record=record_op_stats.cost_per_record,
            llm_call_duration_secs=record_op_stats.llm_call_duration_secs,
            fn_call_duration_secs=record_op_stats.fn_call_duration_secs,
            total_llm_calls=record_op_stats.total_llm_calls,
        )

    def _build_judge_inputs(self, op, prompt_strategy: PromptStrategy, cardinality: Cardinality, fields: list[str], input_record: DataRecord, output, output_header: str) -> tuple[list[dict], str]:
        """
        Build the non-system prompt messages for `input_record` plus the trailing message which
        shows the operator's `output` (JSON-encoded) to the judge.
        """
        factory = PromptFactory(prompt_strategy, Model.o4_MINI, cardinality)

        # get the input messages; strip out the system message(s)
        msg_kwargs = {"output_schema": op.output_schema, "project_cols": op.get_input_fields()}
        messages = factory.create_messages(input_record, fields, **msg_kwargs)
        input_messages = [msg for msg in messages if msg["role"] != "system"]

        output_message = f"{output_header}:\n--------\n{json.dumps(output, indent=2)}\n\nEVALUATION: "
        return input_messages, output_message

    def _invoke_judge(self, validator_prompt: str, input_messages: list[dict], output_message: str) -> tuple[str, GenerationStats]:
        """Send the validator prompt + inputs + output to the judge; return its text and the call's stats."""
        start_time = time.time()
        val_messages = [{"role": "system", "content": validator_prompt}] + input_messages + [{"role": "user", "content": output_message}]
        completion = litellm.completion(model="openai/o4-mini", messages=val_messages)
        completion_text = completion.choices[0].message.content
        gen_stats = self._get_gen_stats_from_completion(completion, start_time)

        # Debug-level logging replaces the unconditional print() calls. The prompt is not joined
        # into one string any more: message content may be a list (e.g. image parts), which made
        # the old '\n'.join(...) raise before the judge was ever called.
        self._logger().debug("Validator judge response:\n%s", completion_text)
        return completion_text, gen_stats

    # ------------------------------------------------------------------
    # default scorers
    # ------------------------------------------------------------------
    def _default_map_score_fn(self, op: LLMConvert, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
        """
        Compute the quality of the generated output for the given fields and input_record.

        Returns `(score, gen_stats)`; the score is the mean of the judge's per-key values and is
        None when the judge call or the parsing of its answer fails.
        """
        is_image = op.is_image_conversion()
        prompt_strategy = PromptStrategy.COT_QA_IMAGE if is_image else PromptStrategy.COT_QA
        input_messages, output_message = self._build_judge_inputs(
            op, prompt_strategy, Cardinality.ONE_TO_ONE, fields, input_record, output, "OUTPUT"
        )

        # invoke the judge; gen_stats is kept even if parsing the evaluation fails afterwards
        score, gen_stats = None, GenerationStats()
        try:
            validator_prompt = MAP_IMAGE_VALIDATOR_PROMPT if is_image else MAP_VALIDATOR_PROMPT
            completion_text, gen_stats = self._invoke_judge(validator_prompt, input_messages, output_message)

            # parse the evaluation
            eval_dict: dict = get_json_from_answer(completion_text, Model.o4_MINI, Cardinality.ONE_TO_ONE)
            score = self._mean_quality(eval_dict.values())

        except Exception:
            # best effort: a failed judgement yields score=None rather than aborting optimization
            self._log_scoring_failure("_default_map_score_fn")

        return score, gen_stats

    def _default_flat_map_score_fn(self, op: LLMConvert, fields: list[str], input_record: DataRecord, output: list[dict]) -> tuple[float | None, GenerationStats]:
        """
        Compute the quality of the list of generated outputs for the given fields and input_record.

        Returns `(score, gen_stats)`; the score is the mean over every value of every per-record
        evaluation returned by the judge, and is None when the judge call or parsing fails.
        """
        is_image = op.is_image_conversion()
        prompt_strategy = PromptStrategy.COT_QA_IMAGE if is_image else PromptStrategy.COT_QA
        input_messages, output_message = self._build_judge_inputs(
            op, prompt_strategy, Cardinality.ONE_TO_MANY, fields, input_record, output, "OUTPUTS"
        )

        # invoke the judge; gen_stats is kept even if parsing the evaluation fails afterwards
        score, gen_stats = None, GenerationStats()
        try:
            validator_prompt = FLAT_MAP_IMAGE_VALIDATOR_PROMPT if is_image else FLAT_MAP_VALIDATOR_PROMPT
            completion_text, gen_stats = self._invoke_judge(validator_prompt, input_messages, output_message)

            # parse the evaluation
            eval_dicts: list[dict] = get_json_from_answer(completion_text, Model.o4_MINI, Cardinality.ONE_TO_MANY)
            all_qualities = [quality for record_eval_dict in eval_dicts for quality in record_eval_dict.values()]
            score = self._mean_quality(all_qualities)

        except Exception:
            self._log_scoring_failure("_default_flat_map_score_fn")

        return score, gen_stats

    def _default_filter_score_fn(self, op: LLMFilter, filter_str: str, input_record: DataRecord, output: bool) -> tuple[float | None, GenerationStats]:
        """
        Score a filter decision by comparing it with the label produced by the judge model.

        The label for a given (filter_str, input_record) pair is computed once by running a copy of
        `op` with the judge model and is cached in `self.filter_cache`. Returns `(score, gen_stats)`
        where the score is 1.0 if `output` matches the label, 0.0 if not, and None if labelling failed.
        """
        score, gen_stats = None, GenerationStats()
        filter_input_hash = hash(f"{filter_str}{hash(input_record)}")
        label = self.filter_cache.get(filter_input_hash, None)

        # cache hit: no LLM call is made, so the stats stay empty
        if label is not None:
            return float(label == output), gen_stats

        validator_op: LLMFilter = op.copy()
        validator_op.model = Model.o4_MINI
        try:
            target_record_set = validator_op(input_record)
            label = target_record_set[0].passed_operator
            self.filter_cache[filter_input_hash] = label
            score = float(label == output)
            gen_stats = self._gen_stats_from_record_op_stats(target_record_set.record_op_stats[0])

        except Exception:
            self._log_scoring_failure("_default_filter_score_fn")

        return score, gen_stats

    def _default_join_score_fn(self, op: JoinOp, condition: str, left_input_record: DataRecord, right_input_record: DataRecord, output: bool) -> tuple[float | None, GenerationStats]:
        """
        Score a join decision by comparing it with the label produced by the judge model.

        The label for a given (condition, left record, right record) triple is computed once by
        running a copy of `op` with the judge model and is cached in `self.join_cache`. Returns
        `(score, gen_stats)` where the score is 1.0 if `output` matches the label, 0.0 if not, and
        None if labelling failed.
        """
        score, gen_stats = None, GenerationStats()
        join_input_hash = hash(f"{condition}{hash(left_input_record)}{hash(right_input_record)}")
        label = self.join_cache.get(join_input_hash, None)

        # cache hit: no LLM call is made, so the stats stay empty
        if label is not None:
            return float(label == output), gen_stats

        validator_op: JoinOp = op.copy()
        validator_op.model = Model.o4_MINI
        try:
            target_record_set = validator_op([left_input_record], [right_input_record])
            label = target_record_set[0].passed_operator
            self.join_cache[join_input_hash] = label
            score = float(label == output)
            gen_stats = self._gen_stats_from_record_op_stats(target_record_set.record_op_stats[0])

        except Exception:
            self._log_scoring_failure("_default_join_score_fn")

        return score, gen_stats

    def _default_retrieve_score_fn(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
        """
        Compute the quality of the retrieved output for the given fields and input_record.

        Returns `(score, gen_stats)`; the score is the mean of the judge's per-key values and is
        None when the judge call or the parsing of its answer fails.
        """
        # TODO: retrieve k=25; score each item based on relevance; compute F1
        # TODO: support retrieval over images
        input_messages, output_message = self._build_judge_inputs(
            op, PromptStrategy.COT_QA, Cardinality.ONE_TO_ONE, fields, input_record, output, "OUTPUT"
        )

        # invoke the judge; gen_stats is kept even if parsing the evaluation fails afterwards
        score, gen_stats = None, GenerationStats()
        try:
            completion_text, gen_stats = self._invoke_judge(RETRIEVE_VALIDATOR_PROMPT, input_messages, output_message)

            # parse the evaluation
            eval_dict: dict = get_json_from_answer(completion_text, Model.o4_MINI, Cardinality.ONE_TO_ONE)
            score = self._mean_quality(eval_dict.values())

        except Exception:
            self._log_scoring_failure("_default_retrieve_score_fn")

        return score, gen_stats

    # ------------------------------------------------------------------
    # entry points: try the user hook, fall back to the default scorer.
    # `full_hash` is passed through untouched so callers can match results to requests.
    # ------------------------------------------------------------------
    def _score_map(self, op: LLMConvert, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
        """Score a map output with `map_score_fn`, or with the default scorer if it is not implemented."""
        try:
            score, gen_stats = self._split_hook_result(self.map_score_fn(fields, input_record.to_dict(), output))
        except NotImplementedError:
            score, gen_stats = self._default_map_score_fn(op, fields, input_record, output)
        return score, gen_stats, full_hash

    def _score_flat_map(self, op: LLMConvert, fields: list[str], input_record: DataRecord, output: list[dict], full_hash: str) -> tuple[float | None, GenerationStats, str]:
        """Score flat-map outputs with `flat_map_score_fn`, or with the default scorer if it is not implemented."""
        try:
            score, gen_stats = self._split_hook_result(self.flat_map_score_fn(fields, input_record.to_dict(), output))
        except NotImplementedError:
            score, gen_stats = self._default_flat_map_score_fn(op, fields, input_record, output)
        return score, gen_stats, full_hash

    def _score_filter(self, op: LLMFilter, filter_str: str, input_record: DataRecord, output: bool, full_hash: str) -> tuple[float | None, GenerationStats, str]:
        """Score a filter decision with `filter_score_fn`, or with the default scorer if it is not implemented."""
        try:
            score, gen_stats = self._split_hook_result(self.filter_score_fn(filter_str, input_record.to_dict(), output))
        except NotImplementedError:
            score, gen_stats = self._default_filter_score_fn(op, filter_str, input_record, output)
        return score, gen_stats, full_hash

    def _score_join(self, op: JoinOp, condition: str, left_input_record: DataRecord, right_input_record: DataRecord, output: bool, full_hash: str) -> tuple[float | None, GenerationStats, str]:
        """Score a join decision with `join_score_fn`, or with the default scorer if it is not implemented."""
        try:
            score, gen_stats = self._split_hook_result(
                self.join_score_fn(condition, left_input_record.to_dict(), right_input_record.to_dict(), output)
            )
        except NotImplementedError:
            score, gen_stats = self._default_join_score_fn(op, condition, left_input_record, right_input_record, output)
        return score, gen_stats, full_hash

    def _score_retrieve(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
        """Score a retrieve output with `retrieve_score_fn`, or with the default scorer if it is not implemented."""
        try:
            score, gen_stats = self._split_hook_result(self.retrieve_score_fn(fields, input_record.to_dict(), output))
        except NotImplementedError:
            score, gen_stats = self._default_retrieve_score_fn(op, fields, input_record, output)
        return score, gen_stats, full_hash
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: palimpzest
3
- Version: 0.7.21
3
+ Version: 0.8.0
4
4
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
5
5
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
6
6
  Project-URL: homepage, https://palimpzest.org
@@ -15,12 +15,14 @@ Classifier: Programming Language :: Python :: 3.8
15
15
  Requires-Python: >=3.8
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
+ Requires-Dist: anthropic>=0.55.0
18
19
  Requires-Dist: beautifulsoup4>=4.13.4
19
20
  Requires-Dist: chromadb>=1.0.15
20
21
  Requires-Dist: colorama>=0.4.6
21
22
  Requires-Dist: datasets>=4.0.0
22
23
  Requires-Dist: fastapi~=0.115.0
23
24
  Requires-Dist: gradio>=5.26.0
25
+ Requires-Dist: litellm>=1.73.1
24
26
  Requires-Dist: numpy==2.0.2
25
27
  Requires-Dist: openai>=1.0
26
28
  Requires-Dist: pandas>=2.1.1
@@ -37,6 +39,7 @@ Requires-Dist: requests>=2.25
37
39
  Requires-Dist: ruff>=0.9.0
38
40
  Requires-Dist: sentence-transformers==5.0.0
39
41
  Requires-Dist: setuptools>=70.1.1
42
+ Requires-Dist: smolagents[toolkit]
40
43
  Requires-Dist: tabulate>=0.9.0
41
44
  Requires-Dist: together>=1.5.5
42
45
  Requires-Dist: tqdm~=4.66.1
@@ -46,6 +49,8 @@ Requires-Dist: mkdocs>=1.6.1; extra == "docs"
46
49
  Requires-Dist: mkdocs-material>=9.6.3; extra == "docs"
47
50
  Requires-Dist: mkdocstrings-python>=1.15.0; extra == "docs"
48
51
  Requires-Dist: mkdocs-material[imaging]; extra == "docs"
52
+ Provides-Extra: vllm
53
+ Requires-Dist: vllm>=0.10.1.1; extra == "vllm"
49
54
  Dynamic: license-file
50
55
 
51
56
  ![pz-banner](https://palimpzest-workloads.s3.us-east-1.amazonaws.com/palimpzest-cropped.png)
@@ -0,0 +1,95 @@
1
+ palimpzest/__init__.py,sha256=1PzadDDOVMQJKNEYUH0_tw8tQKUYTT31M0vuzTr2Rqk,1694
2
+ palimpzest/constants.py,sha256=1xGydUfkuVtaeoQ_Ku6P5PDLAelQKAVouivdXkva-zE,21109
3
+ palimpzest/policy.py,sha256=lIvw_C_rmwCH4LZaeNkAuixl8zw9RAW_JcSWSHPjKyc,11628
4
+ palimpzest/agents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ palimpzest/agents/compute_agents.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ palimpzest/agents/search_agents.py,sha256=t2QMreB5Ph71aoNk5bBtV-0l8im79z-pMAR3JDAySDw,29418
7
+ palimpzest/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ palimpzest/core/models.py,sha256=fLO4T7x0njNeEbUpbhJm9cdnBva0y0Zw5WGBGdzdS_I,42442
9
+ palimpzest/core/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ palimpzest/core/data/context.py,sha256=x1xYyu9qW65dvtK_XayIfv_CgsCEPW6Qe0DTiSf9sjU,16207
11
+ palimpzest/core/data/context_manager.py,sha256=8hAKWD2jhFZgghTu7AYgjkvKDsJUPVxq8g4nG0HWvfo,6150
12
+ palimpzest/core/data/dataset.py,sha256=vqEEMxaG157jdyzUxM_tLt5Xq_49Yq-0dVGhS0ZUiHA,27904
13
+ palimpzest/core/data/index_dataset.py,sha256=adO67DgzHhA4lBME0-h4SjXfdz9UcNMSDGXTpUdKbgE,1929
14
+ palimpzest/core/data/iter_dataset.py,sha256=u7eZNWWT84rH_D8LNIuq0NAnm2roX81ifKTYp-hwY7g,20512
15
+ palimpzest/core/elements/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ palimpzest/core/elements/filters.py,sha256=fU2x0eWDwfP52_5fUmqJXTuhs4H0vvHtPZLdA3IIw8I,1642
17
+ palimpzest/core/elements/groupbysig.py,sha256=Fcbt1GSAkxIILS5me0sWaNMLiptLJ6NIY6BIC0S-g3k,2318
18
+ palimpzest/core/elements/records.py,sha256=Su4d8GRNp5-Q1nSWANm-jehyw53tHUl20iGluyEk8NI,19053
19
+ palimpzest/core/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ palimpzest/core/lib/schemas.py,sha256=0qauaG3uW5tCJXNAo1i0G0UgbTaQLSLT6GoNDX8494Q,8376
21
+ palimpzest/prompts/__init__.py,sha256=sdZbC8RWi_IGjFuzKQMdRjS2Ih4zQnkyzFoJ6Q3Ce70,1764
22
+ palimpzest/prompts/agent_prompts.py,sha256=CUzBVLBiPSw8OShtKp4VTpQwtrNMtcMglo-IZHMvuDM,17459
23
+ palimpzest/prompts/context_search.py,sha256=s3pti4XNRiIyiWzjVNL_NqmqEc31jzSKMF2SlN0Aaf8,357
24
+ palimpzest/prompts/convert_prompts.py,sha256=FR_zUADuOWxMqZED4S0lyO9VNgKPNiVpSZv6ND7a0v4,6009
25
+ palimpzest/prompts/critique_and_refine_convert_prompts.py,sha256=WoXExBxQ7twswd9VCCST26c-2ehZtpD2iQoBi7sqDnQ,7814
26
+ palimpzest/prompts/filter_prompts.py,sha256=lYQFrpAKhOMUQDOVbRBHh7IjuUNMCmBnAqHwDuptQHI,4232
27
+ palimpzest/prompts/join_prompts.py,sha256=viQVvOpa2l9PYM34ua_jPNZnUOU_eCTMIoabBkF5cVc,5929
28
+ palimpzest/prompts/moa_aggregator_convert_prompts.py,sha256=BQRrtGdr53PTqvXzmFh8kfQ_w9KoKw-zTtmdo-8RFjo,2887
29
+ palimpzest/prompts/moa_proposer_convert_prompts.py,sha256=35pxtR2hnjLkv_10VEetRR9qUCR-zD85NZF3BaAANDk,3462
30
+ palimpzest/prompts/prompt_factory.py,sha256=FDBoVdJ_khT7t6T6WAiK6RgC7HqB3efmRkwMam3AIhM,51262
31
+ palimpzest/prompts/split_merge_prompts.py,sha256=0mTZeJhxtvlmv-ro0KwQpxlGgSTwyUhGRHJ-uHk2Zlw,3146
32
+ palimpzest/prompts/split_proposer_prompts.py,sha256=TBHLGaM_ycHjGHrp1JziJoJDw4S5_F4afKSAdt2McKk,2624
33
+ palimpzest/prompts/util_phrases.py,sha256=NWrcHfjJyiOY16Jyt7R50moVnlJDyvSBZ9kBqyX2WQo,751
34
+ palimpzest/prompts/validator.py,sha256=pJTZjlt_OiFM3IFOgsJ0jQdayra8iRVrpqENlXI9tQQ,10532
35
+ palimpzest/query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
+ palimpzest/query/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ palimpzest/query/execution/all_sample_execution_strategy.py,sha256=3n2hl8m-WFWIu-a8DiSVsGkz4ej3yB7mSdFR0jsiwAU,14366
38
+ palimpzest/query/execution/execution_strategy.py,sha256=KwBJbWOBOOPBiWRm3ypHcAQiWbCsvtW6UnVU4tHkYz8,18905
39
+ palimpzest/query/execution/execution_strategy_type.py,sha256=vRQBPCQN5_aoyD3TLIeW3VPo15mqF-5RBvEXkENz9FE,987
40
+ palimpzest/query/execution/mab_execution_strategy.py,sha256=LY1JlbYMsnJHCtYjaJ6iklojBqXc2B4KS62lobPFNz0,42341
41
+ palimpzest/query/execution/parallel_execution_strategy.py,sha256=Gn5hB5XddX2jCkxx6d7O-DmitK6fbuwBFnnyKhnGYEw,15706
42
+ palimpzest/query/execution/single_threaded_execution_strategy.py,sha256=1eo-Z9G3u92_PjoSX8HmO3D3phYgA8f0Actbgd1-oKY,16247
43
+ palimpzest/query/generators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
+ palimpzest/query/generators/generators.py,sha256=pi6gTCzQYs-z93IFNGKyoskIcdYCSnOwyaj-DvSlkb4,20877
45
+ palimpzest/query/operators/__init__.py,sha256=j-yh0P5tzXGa0JU_g8aNn54wCJDXPCMbmtOmazXXEts,3459
46
+ palimpzest/query/operators/aggregate.py,sha256=QvWr4C1arFSZWVqPSF5F5On6Ise5OF3VVWNGSq6Gfjk,11230
47
+ palimpzest/query/operators/compute.py,sha256=bxMKLRU_o7v603daKeR0FayDZ_V6NLI1fGzgu6E-sac,8473
48
+ palimpzest/query/operators/convert.py,sha256=teesuAeYl20ULwm6LIA277SZremdHedD2N2GYDUjb5E,17156
49
+ palimpzest/query/operators/critique_and_refine_convert.py,sha256=nJOQf7RLJR5Acg7fPssb0tTmtsCipG8hHu9PRquM9RE,5271
50
+ palimpzest/query/operators/distinct.py,sha256=MuF3NlC0QMTSGs0_fe2oly0I5Ow0hfOa7h8BFGhHiCs,2594
51
+ palimpzest/query/operators/filter.py,sha256=Wm1PaxURE1ZY5j7E1AitGdJfb_IKJoC_3qQW8aF0XC4,10703
52
+ palimpzest/query/operators/join.py,sha256=z1bzhdazTEq1BjoUSwV6j_DQ84TJ3uaSZJpCzSP61nc,17727
53
+ palimpzest/query/operators/limit.py,sha256=upJ775cGkxjFHRJm8GpSvtJN1cspg2FVYLN_MrIfUo4,2113
54
+ palimpzest/query/operators/logical.py,sha256=rh3XBUVO1JAEijw9AHjU35uf5ag01-KONdpCHJXRs3M,19883
55
+ palimpzest/query/operators/mixture_of_agents_convert.py,sha256=Y6O9-zL_6BPwl5Yix3SyYhI_68wiejOtJ3xuFcn_dbs,6731
56
+ palimpzest/query/operators/physical.py,sha256=buPZjtP4HKNVfOCNWdBtDnRS217dSsIG74gqZ1jmoyo,8320
57
+ palimpzest/query/operators/project.py,sha256=RX5SbHFRwHcMfiQRofIPQr-AHgIDYm68ifiFZAPu7Fo,2094
58
+ palimpzest/query/operators/rag_convert.py,sha256=1QQGrE22-Ec3-MNbnaU3k4TGHdpi2qZqZR9MHUniEM4,10691
59
+ palimpzest/query/operators/retrieve.py,sha256=v1FTFsSctqH4B37aWgBXYIxgOMJwRWQ2kwwXu1huwaQ,13106
60
+ palimpzest/query/operators/scan.py,sha256=Da_EZUrArzlAameHYCmtqo-xbPOFvbTYSktrUcUEUSc,7398
61
+ palimpzest/query/operators/search.py,sha256=xydO5Kni0RArpvLSn2ajzD4TcH442VjpP2x9NakjzaA,22842
62
+ palimpzest/query/operators/split_convert.py,sha256=SgtkwGWnIFlQTk96NsgckRx5q15KaGpsF3Si0FzHEGo,7765
63
+ palimpzest/query/optimizer/__init__.py,sha256=L2E1rOA-8O9oH6JL56wLI1qUVxXBLubJEG1IHMH-HU4,2384
64
+ palimpzest/query/optimizer/cost_model.py,sha256=OldPy-TJdfsQbYRoKlb3yWeKbi15jcldTIUS6BTi9T8,12678
65
+ palimpzest/query/optimizer/optimizer.py,sha256=mgM6c0d_voGNun2hMzqjfumJVieACtcHsNnBP4LyXAA,19626
66
+ palimpzest/query/optimizer/optimizer_strategy.py,sha256=9YlNGkqwgX0WaV6y8tKOOHVN8kC8GjDI3DttvGW5SYY,10206
67
+ palimpzest/query/optimizer/optimizer_strategy_type.py,sha256=V-MMHvJdnfZKoUX1xxxwh66q1RjN2FL35IsiT1C62c8,1084
68
+ palimpzest/query/optimizer/plan.py,sha256=VIhN7tWT7EoRE9BKYa1qvvOhX7dEaM-aiobByX0qjzg,22900
69
+ palimpzest/query/optimizer/primitives.py,sha256=jMMVq37y1tWiPU1lSSKQP9OP-mzkpSxSmUeDajRYYOQ,5445
70
+ palimpzest/query/optimizer/rules.py,sha256=9AsuVjhiZUc0snQPNhIqeyKpmqFsSv7e-v6BEbp9CDw,43315
71
+ palimpzest/query/optimizer/tasks.py,sha256=DJcKDNbVJox61rnTW0HgT1PtxGx2P_NiLvNroXie-Lg,29509
72
+ palimpzest/query/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
+ palimpzest/query/processor/config.py,sha256=b_EQOqOXoRP6AziOw1iLqb8tlSWP-D1_el3mmrnBDAk,2263
74
+ palimpzest/query/processor/query_processor.py,sha256=W01-2FocN1Jsv58gmEo5ALTIcpLt7D0dmI8kghSCdBk,6291
75
+ palimpzest/query/processor/query_processor_factory.py,sha256=H_2pkcN_aVbNDuMLsvZP2PXARLF9MwoHGAzEWkSNNYM,7866
76
+ palimpzest/schemabuilder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ palimpzest/schemabuilder/schema_builder.py,sha256=QraGp66dcD-ej6Y2mER40o86G9JqlBkL7swkJzjUAIY,7968
78
+ palimpzest/tools/README.md,sha256=56_6LPG80uc0CLVhTBP6I1wgIffNv9cyTr0TmVZqmrM,483
79
+ palimpzest/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
+ palimpzest/tools/allenpdf.py,sha256=fXMOmSDdSSLXDKAPYYJ8k4egtWEBf_Me9Lq9tM3iyoA,1690
81
+ palimpzest/tools/pdfparser.py,sha256=0DOVUZLxYfqjxM8WNEfYcyiXb1qW9BWVIHEB_B_YhWA,9570
82
+ palimpzest/tools/skema_tools.py,sha256=HXUFpjMhbVxZwKKkATeK-FwtlTCawaCbeP-uHntI1Kg,669
83
+ palimpzest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
+ palimpzest/utils/env_helpers.py,sha256=n81KzoJ459pRxo7QmJA7duazwWsfoMGTHc71D2LatFk,334
85
+ palimpzest/utils/hash_helpers.py,sha256=3A8dA7SbXTwnnvZvPVNqqMLlVRhCKyKF_bjNNAu3Exk,334
86
+ palimpzest/utils/model_helpers.py,sha256=Vlu3KIvbc4Usg4iSI2KMFSc-qcdAubWN2CSjZod2czY,2233
87
+ palimpzest/utils/progress.py,sha256=7gucyZr82udMDZitrrkAOSKHZVljE3R2wv9nf5gA5TM,20807
88
+ palimpzest/utils/udfs.py,sha256=LjHic54B1az-rKgNLur0wOpaz2ko_UodjLEJrazkxvY,1854
89
+ palimpzest/validator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
+ palimpzest/validator/validator.py,sha256=J2tGvJqfg6v5lOQDYYaqAa9d37uVHBrqkNs-a8d1Ic0,16365
91
+ palimpzest-0.8.0.dist-info/licenses/LICENSE,sha256=5GUlHy9lr-Py9kvV38FF1m3yy3NqM18fefuE9wkWumo,1079
92
+ palimpzest-0.8.0.dist-info/METADATA,sha256=MUkUorsKFMVGPmCeAZOBruvKP8shJ1kbF5kulxPnSHc,7286
93
+ palimpzest-0.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
94
+ palimpzest-0.8.0.dist-info/top_level.txt,sha256=raV06dJUgohefUn3ZyJS2uqp_Y76EOLA9Y2e_fxt8Ew,11
95
+ palimpzest-0.8.0.dist-info/RECORD,,
@@ -1,141 +0,0 @@
1
- from __future__ import annotations
2
-
3
-
4
class Field:
    """
    A Field is defined by its description and its type. The Field class is subclassed to specify
    that values of the subclass should belong to a specific type.

    For example, if you wanted to define Fields relevant to indexing research papers, you could define a field
    representing the title of a paper, the year it was published, and the journal it was published in:

    ```python
    paper_title = Field(desc="The title of a scientific paper")
    paper_year = Field(desc="The year the paper was published")
    paper_journal = Field(desc="The name of the journal that published the paper")
    ```

    Two fields compare equal when they have the same description and the exact same class;
    equal fields hash to the same value.
    """

    # class-level flag; subclasses that hold image data override it with True
    is_image_field = False

    def __init__(self, desc: str = "") -> None:
        """Store the description; the base class declares no value type (``type`` is None)."""
        self._desc = desc
        self.type = None

    def __str__(self) -> str:
        """Return ``<ClassName>(desc=<description>)``."""
        return f"{self.__class__.__name__}(desc={self._desc})"

    def __hash__(self) -> int:
        # Hash a tuple rather than a concatenated string: this tolerates a non-string
        # description (e.g. None) and cannot collide across the desc/class-name boundary.
        return hash((self.__class__.__name__, self._desc))

    def __eq__(self, other) -> bool:
        """Return True when ``other`` is a Field of the same class with the same description."""
        # Comparing with an unrelated object must not raise AttributeError on `other._desc`;
        # returning NotImplemented lets Python fall back to its default (identity) comparison.
        if not isinstance(other, Field):
            return NotImplemented
        return self._desc == other._desc and self.__class__ == other.__class__

    @property
    def desc(self) -> str:
        """The description passed to the constructor (read-only)."""
        return self._desc

    def json_schema(self) -> dict:
        """Return a dict with the description and the stringified value type."""
        return {"description": self._desc, "type": str(self.type)}
39
-
40
-
41
class BooleanField(Field):
    """Field whose declared value type is ``bool``."""

    def __init__(self, desc: str):
        """Record the description, then declare the value type as ``bool``."""
        # the parent initializer resets `type` to None, so it has to run first
        super().__init__(desc)
        self.type = bool
47
-
48
-
49
class BytesField(Field):
    """Field whose declared value type is ``bytes``."""

    def __init__(self, desc: str):
        """Record the description, then declare the value type as ``bytes``."""
        # the parent initializer resets `type` to None, so it has to run first
        super().__init__(desc)
        self.type = bytes

    def json_schema(self) -> dict[str, str]:
        """Return the description and type plus base64 / octet-stream content markers."""
        schema = {"description": self._desc, "type": str(self.type)}
        # extra keys flag the payload as base64-encoded binary content
        schema["contentEncoding"] = "base64"
        schema["contentMediaType"] = "application/octet-stream"
        return schema
63
-
64
-
65
class CallableField(Field):
    """Field whose declared value type is the plain Python function type."""

    def __init__(self, desc: str):
        """Record the description, then declare the value type as the function type."""
        super().__init__(desc)
        # the function type is taken from a throwaway lambda, so no extra import is needed
        function_type = type(lambda x: x)
        self.type = function_type
71
-
72
-
73
class FloatField(Field):
    """Field whose declared value type is ``float``."""

    def __init__(self, desc: str):
        """Record the description, then declare the value type as ``float``."""
        # the parent initializer resets `type` to None, so it has to run first
        super().__init__(desc)
        self.type = float
79
-
80
-
81
class IntField(Field):
    """Field whose declared value type is ``int``."""

    def __init__(self, desc: str):
        """Record the description, then declare the value type as ``int``."""
        # the parent initializer resets `type` to None, so it has to run first
        super().__init__(desc)
        self.type = int
87
-
88
-
89
class NumericField(Field):
    """Field whose declared value type is the union ``int | float``."""

    def __init__(self, desc: str):
        """Record the description, then declare the value type as ``int | float``."""
        super().__init__(desc)
        # evaluated at runtime (not an annotation), so this needs an interpreter
        # that supports `|` between types
        self.type = int | float
95
-
96
-
97
class StringField(Field):
    """Field whose declared value type is ``str``."""

    def __init__(self, desc: str):
        """Record the description, then declare the value type as ``str``."""
        # the parent initializer resets `type` to None, so it has to run first
        super().__init__(desc)
        self.type = str
103
-
104
-
105
class ImageFilepathField(StringField):
    """An ImageFilepathField is a StringField that contains the filepath to an image.

    The constructor is inherited unchanged from ``StringField`` (``desc`` is required);
    the previous override only forwarded ``desc`` to the parent and added nothing.
    """

    # marks the field as image-bearing, overriding the inherited class-level flag
    is_image_field = True
111
-
112
-
113
class ImageURLField(StringField):
    """An ImageURLField is a StringField that contains the publicly accessible URL for an image.

    The constructor is inherited unchanged from ``StringField`` (``desc`` is required);
    the previous override only forwarded ``desc`` to the parent and added nothing.
    """

    # marks the field as image-bearing, overriding the inherited class-level flag
    is_image_field = True
119
-
120
-
121
class ImageBase64Field(BytesField):
    """An ImageBase64Field is a BytesField that contains a base64 encoded image.

    The constructor is inherited unchanged from ``BytesField`` (``desc`` is required);
    the previous override only forwarded ``desc`` to the parent and added nothing.
    """

    # marks the field as image-bearing, overriding the inherited class-level flag
    is_image_field = True
127
-
128
-
129
- ### fields which are class factories that produce other field types ###
130
class ListField(Field):
    """Factory for Field subclasses that describe a list of elements of a given field type.

    Calling ``ListField(element_type, desc)`` does NOT return a ``ListField`` instance: it
    returns a brand-new class named ``List[<element type name>]`` that derives directly from
    ``Field`` and carries the element type, image flag, ``list`` type and description as
    class attributes.
    """

    def __new__(cls, element_type: type[Field], desc: str | None = None) -> type[Field]:
        """Build and return the ``List[...]`` field class.

        Args:
            element_type: the Field *class* (not an instance) of the list's elements; its
                ``__name__`` and ``is_image_field`` attributes are read.
            desc: description stored on the generated class as ``_desc``.

        Returns:
            A new subclass of ``Field``. Because the result is not an instance of ``cls``,
            Python does not run ``__init__`` on it.
        """
        # NOTE(review): instantiating the generated class runs Field.__init__, which assigns
        # instance-level `_desc` and `type` that shadow the class attributes set here —
        # confirm callers only use the class itself.
        attrs = {
            "element_type": element_type,
            "is_image_field": element_type.is_image_field,
            "type": list,
            "_desc": desc,
        }

        return type(f"List[{element_type.__name__}]", (Field,), attrs)
@@ -1,28 +0,0 @@
1
- ### CODE SYNTHESIS PROMPTS ###
2
# Template for one worked example. Placeholders: {idx}, {example_inputs}, {example_output}
# (brace-style, presumably filled with str.format — verify against the caller).
EXAMPLE_PROMPT = (
    "Example{idx}:\n"
    "Example Input\n"
    "-------------\n"
    "{example_inputs}\n"
    "\n"
    "Example Output\n"
    "--------------\n"
    "{example_output}\n"
)
11
-
12
# Template asking the model to implement a function. Placeholders: {language}, {api},
# {output}, {output_desc}, {inputs_desc}, {examples_desc}, {advice}
# (brace-style, presumably filled with str.format — verify against the caller).
CODEGEN_PROMPT = (
    "You are a helpful programming assistant and an expert {language} programmer. Implement the {language} function `{api}` that extracts `{output}` ({output_desc}) from given inputs:\n"
    "{inputs_desc}\n"
    "{examples_desc}\n"
    "Notice that the evaluation will severely punish incorrect outputs. Thus, when the function is uncertain, it should return `None` to abstain instead of returning an incorrect guess.\n"
    "{advice}\n"
    "Return the implementation only."
)
18
-
19
# Template asking the model for implementation ideas. Placeholders: {language}, {n},
# {output}, {output_desc}, {examples_desc}
# (brace-style, presumably filled with str.format — verify against the caller).
ADVICEGEN_PROMPT = (
    "You are a helpful programming assistant and an expert {language} programmer. Your job is to provide programming ideas to help me write {language} programs.\n"
    'For example, if I want to complete a task: "extract the salary number (in USD) from a given employee\'s document", you can provide me with {n} different ways to do it like:\n'
    "Idea 1: Use regular expressions to extract the salary number: a number with a dollar sign in front of it. For example, $100,000.\n"
    "Idea 2: Find the table entry with the salary number.\n"
    "Idea 3: Use a pre-trained NLP model to extract the salary number.\n"
    "#\n"
    "Now, consider the following {language} programming task that extracts `{output}` ({output_desc}) from given inputs:\n"
    "{examples_desc}\n"
    "Please provide me with {n} different ideas to complete this task. Return the ideas only, following the format above.\n"
)