palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.20.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
palimpzest/core/data/context.py
@@ -0,0 +1,393 @@
+from __future__ import annotations
+
+import os
+import re
+from abc import ABC
+from typing import Callable
+
+from pydantic import BaseModel
+from smolagents import CodeAgent, LiteLLMModel
+
+from palimpzest.core.data import context_manager
+from palimpzest.core.data.dataset import Dataset
+from palimpzest.core.lib.schemas import create_schema_from_fields, union_schemas
+from palimpzest.query.operators.logical import ComputeOperator, ContextScan, LogicalOperator, SearchOperator
+from palimpzest.utils.hash_helpers import hash_for_id
+
+PZ_INSTRUCTION = """\n\nYou are a CodeAgent who is a specialist at writing declarative AI programs with the Palimpzest (PZ) library.
+
+Palimpzest is a programming framework which provides you with **semantic operators** (e.g. semantic maps, semantic filters, etc.)
+which are like their traditional counterparts, except they can execute instructions provided in natural language.
+
+For example, if you wanted to write a program to extract the title and abstract from a directory of papers,
+you could write the following in PZ:
+```
+import palimpzest as pz
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Define columns for semantic map (sem_map) operation; each column is specified
+# with a dictionary containing the following keys:
+# - "name": the name of the field to compute
+# - "type": the type of the field to compute
+# - "description": the natural language description of the field
+paper_cols = [
+    {"name": "title", "type": str, "description": "the title of the paper"},
+    {"name": "abstract", "type": str, "description": "the paper's abstract"},
+]
+
+# construct the data processing pipeline with PZ
+ds = pz.TextFileDataset(id="papers", path="path/to/papers")
+ds = ds.sem_map(cols)
+
+# optimize and execute the PZ program
+validator = pz.Validator()
+config = pz.QueryProcessorConfig(
+    policy=pz.MaxQuality(),
+    execution_strategy="parallel",
+    max_workers=20,
+    progress=True,
+)
+output = ds.optimize_and_run(config=config, validator=validator)
+
+# write the execution stats to json
+output.execution_stats.to_json("pz_program_stats.json")
+
+# write the output to a CSV and print the output CSV filepath so the user knows where to find it
+output_filepath = "pz_program_output.csv"
+output.to_df().to_csv(output_filepath, index=False)
+print(f"Results at: {output_filepath}")
+```
+
+To initialize a dataset in PZ, simply provide the path to a directory to `pz.TextFileDirectory()`
+(if your data contains text-based files). For example:
+```
+import palimpzest as pz
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+ds = pz.TextFileDataset(id="files", path="path/to/files")
+```
+
+Palimpzest has two primary **semantic operators** which you can use to construct data processing pipelines:
+- sem_filter(predicate: str): executes a semantic filter specified by the natural language predicate on a given PZ dataset
+- sem_map(cols: list[dict]): executes a semantic map to compute the `cols` on a given PZ dataset
+
+As a second example, consider the following PZ program which filters for papers about batteries that are from MIT
+and computes a summary for each one:
+```
+import palimpzest as pz
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# construct the PZ program
+ds = pz.TextFileDataset(id="papers", path="path/to/research-papers")
+ds = ds.sem_filter("The paper is about batteries")
+ds = ds.sem_filter("The paper is from MIT")
+ds = ds.sem_map([{"name": "summary", "type": str, "description": "A summary of the paper"}])
+
+# optimize and execute the PZ program
+validator = pz.Validator()
+config = pz.QueryProcessorConfig(
+    policy=pz.MaxQuality(),
+    execution_strategy="parallel",
+    max_workers=20,
+    progress=True,
+)
+output = ds.optimize_and_run(config=config, validator=validator)
+
+# write the execution stats to json
+output.execution_stats.to_json("pz_program_stats.json")
+
+# write the output to a CSV and print the output CSV filepath so the user knows where to find it
+output_filepath = "pz_program_output.csv"
+output.to_df().to_csv(output_filepath, index=False)
+print(f"Results at: {output_filepath}")
+```
+
+Be sure to always:
+- execute your program using the `.optimize_and_run()` format shown above
+- call `output.execution_stats.to_json("pz_program_stats.json")` to write execution statistics to disk
+- write your output to CSV and print where you wrote it!
+"""
+
+class Context(Dataset, ABC):
+    """
+    The `Context` class is an abstract base class for root `Datasets` whose data is accessed
+    via user-defined methods. Classes which inherit from this class must implement two methods:
+
+    - `list_filepaths()`: which lists the files that the `Context` has access to.
+    - `read_filepath(path: str)`: which reads the file corresponding to the given `path`.
+
+    A `Context` is a special type of `Dataset` that represents a view over an underlying `Dataset`.
+    Each `Context` has a `name` which uniquely identifies it, as well as a natural language `description`
+    of the data / computation that the `Context` represents. Similar to `Dataset`s, `Context`s can be
+    lazily transformed using functions such as `sem_filter`, `sem_map`, `sem_join`, etc., and they may
+    be materialized or unmaterialized.
+    """
+
+    def __init__(
+        self,
+        id: str,
+        description: str,
+        operator: LogicalOperator,
+        schema: type[BaseModel] | None = None,
+        sources: list[Context] | Context | None = None,
+        materialized: bool = False,
+    ) -> None:
+        """
+        Constructor for the `Context` class.
+
+        Args:
+            id (`str`): a string identifier for the `Context`
+            description (`str`): the description of the data contained within the `Context`
+            operator (`LogicalOperator`): The `LogicalOperator` used to compute this `Context`.
+            schema: (`type[BaseModel] | None`): The schema of this `Context`.
+            sources (`list[Context] | Context | None`): The (list of) `Context(s)` which are input(s) to
+                the operator used to compute this `Context`.
+            materialized (`bool`): True if the `Context` has been computed, False otherwise
+        """
+        # set the description
+        self._description = description
+
+        # set the materialization status
+        self._materialized = materialized
+
+        # compute schema and call parent constructor
+        if schema is None:
+            schema = create_schema_from_fields([{"name": "context", "description": "The context", "type": str}])
+        super().__init__(sources=sources, operator=operator, schema=schema, id=id)
+
+        # set the tools associated with this Context
+        self._tools = [getattr(self, attr) for attr in dir(self) if attr.startswith("tool_")]
+
+        # add Context to ContextManager
+        cm = context_manager.ContextManager()
+        cm.add_context(self)
+
+    @property
+    def description(self) -> str:
+        """The string containing all of the information computed for this `Context`"""
+        return self._description
+
+    @property
+    def materialized(self) -> bool:
+        """The boolean which specifies whether the `Context` has been computed or not"""
+        return self._materialized
+
+    @property
+    def tools(self) -> list[Callable]:
+        """The list of tools associated with this `Context`"""
+        return self._tools
+
+    def __str__(self) -> str:
+        return f"Context(id={self.id}, description={self.description:20s}, materialized={self.materialized})"
+
+    def set_description(self, description: str) -> None:
+        """
+        Update the context's description.
+        """
+        self._description = description
+
+    def set_materialized(self, materialized: str) -> None:
+        """
+        Update the context's materialization status.
+        """
+        self._materialized = materialized
+
+    def compute(self, instruction: str) -> Context:
+        # construct new description and output schema
+        new_id = hash_for_id(instruction)
+        new_description = f"Parent Context ID: {self.id}\n\nThis Context is the result of computing the following instruction on the parent context.\n\nINSTRUCTION: {instruction}\n\n"
+        inter_schema = create_schema_from_fields([{"name": f"result-{new_id}", "desc": "The result from computing the instruction on the input Context", "type": str}])
+        new_output_schema = union_schemas([self.schema, inter_schema])
+
+        # construct logical operator
+        operator = ComputeOperator(
+            input_schema=self.schema,
+            output_schema=new_output_schema,
+            context_id=new_id,
+            instruction=instruction,
+        )
+
+        return Context(id=new_id, description=new_description, operator=operator, sources=[self], materialized=False)
+
+    def search(self, search_query: str) -> Context:
+        # construct new description and output schema
+        new_id = hash_for_id(search_query)
+        new_description = f"Parent Context ID: {self.id}\n\nThis Context is the result of searching the parent context for information related to the following query.\n\nSEARCH QUERY: {search_query}\n\n"
+
+        # construct logical operator
+        operator = SearchOperator(
+            input_schema=self.schema,
+            output_schema=self.schema,
+            context_id=new_id,
+            search_query=search_query,
+        )
+
+        return Context(id=new_id, description=new_description, operator=operator, sources=[self], materialized=False)
+
+class TextFileContext(Context):
+    def __init__(self, path: str, id: str, description: str) -> None:
+        """
+        Constructor for the `TextFileContext` class.
+
+        Args:
+            path (str): The path to the file
+            id (str): a string identifier for the `Context`
+            description (str): The description of the data contained within the `Context`
+            kwargs (dict): Keyword arguments containing the `Context's` id and description.
+        """
+        # check that path is a valid file or directory
+        assert os.path.isfile(path) or os.path.isdir(path), f"Path {path} is not a file nor a directory"
+
+        # get list of filepaths
+        self.filepaths = []
+        if os.path.isfile(path):
+            self.filepaths = [path]
+        else:
+            self.filepaths = []
+            for root, _, files in os.walk(path):
+                for file in files:
+                    fp = os.path.join(root, file)
+                    self.filepaths.append(fp)
+            self.filepaths = sorted(self.filepaths)
+
+        # call parent constructor to set id, operator, and schema
+        schema = create_schema_from_fields([{"name": "context", "desc": "The context", "type": str}])
+        super().__init__(
+            id=id,
+            description=description,
+            operator=ContextScan(context=self, output_schema=schema),
+            schema=schema,
+            materialized=True,
+        )
+    def _check_filter_answer_text(self, answer_text: str) -> dict | None:
+        """
+        Return {"passed_operator": True} if and only if "true" is in the answer text.
+        Return {"passed_operator": False} if and only if "false" is in the answer text.
+        Otherwise, return None.
+        """
+        # NOTE: we may be able to eliminate this condition by specifying this JSON output in the prompt;
+        # however, that would also need to coincide with a change to allow the parse_answer_fn to set "passed_operator"
+        if "true" in answer_text.lower():
+            return {"passed_operator": True}
+        elif "false" in answer_text.lower():
+            return {"passed_operator": False}
+        elif "yes" in answer_text.lower():
+            return {"passed_operator": True}
+
+        return None
+
+    def _parse_filter_answer(self, completion_text: str) -> dict[str, list]:
+        """Extract the answer from the completion object for filter operations."""
+        # if the model followed the default instructions, the completion text will place
+        # its answer between "ANSWER:" and "---"
+        regex = re.compile("answer:(.*?)---", re.IGNORECASE | re.DOTALL)
+        matches = regex.findall(completion_text)
+        if len(matches) > 0:
+            answer_text = matches[0].strip()
+            field_answers = self._check_filter_answer_text(answer_text)
+            if field_answers is not None:
+                return field_answers
+
+        # if the first regex didn't find an answer, try taking all the text after "ANSWER:"
+        regex = re.compile("answer:(.*)", re.IGNORECASE | re.DOTALL)
+        matches = regex.findall(completion_text)
+        if len(matches) > 0:
+            answer_text = matches[0].strip()
+            field_answers = self._check_filter_answer_text(answer_text)
+            if field_answers is not None:
+                return field_answers
+
+        # finally, try taking all of the text; throw an exception if this doesn't work
+        field_answers = self._check_filter_answer_text(completion_text)
+        if field_answers is None:
+            raise Exception(f"Could not parse answer from completion text: {completion_text}")
+
+        return field_answers
+
+    # def tool_list_filepaths(self) -> list[str]:
+    #     """
+    #     This tool returns the list of all of the filepaths which the `Context` has access to.
+
+    #     Args:
+    #         None
+
+    #     Returns:
+    #         list[str]: A list of file paths for all files in the `Context`.
+    #     """
+    #     return self.filepaths
+
+    # def tool_read_filepath(self, path: str) -> str:
+    #     """
+    #     This tool takes a filepath (`path`) as input and returns the content of the file as a string.
+    #     It handles both CSV files and html / regular text files. It does not handle images.
+
+    #     Args:
+    #         path (str): The path to the file to read.
+
+    #     Returns:
+    #         str: The content of the file as a string.
+    #     """
+    #     if path.endswith(".csv"):
+    #         return pd.read_csv(path, encoding="ISO-8859-1").to_string(index=False)
+
+    #     with open(path, encoding='utf-8') as file:
+    #         content = file.read()
+
+    #     return content
+
+    def tool_execute_semantic_operators(self, instruction: str) -> str:
+        """
+        This tool takes an `instruction` as input and invokes an expert to write a semantic data processing pipeline
+        to execute the instruction. The tool returns the path to a CSV file which contains the output of the pipeline.
+
+        For example, the tool could be invoked as follows to extract the title and abstract from a dataset of research papers:
+        ```
+        instruction = "Write a program to extract the title and abstract from each research paper"
+        result_csv_filepath = tool_execute_semantic_operators(instruction)
+        ```
+
+        Args:
+            instruction: The instruction specifying the semantic data processing pipeline that you need to execute.
+
+        Returns:
+            str: the filepath to the CSV containing the output from running the data processing pipeline.
+        """
+        from smolagents import tool
+        @tool
+        def tool_list_filepaths() -> list[str]:
+            """
+            This tool returns the list of all of the filepaths which the `Context` has access to.
+
+            NOTE: You may want to execute this before writing your PZ program to determine where the data lives.
+
+            Args:
+                None
+
+            Returns:
+                list[str]: A list of file paths for all files in the `Context`.
+            """
+            return self.filepaths
+
+        agent = CodeAgent(
+            model=LiteLLMModel(model_id="openai/o1", api_key=os.getenv("ANTHROPIC_API_KEY")),
+            tools=[tool_list_filepaths],
+            max_steps=20,
+            planning_interval=4,
+            add_base_tools=False,
+            return_full_result=True,
+            additional_authorized_imports=["dotenv", "json", "palimpzest", "pandas"],
+            instructions=PZ_INSTRUCTION,
+        )
+        result = agent.run(instruction)
+        response = result.output
+
+        return response
palimpzest/core/data/context_manager.py
@@ -0,0 +1,163 @@
+from __future__ import annotations
+
+import os
+import pickle
+
+import chromadb
+import chromadb.utils.embedding_functions as embedding_functions
+import tiktoken
+
+from palimpzest.constants import PZ_DIR
+from palimpzest.core.data import context
+
+
+class ContextNotFoundError(Exception):
+    pass
+
+
+class ContextManager:
+    """
+    This class manages the long-term storage of `Contexts`. Each new `Context` is added to
+    the `ContextManager` and serialized to disk. `Contexts` are also indexed, which enables
+    PZ to search for `Context(s)` which may support `search()` and `compute()` operations.
+    """
+    def __init__(self):
+        # create directory with serialized contexts (if it doesn't already exist)
+        self.context_dir = os.path.join(PZ_DIR, "contexts")
+        os.makedirs(self.context_dir, exist_ok=True)
+
+        # create vector store (if it doesn't already exist)
+        self.chroma_dir = os.path.join(PZ_DIR, "chroma")
+        os.makedirs(self.chroma_dir, exist_ok=True)
+        self.chroma_client = chromadb.PersistentClient(self.chroma_dir)
+
+        # pick embedding function based on presence of API key(s)
+        self.emb_fn = None
+        if os.getenv("OPENAI_API_KEY"):
+            self.emb_fn = embedding_functions.OpenAIEmbeddingFunction(
+                api_key=os.getenv("OPENAI_API_KEY"),
+                model_name="text-embedding-3-small"
+            )
+
+        self.index = self.chroma_client.get_or_create_collection("contexts", embedding_function=self.emb_fn)
+
+    @staticmethod
+    def from_pkl(path: str) -> context.Context:
+        """Load a `Context` from its serialized pickle file."""
+        with open(path, "rb") as f:
+            context = pickle.load(f)
+
+        return context
+
+    @staticmethod
+    def to_pkl(context: context.Context, path: str) -> None:
+        """Write the given `Context` to a pickle file at the provided `path`."""
+        with open(path, "wb") as f:
+            pickle.dump(context, f)
+
+    def num_tokens_from_string(self, string: str, encoding_name: str) -> int:
+        """Returns the number of tokens in a text string."""
+        encoding = tiktoken.get_encoding(encoding_name)
+        num_tokens = len(encoding.encode(string))
+        return num_tokens
+
+    def add_context(self, context: context.Context, update: bool = False) -> None:
+        """
+        Add the new `Context` to the `ContextManager` by serializing and writing it to disk.
+
+        Args:
+            context (`Context`): the context to add to the `ContextManager`
+            update (`bool`): whether or not to update an existing context
+
+        TODO: track cost
+        """
+        # return early if the context already exists and we're not performing an update
+        id = context.id
+        context_path = os.path.join(self.context_dir, f"{id}.pkl")
+        if os.path.exists(context_path) and update is False:
+            return
+
+        # write the context to disk
+        ContextManager.to_pkl(context, context_path)
+
+        # compute number of tokens in context.description
+        description = context.description
+        while self.num_tokens_from_string(description, "cl100k_base") > 8192:
+            description = description[:int(0.9*len(description))]
+
+        # add context to vector store
+        context_embeddings = self.emb_fn([description])
+        context_payload = {
+            "ids": [context.id],
+            "embeddings": context_embeddings,
+            "metadatas": [{"id": context.id, "materialized": context.materialized}],
+            "documents": [context.description],
+        }
+        if update:
+            self.index.update(**context_payload)
+        else:
+            self.index.add(**context_payload)
+
+    def update_context(self, id: str, description: str, materialized: bool = True) -> None:
+        """
+        Update an existing `Context` with the given `id` to have the given `description`.
+
+        Args:
+            id (str): the id of the updated `Context`
+            description (str): the update to the description for the specified `Context`
+            materialized (bool): boolean to set the materialization status of the `Context`
+
+        Raises:
+            ContextNotFoundError: if the given `id` doesn't point to a `Context` in the `ContextManger`.
+        """
+        context = self.get_context(id)
+        new_description = context.description + description # TODO: should description have RESULT replaced on update? as opposed to appending? should description be some pydantic BaseModel?
+        context.set_description(new_description)
+        context.set_materialized(materialized)
+        self.add_context(context, update=True)
+
+    def get_context(self, id: str) -> context.Context:
+        """
+        Returns the `Context` specified by the given `id`.
+
+        Args:
+            id (str): the id of the retrieved `Context`
+
+        Returns:
+            `Context`: the specified `Context`.
+        """
+        context_path = os.path.join(self.context_dir, f"{id}.pkl")
+        try:
+            return ContextManager.from_pkl(context_path)
+        except FileNotFoundError as err:
+            raise ContextNotFoundError from err
+
+    def search_context(self, query: str, k: int = 1, where: dict | None = None) -> list[context.Context]:
+        """
+        Returns the top-k most relevant `Context(s)` for the given query. If provided,
+        the where dictionary will be used to filter the search results.
+
+        TODO:
+        3) update CostModel to account for benefit of using existing Context(s)
+        ---
+        4) unit test
+        5) track cost
+        """
+        # embed the search query
+        query_embeddings = self.emb_fn([query])
+
+        # look up ids of most similar contexts
+        results = self.index.query(
+            query_embeddings=query_embeddings,
+            n_results=k,
+            where=where,
+        )
+        ids = results["ids"][0]
+
+        # load and return Context objects
+        contexts = []
+        for id in ids:
+            context_path = os.path.join(self.context_dir, f"{id}.pkl")
+            contexts.append(ContextManager.from_pkl(context_path))
+
+        return contexts