palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.21.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
palimpzest/core/data/context.py
@@ -0,0 +1,393 @@
+ from __future__ import annotations
+
+ import os
+ import re
+ from abc import ABC
+ from typing import Callable
+
+ from pydantic import BaseModel
+ from smolagents import CodeAgent, LiteLLMModel
+
+ from palimpzest.core.data import context_manager
+ from palimpzest.core.data.dataset import Dataset
+ from palimpzest.core.lib.schemas import create_schema_from_fields, union_schemas
+ from palimpzest.query.operators.logical import ComputeOperator, ContextScan, LogicalOperator, SearchOperator
+ from palimpzest.utils.hash_helpers import hash_for_id
+
+ PZ_INSTRUCTION = """\n\nYou are a CodeAgent who is a specialist at writing declarative AI programs with the Palimpzest (PZ) library.
+
+ Palimpzest is a programming framework which provides you with **semantic operators** (e.g. semantic maps, semantic filters, etc.)
+ which are like their traditional counterparts, except they can execute instructions provided in natural language.
+
+ For example, if you wanted to write a program to extract the title and abstract from a directory of papers,
+ you could write the following in PZ:
+ ```
+ import palimpzest as pz
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # Define columns for semantic map (sem_map) operation; each column is specified
+ # with a dictionary containing the following keys:
+ # - "name": the name of the field to compute
+ # - "type": the type of the field to compute
+ # - "description": the natural language description of the field
+ paper_cols = [
+     {"name": "title", "type": str, "description": "the title of the paper"},
+     {"name": "abstract", "type": str, "description": "the paper's abstract"},
+ ]
+
+ # construct the data processing pipeline with PZ
+ ds = pz.TextFileDataset(id="papers", path="path/to/papers")
+ ds = ds.sem_map(paper_cols)
+
+ # optimize and execute the PZ program
+ validator = pz.Validator()
+ config = pz.QueryProcessorConfig(
+     policy=pz.MaxQuality(),
+     execution_strategy="parallel",
+     max_workers=20,
+     progress=True,
+ )
+ output = ds.optimize_and_run(config=config, validator=validator)
+
+ # write the execution stats to json
+ output.execution_stats.to_json("pz_program_stats.json")
+
+ # write the output to a CSV and print the output CSV filepath so the user knows where to find it
+ output_filepath = "pz_program_output.csv"
+ output.to_df().to_csv(output_filepath, index=False)
+ print(f"Results at: {output_filepath}")
+ ```
+
+ To initialize a dataset in PZ, simply provide the path to a directory to `pz.TextFileDataset()`
+ (if your data contains text-based files). For example:
+ ```
+ import palimpzest as pz
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ ds = pz.TextFileDataset(id="files", path="path/to/files")
+ ```
+
+ Palimpzest has two primary **semantic operators** which you can use to construct data processing pipelines:
+ - sem_filter(predicate: str): executes a semantic filter specified by the natural language predicate on a given PZ dataset
+ - sem_map(cols: list[dict]): executes a semantic map to compute the `cols` on a given PZ dataset
+
+ As a second example, consider the following PZ program which filters for papers about batteries that are from MIT
+ and computes a summary for each one:
+ ```
+ import palimpzest as pz
+ from dotenv import load_dotenv
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # construct the PZ program
+ ds = pz.TextFileDataset(id="papers", path="path/to/research-papers")
+ ds = ds.sem_filter("The paper is about batteries")
+ ds = ds.sem_filter("The paper is from MIT")
+ ds = ds.sem_map([{"name": "summary", "type": str, "description": "A summary of the paper"}])
+
+ # optimize and execute the PZ program
+ validator = pz.Validator()
+ config = pz.QueryProcessorConfig(
+     policy=pz.MaxQuality(),
+     execution_strategy="parallel",
+     max_workers=20,
+     progress=True,
+ )
+ output = ds.optimize_and_run(config=config, validator=validator)
+
+ # write the execution stats to json
+ output.execution_stats.to_json("pz_program_stats.json")
+
+ # write the output to a CSV and print the output CSV filepath so the user knows where to find it
+ output_filepath = "pz_program_output.csv"
+ output.to_df().to_csv(output_filepath, index=False)
+ print(f"Results at: {output_filepath}")
+ ```
+
+ Be sure to always:
+ - execute your program using the `.optimize_and_run()` format shown above
+ - call `output.execution_stats.to_json("pz_program_stats.json")` to write execution statistics to disk
+ - write your output to CSV and print where you wrote it!
+ """
+
+ class Context(Dataset, ABC):
+     """
+     The `Context` class is an abstract base class for root `Datasets` whose data is accessed
+     via user-defined methods. Classes which inherit from this class must implement two methods:
+
+     - `list_filepaths()`: which lists the files that the `Context` has access to.
+     - `read_filepath(path: str)`: which reads the file corresponding to the given `path`.
+
+     A `Context` is a special type of `Dataset` that represents a view over an underlying `Dataset`.
+     Each `Context` has an `id` which uniquely identifies it, as well as a natural language `description`
+     of the data / computation that the `Context` represents. Similar to `Dataset`s, `Context`s can be
+     lazily transformed using functions such as `sem_filter`, `sem_map`, `sem_join`, etc., and they may
+     be materialized or unmaterialized.
+     """
+
+     def __init__(
+         self,
+         id: str,
+         description: str,
+         operator: LogicalOperator,
+         schema: type[BaseModel] | None = None,
+         sources: list[Context] | Context | None = None,
+         materialized: bool = False,
+     ) -> None:
+         """
+         Constructor for the `Context` class.
+
+         Args:
+             id (`str`): a string identifier for the `Context`
+             description (`str`): the description of the data contained within the `Context`
+             operator (`LogicalOperator`): The `LogicalOperator` used to compute this `Context`.
+             schema (`type[BaseModel] | None`): The schema of this `Context`.
+             sources (`list[Context] | Context | None`): The (list of) `Context(s)` which are input(s) to
+                 the operator used to compute this `Context`.
+             materialized (`bool`): True if the `Context` has been computed, False otherwise
+         """
+         # set the description
+         self._description = description
+
+         # set the materialization status
+         self._materialized = materialized
+
+         # compute schema and call parent constructor
+         if schema is None:
+             schema = create_schema_from_fields([{"name": "context", "description": "The context", "type": str}])
+         super().__init__(sources=sources, operator=operator, schema=schema, id=id)
+
+         # set the tools associated with this Context
+         self._tools = [getattr(self, attr) for attr in dir(self) if attr.startswith("tool_")]
+
+         # add Context to ContextManager
+         cm = context_manager.ContextManager()
+         cm.add_context(self)
+
+     @property
+     def description(self) -> str:
+         """The string containing all of the information computed for this `Context`"""
+         return self._description
+
+     @property
+     def materialized(self) -> bool:
+         """The boolean which specifies whether the `Context` has been computed or not"""
+         return self._materialized
+
+     @property
+     def tools(self) -> list[Callable]:
+         """The list of tools associated with this `Context`"""
+         return self._tools
+
+     def __str__(self) -> str:
+         return f"Context(id={self.id}, description={self.description:20s}, materialized={self.materialized})"
+
+     def set_description(self, description: str) -> None:
+         """
+         Update the context's description.
+         """
+         self._description = description
+
+     def set_materialized(self, materialized: bool) -> None:
+         """
+         Update the context's materialization status.
+         """
+         self._materialized = materialized
+
+     def compute(self, instruction: str) -> Context:
+         # construct new description and output schema
+         new_id = hash_for_id(instruction)
+         new_description = f"Parent Context ID: {self.id}\n\nThis Context is the result of computing the following instruction on the parent context.\n\nINSTRUCTION: {instruction}\n\n"
+         inter_schema = create_schema_from_fields([{"name": f"result-{new_id}", "desc": "The result from computing the instruction on the input Context", "type": str}])
+         new_output_schema = union_schemas([self.schema, inter_schema])
+
+         # construct logical operator
+         operator = ComputeOperator(
+             input_schema=self.schema,
+             output_schema=new_output_schema,
+             context_id=new_id,
+             instruction=instruction,
+         )
+
+         return Context(id=new_id, description=new_description, operator=operator, sources=[self], materialized=False)
+
+     def search(self, search_query: str) -> Context:
+         # construct new description and output schema
+         new_id = hash_for_id(search_query)
+         new_description = f"Parent Context ID: {self.id}\n\nThis Context is the result of searching the parent context for information related to the following query.\n\nSEARCH QUERY: {search_query}\n\n"
+
+         # construct logical operator
+         operator = SearchOperator(
+             input_schema=self.schema,
+             output_schema=self.schema,
+             context_id=new_id,
+             search_query=search_query,
+         )
+
+         return Context(id=new_id, description=new_description, operator=operator, sources=[self], materialized=False)
+
+ class TextFileContext(Context):
+     def __init__(self, path: str, id: str, description: str) -> None:
+         """
+         Constructor for the `TextFileContext` class.
+
+         Args:
+             path (str): The path to the file or directory
+             id (str): a string identifier for the `Context`
+             description (str): The description of the data contained within the `Context`
+             kwargs (dict): Keyword arguments containing the `Context's` id and description.
+         """
+         # check that path is a valid file or directory
+         assert os.path.isfile(path) or os.path.isdir(path), f"Path {path} is not a file nor a directory"
+
+         # get list of filepaths
+         self.filepaths = []
+         if os.path.isfile(path):
+             self.filepaths = [path]
+         else:
+             self.filepaths = []
+             for root, _, files in os.walk(path):
+                 for file in files:
+                     fp = os.path.join(root, file)
+                     self.filepaths.append(fp)
+         self.filepaths = sorted(self.filepaths)
+
+         # call parent constructor to set id, operator, and schema
+         schema = create_schema_from_fields([{"name": "context", "desc": "The context", "type": str}])
+         super().__init__(
+             id=id,
+             description=description,
+             operator=ContextScan(context=self, output_schema=schema),
+             schema=schema,
+             materialized=True,
+         )
+     def _check_filter_answer_text(self, answer_text: str) -> dict | None:
+         """
+         Return {"passed_operator": True} if and only if "true" is in the answer text.
+         Return {"passed_operator": False} if and only if "false" is in the answer text.
+         Otherwise, return None.
+         """
+         # NOTE: we may be able to eliminate this condition by specifying this JSON output in the prompt;
+         # however, that would also need to coincide with a change to allow the parse_answer_fn to set "passed_operator"
+         if "true" in answer_text.lower():
+             return {"passed_operator": True}
+         elif "false" in answer_text.lower():
+             return {"passed_operator": False}
+         elif "yes" in answer_text.lower():
+             return {"passed_operator": True}
+
+         return None
+
+     def _parse_filter_answer(self, completion_text: str) -> dict[str, list]:
+         """Extract the answer from the completion object for filter operations."""
+         # if the model followed the default instructions, the completion text will place
+         # its answer between "ANSWER:" and "---"
+         regex = re.compile("answer:(.*?)---", re.IGNORECASE | re.DOTALL)
+         matches = regex.findall(completion_text)
+         if len(matches) > 0:
+             answer_text = matches[0].strip()
+             field_answers = self._check_filter_answer_text(answer_text)
+             if field_answers is not None:
+                 return field_answers
+
+         # if the first regex didn't find an answer, try taking all the text after "ANSWER:"
+         regex = re.compile("answer:(.*)", re.IGNORECASE | re.DOTALL)
+         matches = regex.findall(completion_text)
+         if len(matches) > 0:
+             answer_text = matches[0].strip()
+             field_answers = self._check_filter_answer_text(answer_text)
+             if field_answers is not None:
+                 return field_answers
+
+         # finally, try taking all of the text; throw an exception if this doesn't work
+         field_answers = self._check_filter_answer_text(completion_text)
+         if field_answers is None:
+             raise Exception(f"Could not parse answer from completion text: {completion_text}")
+
+         return field_answers
+
+     # def tool_list_filepaths(self) -> list[str]:
+     #     """
+     #     This tool returns the list of all of the filepaths which the `Context` has access to.
+
+     #     Args:
+     #         None
+
+     #     Returns:
+     #         list[str]: A list of file paths for all files in the `Context`.
+     #     """
+     #     return self.filepaths
+
+     # def tool_read_filepath(self, path: str) -> str:
+     #     """
+     #     This tool takes a filepath (`path`) as input and returns the content of the file as a string.
+     #     It handles both CSV files and html / regular text files. It does not handle images.
+
+     #     Args:
+     #         path (str): The path to the file to read.
+
+     #     Returns:
+     #         str: The content of the file as a string.
+     #     """
+     #     if path.endswith(".csv"):
+     #         return pd.read_csv(path, encoding="ISO-8859-1").to_string(index=False)
+
+     #     with open(path, encoding='utf-8') as file:
+     #         content = file.read()
+
+     #     return content
+
+     def tool_execute_semantic_operators(self, instruction: str) -> str:
+         """
+         This tool takes an `instruction` as input and invokes an expert to write a semantic data processing pipeline
+         to execute the instruction. The tool returns the path to a CSV file which contains the output of the pipeline.
+
+         For example, the tool could be invoked as follows to extract the title and abstract from a dataset of research papers:
+         ```
+         instruction = "Write a program to extract the title and abstract from each research paper"
+         result_csv_filepath = tool_execute_semantic_operators(instruction)
+         ```
+
+         Args:
+             instruction: The instruction specifying the semantic data processing pipeline that you need to execute.
+
+         Returns:
+             str: the filepath to the CSV containing the output from running the data processing pipeline.
+         """
+         from smolagents import tool
+         @tool
+         def tool_list_filepaths() -> list[str]:
+             """
+             This tool returns the list of all of the filepaths which the `Context` has access to.
+
+             NOTE: You may want to execute this before writing your PZ program to determine where the data lives.
+
+             Args:
+                 None
+
+             Returns:
+                 list[str]: A list of file paths for all files in the `Context`.
+             """
+             return self.filepaths
+
+         agent = CodeAgent(
+             model=LiteLLMModel(model_id="openai/o1", api_key=os.getenv("ANTHROPIC_API_KEY")),
+             tools=[tool_list_filepaths],
+             max_steps=20,
+             planning_interval=4,
+             add_base_tools=False,
+             return_full_result=True,
+             additional_authorized_imports=["dotenv", "json", "palimpzest", "pandas"],
+             instructions=PZ_INSTRUCTION,
+         )
+         result = agent.run(instruction)
+         response = result.output
+
+         return response
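
To see how the pieces above fit together, here is a rough usage sketch (not taken from the package itself) of the new `Context` API, assuming a local directory of text files and a configured `OPENAI_API_KEY` so that the `ContextManager` (next file) can embed context descriptions; the path, id, and description below are placeholders:

```
# Illustrative sketch: wrap a directory of papers in a TextFileContext, then derive
# new (unmaterialized) Contexts via compute() and search().
from palimpzest.core.data.context import TextFileContext

papers = TextFileContext(
    path="path/to/papers",  # placeholder directory of text files
    id="papers",
    description="A directory of research papers stored as plain text.",
)

# compute() and search() each return a new Context whose description records its lineage;
# neither is materialized until it is actually executed.
summaries = papers.compute("Summarize the key findings of each paper")
battery_papers = papers.search("papers about battery chemistry")

print(summaries.materialized)      # False
print(battery_papers.description)  # includes "Parent Context ID: papers" and the search query
```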
palimpzest/core/data/context_manager.py
@@ -0,0 +1,163 @@
+ from __future__ import annotations
+
+ import os
+ import pickle
+
+ import chromadb
+ import chromadb.utils.embedding_functions as embedding_functions
+ import tiktoken
+
+ from palimpzest.constants import PZ_DIR
+ from palimpzest.core.data import context
+
+
+ class ContextNotFoundError(Exception):
+     pass
+
+
+ class ContextManager:
+     """
+     This class manages the long-term storage of `Contexts`. Each new `Context` is added to
+     the `ContextManager` and serialized to disk. `Contexts` are also indexed, which enables
+     PZ to search for `Context(s)` which may support `search()` and `compute()` operations.
+     """
+     def __init__(self):
+         # create directory with serialized contexts (if it doesn't already exist)
+         self.context_dir = os.path.join(PZ_DIR, "contexts")
+         os.makedirs(self.context_dir, exist_ok=True)
+
+         # create vector store (if it doesn't already exist)
+         self.chroma_dir = os.path.join(PZ_DIR, "chroma")
+         os.makedirs(self.chroma_dir, exist_ok=True)
+         self.chroma_client = chromadb.PersistentClient(self.chroma_dir)
+
+         # pick embedding function based on presence of API key(s)
+         self.emb_fn = None
+         if os.getenv("OPENAI_API_KEY"):
+             self.emb_fn = embedding_functions.OpenAIEmbeddingFunction(
+                 api_key=os.getenv("OPENAI_API_KEY"),
+                 model_name="text-embedding-3-small"
+             )
+
+         self.index = self.chroma_client.get_or_create_collection("contexts", embedding_function=self.emb_fn)
+
+     @staticmethod
+     def from_pkl(path: str) -> context.Context:
+         """Load a `Context` from its serialized pickle file."""
+         with open(path, "rb") as f:
+             context = pickle.load(f)
+
+         return context
+
+     @staticmethod
+     def to_pkl(context: context.Context, path: str) -> None:
+         """Write the given `Context` to a pickle file at the provided `path`."""
+         with open(path, "wb") as f:
+             pickle.dump(context, f)
+
+     def num_tokens_from_string(self, string: str, encoding_name: str) -> int:
+         """Returns the number of tokens in a text string."""
+         encoding = tiktoken.get_encoding(encoding_name)
+         num_tokens = len(encoding.encode(string))
+         return num_tokens
+
+     def add_context(self, context: context.Context, update: bool = False) -> None:
+         """
+         Add the new `Context` to the `ContextManager` by serializing and writing it to disk.
+
+         Args:
+             context (`Context`): the context to add to the `ContextManager`
+             update (`bool`): whether or not to update an existing context
+
+         TODO: track cost
+         """
+         # return early if the context already exists and we're not performing an update
+         id = context.id
+         context_path = os.path.join(self.context_dir, f"{id}.pkl")
+         if os.path.exists(context_path) and update is False:
+             return
+
+         # write the context to disk
+         ContextManager.to_pkl(context, context_path)
+
+         # truncate the description until it fits within the embedding model's token limit
+         description = context.description
+         while self.num_tokens_from_string(description, "cl100k_base") > 8192:
+             description = description[:int(0.9*len(description))]
+
+         # add context to vector store
+         context_embeddings = self.emb_fn([description])
+         context_payload = {
+             "ids": [context.id],
+             "embeddings": context_embeddings,
+             "metadatas": [{"id": context.id, "materialized": context.materialized}],
+             "documents": [context.description],
+         }
+         if update:
+             self.index.update(**context_payload)
+         else:
+             self.index.add(**context_payload)
+
+     def update_context(self, id: str, description: str, materialized: bool = True) -> None:
+         """
+         Update an existing `Context` with the given `id` to have the given `description`.
+
+         Args:
+             id (str): the id of the updated `Context`
+             description (str): the update to the description for the specified `Context`
+             materialized (bool): boolean to set the materialization status of the `Context`
+
+         Raises:
+             ContextNotFoundError: if the given `id` doesn't point to a `Context` in the `ContextManager`.
+         """
+         context = self.get_context(id)
+         new_description = context.description + description  # TODO: should description have RESULT replaced on update? as opposed to appending? should description be some pydantic BaseModel?
+         context.set_description(new_description)
+         context.set_materialized(materialized)
+         self.add_context(context, update=True)
+
+     def get_context(self, id: str) -> context.Context:
+         """
+         Returns the `Context` specified by the given `id`.
+
+         Args:
+             id (str): the id of the retrieved `Context`
+
+         Returns:
+             `Context`: the specified `Context`.
+         """
+         context_path = os.path.join(self.context_dir, f"{id}.pkl")
+         try:
+             return ContextManager.from_pkl(context_path)
+         except FileNotFoundError as err:
+             raise ContextNotFoundError from err
+
+     def search_context(self, query: str, k: int = 1, where: dict | None = None) -> list[context.Context]:
+         """
+         Returns the top-k most relevant `Context(s)` for the given query. If provided,
+         the where dictionary will be used to filter the search results.
+
+         TODO:
+         3) update CostModel to account for benefit of using existing Context(s)
+         ---
+         4) unit test
+         5) track cost
+         """
+         # embed the search query
+         query_embeddings = self.emb_fn([query])
+
+         # look up ids of most similar contexts
+         results = self.index.query(
+             query_embeddings=query_embeddings,
+             n_results=k,
+             where=where,
+         )
+         ids = results["ids"][0]
+
+         # load and return Context objects
+         contexts = []
+         for id in ids:
+             context_path = os.path.join(self.context_dir, f"{id}.pkl")
+             contexts.append(ContextManager.from_pkl(context_path))
+
+         return contexts
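
To round out the two new modules, here is a brief sketch (again illustrative, not from the package) of how `ContextManager` is used once `Context`s have registered themselves: each context is pickled under the `contexts/` folder of `PZ_DIR`, indexed in a persistent Chroma collection, and later retrieved by id or by semantic search over its description. The id and query string below are placeholders:

```
# Illustrative sketch: look up previously registered Contexts by id or by semantic search.
from palimpzest.core.data.context_manager import ContextManager

cm = ContextManager()

# Contexts add themselves to the manager when constructed, so later lookups by id work:
papers = cm.get_context("papers")

# Semantic search over context descriptions; `where` filters on the stored Chroma metadata.
matches = cm.search_context("contexts containing research papers", k=2, where={"materialized": True})
for ctx in matches:
    print(ctx.id, ctx.materialized)
```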