palimpzest 0.5.4__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. palimpzest/__init__.py +7 -9
  2. palimpzest/constants.py +47 -7
  3. palimpzest/core/__init__.py +20 -26
  4. palimpzest/core/data/dataclasses.py +9 -2
  5. palimpzest/core/data/datareaders.py +497 -0
  6. palimpzest/core/elements/records.py +29 -37
  7. palimpzest/core/lib/fields.py +14 -12
  8. palimpzest/core/lib/schemas.py +80 -94
  9. palimpzest/policy.py +58 -0
  10. palimpzest/prompts/__init__.py +22 -0
  11. palimpzest/prompts/code_synthesis_prompts.py +28 -0
  12. palimpzest/prompts/convert_prompts.py +87 -0
  13. palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
  14. palimpzest/prompts/filter_prompts.py +69 -0
  15. palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
  16. palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
  17. palimpzest/prompts/prompt_factory.py +732 -0
  18. palimpzest/prompts/util_phrases.py +14 -0
  19. palimpzest/query/execution/execution_strategy.py +0 -3
  20. palimpzest/query/execution/parallel_execution_strategy.py +12 -25
  21. palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
  22. palimpzest/query/generators/generators.py +71 -347
  23. palimpzest/query/operators/__init__.py +5 -5
  24. palimpzest/query/operators/aggregate.py +10 -5
  25. palimpzest/query/operators/code_synthesis_convert.py +4 -48
  26. palimpzest/query/operators/convert.py +5 -2
  27. palimpzest/query/operators/critique_and_refine_convert.py +112 -0
  28. palimpzest/query/operators/filter.py +1 -1
  29. palimpzest/query/operators/limit.py +1 -1
  30. palimpzest/query/operators/logical.py +28 -27
  31. palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
  32. palimpzest/query/operators/physical.py +32 -20
  33. palimpzest/query/operators/project.py +1 -1
  34. palimpzest/query/operators/rag_convert.py +6 -3
  35. palimpzest/query/operators/retrieve.py +13 -31
  36. palimpzest/query/operators/scan.py +150 -0
  37. palimpzest/query/optimizer/__init__.py +5 -1
  38. palimpzest/query/optimizer/cost_model.py +18 -34
  39. palimpzest/query/optimizer/optimizer.py +40 -25
  40. palimpzest/query/optimizer/optimizer_strategy.py +26 -0
  41. palimpzest/query/optimizer/plan.py +2 -2
  42. palimpzest/query/optimizer/rules.py +118 -27
  43. palimpzest/query/processor/config.py +12 -1
  44. palimpzest/query/processor/mab_sentinel_processor.py +125 -112
  45. palimpzest/query/processor/nosentinel_processor.py +46 -62
  46. palimpzest/query/processor/query_processor.py +10 -20
  47. palimpzest/query/processor/query_processor_factory.py +12 -5
  48. palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
  49. palimpzest/query/processor/streaming_processor.py +11 -17
  50. palimpzest/sets.py +170 -94
  51. palimpzest/tools/pdfparser.py +5 -64
  52. palimpzest/utils/datareader_helpers.py +61 -0
  53. palimpzest/utils/field_helpers.py +69 -0
  54. palimpzest/utils/hash_helpers.py +3 -2
  55. palimpzest/utils/udfs.py +0 -28
  56. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/METADATA +49 -49
  57. palimpzest-0.6.1.dist-info/RECORD +87 -0
  58. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/top_level.txt +0 -1
  59. cli/README.md +0 -156
  60. cli/__init__.py +0 -0
  61. cli/cli_main.py +0 -390
  62. palimpzest/config.py +0 -89
  63. palimpzest/core/data/datasources.py +0 -369
  64. palimpzest/datamanager/__init__.py +0 -0
  65. palimpzest/datamanager/datamanager.py +0 -300
  66. palimpzest/prompts.py +0 -397
  67. palimpzest/query/operators/datasource.py +0 -202
  68. palimpzest-0.5.4.dist-info/RECORD +0 -83
  69. palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
  70. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/LICENSE +0 -0
  71. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/WHEEL +0 -0
cli/cli_main.py DELETED
@@ -1,390 +0,0 @@
1
- import os
2
- import subprocess
3
- from typing import Tuple
4
-
5
- import click
6
- import yaml
7
- from click_aliases import ClickAliasedGroup
8
- from prettytable import PrettyTable
9
-
10
-
11
- ############ DEFINITIONS ############
12
- class InvalidCommandError(Exception):
13
- pass
14
-
15
-
16
- ############## HELPERS ##############
17
- def _print_msg(msg: str) -> None:
18
- """
19
- Helper function to print messages in CLI-specific format. Currently just a wrapper around print(),
20
- could easily be extended to include color/formatted output.
21
-
22
- Parameters
23
- ----------
24
- msg: str
25
- Message to print to the console.
26
- """
27
- print(f"{msg}")
28
-
29
-
30
- def _run_bash_command(command: str) -> Tuple[str, str]:
31
- """
32
- Helper function to split a bash command on spaces and execute it using subprocess.
33
-
34
- Parameters
35
- ----------
36
- command: str
37
- Shell command to execute with subprocess.
38
-
39
- Returns
40
- -------
41
- Tuple[str, str]
42
- Tuple returning the stdout and stderr from running the shell command.
43
- """
44
- # split command on spaces into list of strings
45
- command_str_lst = command.split(" ")
46
-
47
- # execute command and capture the output
48
- out = subprocess.run(command_str_lst, capture_output=True)
49
-
50
- # return stdout as string
51
- return str(out.stdout, "utf-8"), str(out.stderr, "utf-8")
52
-
53
-
54
- def _help() -> str:
55
- """
56
- Syntactic sugar to call `pz --help` when a user calls `pz help`.
57
-
58
- Returns
59
- -------
60
- str
61
- The help text for the pz CLI.
62
- """
63
- # execute the help command using subprocess and return output
64
- stdout, _ = _run_bash_command("pz --help")
65
-
66
- return stdout
67
-
68
-
69
- ############ CLICK API ##############
70
- @click.group(cls=ClickAliasedGroup)
71
- def cli():
72
- """
73
- The CLI tool for Palimpzest.
74
- """
75
- pass
76
-
77
-
78
- @cli.command(aliases=["h"])
79
- def help() -> None:
80
- """
81
- Print the help message for PZ.
82
- """
83
- _print_msg(_help())
84
-
85
-
86
- @cli.command(aliases=["i"])
87
- def init() -> None:
88
- """
89
- Initialize data directory for PZ.
90
- """
91
- # set directory and initialize it for PZ
92
- import palimpzest.datamanager.datamanager as pzdm
93
- from palimpzest.constants import PZ_DIR
94
-
95
- pzdm.DataDirectory()
96
- _print_msg(f"Palimpzest system initialized in: {PZ_DIR}")
97
-
98
-
99
- @cli.command(aliases=["lsdata", "ls"])
100
- def ls_data() -> None:
101
- """
102
- Print a table listing the datasets registered with PZ.
103
- """
104
- # fetch list of registered datasets
105
- # Time the loading of the library
106
- import palimpzest.datamanager.datamanager as pzdm
107
-
108
- ds = pzdm.DataDirectory().list_registered_datasets()
109
-
110
- # construct table for printing
111
- table = [["Name", "Type", "Path"]]
112
- for path, descriptor in ds:
113
- table.append([path, descriptor[0], descriptor[1]])
114
-
115
- # print table of registered datasets
116
- t = PrettyTable(table[0])
117
- t.add_rows(table[1:])
118
- _print_msg(str(t))
119
- _print_msg("")
120
- _print_msg(f"Total datasets: {len(table) - 1}")
121
-
122
-
123
- @cli.command(aliases=["synthesize", "syn"])
124
- @click.option("--name", type=str, default=None, required=True, help="Registered name for the set of synthetic values")
125
- @click.option("--count", type=int, default=100, help="How many values should be synthesized")
126
- def synthesize_data(name: str, count: int) -> None:
127
- """
128
- Register a synthetic set of values with PZ
129
-
130
- Parameters
131
- ----------
132
- name: str
133
- Name to register the data set with
134
-
135
- count: int
136
- The nunber of values to synthesize
137
- """
138
- import palimpzest.datamanager.datamanager as pzdm
139
-
140
- name = name.strip()
141
-
142
- vals = []
143
- for i in range(0, count):
144
- vals.append(i)
145
- pzdm.DataDirectory().register_dataset(vals, name)
146
-
147
- _print_msg(f"Registered {name}")
148
-
149
-
150
- @cli.command(aliases=["register", "reg", "r"])
151
- @click.option("--path", type=str, default=None, required=True, help="File or directory to register as dataset.")
152
- @click.option("--name", type=str, default=None, required=True, help="Registered name for the file/dir.")
153
- def register_data(path: str, name: str) -> None:
154
- """
155
- Register a data file or data directory with PZ.
156
-
157
- Parameters
158
- ----------
159
- path: str
160
- Path to the data file or directory to register with PZ.
161
-
162
- name: str
163
- Name to register the data file / directory with.
164
- """
165
- import palimpzest.datamanager.datamanager as pzdm
166
-
167
- # parse path and name
168
- path = path.strip()
169
- name = name.strip()
170
-
171
- # register dataset
172
- if os.path.isfile(path):
173
- pzdm.DataDirectory().register_local_file(os.path.abspath(path), name)
174
-
175
- elif os.path.isdir(path):
176
- pzdm.DataDirectory().register_local_directory(os.path.abspath(path), name)
177
-
178
- else:
179
- raise InvalidCommandError(f"Path {path} is invalid. Does not point to a file or directory.")
180
-
181
- _print_msg(f"Registered {name}")
182
-
183
-
184
- @cli.command(aliases=["rmdata", "rm"])
185
- @click.option("--name", type=str, default=None, required=True, help="Name of registered dataset to be removed.")
186
- def rm_data(name: str) -> None:
187
- """
188
- Remove a dataset that was registered with PZ.
189
-
190
- Parameters
191
- ----------
192
- name: str
193
- Name of the dataset to unregister.
194
- """
195
- import palimpzest.datamanager.datamanager as pzdm
196
-
197
- # parse name
198
- name = name.strip()
199
-
200
- # remove dataset from registry
201
- pzdm.DataDirectory().rm_registered_dataset(name)
202
-
203
- _print_msg(f"Deleted {name}")
204
-
205
-
206
- @cli.command(aliases=["clear", "clr"])
207
- def clear_cache() -> None:
208
- """
209
- Clear the Palimpzest cache.
210
- """
211
- import palimpzest.datamanager.datamanager as pzdm
212
-
213
- pzdm.DataDirectory().clear_cache(keep_registry=True)
214
- _print_msg("Cache cleared")
215
-
216
-
217
- @cli.command(aliases=["config", "pc"])
218
- def print_config() -> None:
219
- """
220
- Print the current config that Palimpzest is using.
221
- """
222
- import palimpzest.datamanager.datamanager as pzdm
223
-
224
- # load config yaml file
225
- config = pzdm.DataDirectory().get_config()
226
-
227
- # print contents of config
228
- _print_msg(f"--- {config['name']} ---\n{yaml.dump(config)}")
229
-
230
-
231
- @cli.command(aliases=["cc"])
232
- @click.option("--name", type=str, default=None, required=True, help="Name of the config to create.")
233
- @click.option(
234
- "--llmservice",
235
- type=click.Choice(["openai", "together", "google"], case_sensitive=False),
236
- default="openai",
237
- help="Name of the LLM service to use.",
238
- )
239
- @click.option("--parallel", type=bool, default=False, help="Whether to run operations in parallel or not.")
240
- @click.option("--set", type=bool, is_flag=True, help="Set the created config to be the current config.")
241
- def create_config(name: str, llmservice: str, parallel: bool, set: bool) -> None:
242
- """
243
- Create a Palimpzest config. You must set the `name` field. You may optionally
244
- set the `llmservice` and `parallel` fields (default to )
245
-
246
- Parameters
247
- ----------
248
- name: str
249
- Name of the config to create.
250
- llmservice: str
251
- Name of the LLM service to use.
252
- parallel: bool
253
- Whether to run operations in parallel or not.
254
- set: bool
255
- If this flag is present, it will set the created config to be
256
- the current config.
257
- """
258
- from palimpzest.config import Config
259
- from palimpzest.constants import PZ_DIR
260
-
261
- # check that config name is unique
262
- if os.path.exists(os.path.join(PZ_DIR, f"config_{name}.yaml")):
263
- raise InvalidCommandError(f"Config with name {name} already exists.")
264
-
265
- # create config
266
- config = Config(name, llmservice, parallel)
267
-
268
- # set newly created config to be the current config if specified
269
- if set:
270
- config.set_current_config()
271
-
272
- _print_msg(f"Created config: {name}" if set is False else f"Created and set config: {name}")
273
-
274
-
275
- @cli.command(aliases=["rmconfig", "rmc"])
276
- @click.option("--name", type=str, default=None, required=True, help="Name of the config to remove.")
277
- def rm_config(name: str) -> None:
278
- """
279
- Remove the specified config from Palimpzest. You cannot remove the default config.
280
- If this config was the current config, the current config will be set to the default config.
281
-
282
- Parameters
283
- ----------
284
- name: str
285
- Name of the config to remove.
286
- """
287
- from palimpzest.config import Config
288
- from palimpzest.constants import PZ_DIR
289
-
290
- # check that config exists
291
- if not os.path.exists(os.path.join(PZ_DIR, f"config_{name}.yaml")):
292
- raise InvalidCommandError(f"Config with name {name} does not exist.")
293
-
294
- # load the specified config
295
- config = Config(name)
296
-
297
- # remove the config; this will update the current config as well
298
- config.remove_config()
299
- _print_msg(f"Deleted config: {name}")
300
-
301
-
302
- @cli.command(aliases=["set", "sc"])
303
- @click.option("--name", type=str, default=None, required=True, help="Name of the config to set as the current config.")
304
- def set_config(name: str) -> None:
305
- """
306
- Set the current config for Palimpzest to use.
307
-
308
- Parameters
309
- ----------
310
- name: str
311
- Name of the config to set as the current config.
312
- """
313
- from palimpzest.config import Config
314
- from palimpzest.constants import PZ_DIR
315
-
316
- # check that config exists
317
- if not os.path.exists(os.path.join(PZ_DIR, f"config_{name}.yaml")):
318
- raise InvalidCommandError(f"Config with name {name} does not exist.")
319
-
320
- # load the specified config
321
- config = Config(name)
322
-
323
- # set the config as the current config
324
- config.set_current_config()
325
- _print_msg(f"Set config: {name}")
326
-
327
-
328
- @cli.command(aliases=["uc", "update"])
329
- @click.option("--name", type=str, default=None, required=True, help="Name of the config to update.")
330
- @click.option(
331
- "--settings",
332
- type=str,
333
- required=True,
334
- help="Parameters to update in format 'param1=value1,param2=value2'. Example: 'llmservice=openai,parallel=true,pdfprocessor=pdfplumber'"
335
- )
336
- def update_config(name: str, settings: str) -> None:
337
- """
338
- Update multiple parameters in an existing Palimpzest config.
339
-
340
- Parameters
341
- ----------
342
- name: str
343
- Name of the config to update
344
- params: str
345
- Comma-separated list of parameter=value pairs to update
346
- """
347
- from palimpzest.config import Config
348
- from palimpzest.constants import PZ_DIR
349
-
350
- # check that config exists
351
- if not os.path.exists(os.path.join(PZ_DIR, f"config_{name}.yaml")):
352
- raise InvalidCommandError(f"Config with name {name} does not exist.")
353
-
354
- # load the specified config
355
- config = Config(name)
356
-
357
- # Parse the params string into a dictionary
358
- try:
359
- param_pairs = settings.split(',')
360
- updates = {}
361
- for pair in param_pairs:
362
- if pair.strip() == "":
363
- continue
364
- param, value = pair.split('=')
365
- updates[param.strip()] = value.strip()
366
- except Exception as e:
367
- raise InvalidCommandError("Invalid params format. Use: param1=value1,param2=value2") from e
368
-
369
- # Update each parameter
370
- for param, value in updates.items():
371
- config.set(param, value)
372
-
373
- _print_msg(f"Updated config {name} with: {updates}")
374
-
375
- def main():
376
- """
377
- Entrypoint for Palimpzest CLI tool implemented using Click.
378
- """
379
- cli.add_command(help)
380
- cli.add_command(init)
381
- cli.add_command(ls_data)
382
- cli.add_command(register_data)
383
- cli.add_command(rm_data)
384
- cli.add_command(clear_cache)
385
- cli.add_command(print_config)
386
- cli.add_command(create_config)
387
- cli.add_command(rm_config)
388
- cli.add_command(set_config)
389
- cli.add_command(update_config)
390
- cli()
palimpzest/config.py DELETED
@@ -1,89 +0,0 @@
1
- import os
2
- import sys
3
- import tempfile
4
-
5
- import yaml
6
-
7
- from palimpzest.constants import PZ_DIR
8
-
9
-
10
- class Config:
11
- def __init__(self, name: str = "default", llmservice: str = "openai", parallel: bool = False):
12
- self.config_file_path = os.path.join(PZ_DIR, f"config_{name}.yaml")
13
- if not os.path.exists(PZ_DIR):
14
- raise Exception(
15
- f"Target config directory does not exist at {PZ_DIR} :: Something is wrong with the installation."
16
- )
17
-
18
- if not os.path.exists(self.config_file_path):
19
- # Get the system's temporary directory
20
- temp_dir = tempfile.gettempdir()
21
- pz_file_cache_dir = os.path.join(temp_dir, "pz")
22
- self.config = {
23
- "name": name,
24
- "llmservice": llmservice,
25
- "parallel": parallel,
26
- "filecachedir": pz_file_cache_dir,
27
- }
28
- self._save_config()
29
-
30
- self.config = self._load_config()
31
- if "filecachedir" not in self.config:
32
- # Get the system's temporary directory
33
- temp_dir = tempfile.gettempdir()
34
- pz_file_cache_dir = os.path.join(temp_dir, "pz")
35
- self.config["filecachedir"] = pz_file_cache_dir
36
- self._save_config()
37
-
38
- self.name = self.config["name"]
39
-
40
- def get(self, key, default=None):
41
- return self.config.get(key, default)
42
-
43
- def set(self, key, value):
44
- self.config[key] = value
45
- self._save_config()
46
-
47
- def set_current_config(self):
48
- current_config_dict = {"current_config_name": self.name}
49
- current_config_path = os.path.join(PZ_DIR, "current_config.yaml")
50
- with open(current_config_path, "w") as f:
51
- yaml.dump(current_config_dict, f)
52
-
53
- def remove_config(self):
54
- # check to ensure you don't delete default config
55
- if self.name == "default":
56
- raise Exception("Cannot remove default config.")
57
-
58
- # reset current config if this config was the current config
59
- current_config_path = os.path.join(PZ_DIR, "current_config.yaml")
60
- current_config_dict = {}
61
- with open(current_config_path) as f:
62
- current_config_dict = yaml.safe_load(f)
63
-
64
- if current_config_dict["current_config_name"] == self.name:
65
- current_config_dict["current_config_name"] = "default"
66
-
67
- with open(current_config_path, "w") as f:
68
- yaml.dump(current_config_dict, f)
69
-
70
- # delete config file
71
- os.remove(self.config_file_path)
72
-
73
- def _load_config(self):
74
- """Load YAML configuration from the specified path."""
75
- try:
76
- with open(self.config_file_path) as file:
77
- return yaml.safe_load(file)
78
- except Exception as e:
79
- print(f"Error loading configuration file: {e}")
80
- sys.exit(1)
81
-
82
- def _save_config(self):
83
- """Save the configuration to the specified path."""
84
- try:
85
- with open(self.config_file_path, "w") as file:
86
- yaml.dump(self.config, file)
87
- except Exception as e:
88
- print(f"Error saving configuration file: {e}")
89
- sys.exit(1)