palimpzest 0.5.4__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. palimpzest/__init__.py +7 -9
  2. palimpzest/constants.py +47 -7
  3. palimpzest/core/__init__.py +20 -26
  4. palimpzest/core/data/dataclasses.py +9 -2
  5. palimpzest/core/data/datareaders.py +497 -0
  6. palimpzest/core/elements/records.py +29 -37
  7. palimpzest/core/lib/fields.py +14 -12
  8. palimpzest/core/lib/schemas.py +80 -94
  9. palimpzest/policy.py +58 -0
  10. palimpzest/prompts/__init__.py +22 -0
  11. palimpzest/prompts/code_synthesis_prompts.py +28 -0
  12. palimpzest/prompts/convert_prompts.py +87 -0
  13. palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
  14. palimpzest/prompts/filter_prompts.py +69 -0
  15. palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
  16. palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
  17. palimpzest/prompts/prompt_factory.py +732 -0
  18. palimpzest/prompts/util_phrases.py +14 -0
  19. palimpzest/query/execution/execution_strategy.py +0 -3
  20. palimpzest/query/execution/parallel_execution_strategy.py +12 -25
  21. palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
  22. palimpzest/query/generators/generators.py +71 -347
  23. palimpzest/query/operators/__init__.py +5 -5
  24. palimpzest/query/operators/aggregate.py +10 -5
  25. palimpzest/query/operators/code_synthesis_convert.py +4 -48
  26. palimpzest/query/operators/convert.py +5 -2
  27. palimpzest/query/operators/critique_and_refine_convert.py +112 -0
  28. palimpzest/query/operators/filter.py +1 -1
  29. palimpzest/query/operators/limit.py +1 -1
  30. palimpzest/query/operators/logical.py +28 -27
  31. palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
  32. palimpzest/query/operators/physical.py +32 -20
  33. palimpzest/query/operators/project.py +1 -1
  34. palimpzest/query/operators/rag_convert.py +6 -3
  35. palimpzest/query/operators/retrieve.py +13 -31
  36. palimpzest/query/operators/scan.py +150 -0
  37. palimpzest/query/optimizer/__init__.py +5 -1
  38. palimpzest/query/optimizer/cost_model.py +18 -34
  39. palimpzest/query/optimizer/optimizer.py +40 -25
  40. palimpzest/query/optimizer/optimizer_strategy.py +26 -0
  41. palimpzest/query/optimizer/plan.py +2 -2
  42. palimpzest/query/optimizer/rules.py +118 -27
  43. palimpzest/query/processor/config.py +12 -1
  44. palimpzest/query/processor/mab_sentinel_processor.py +125 -112
  45. palimpzest/query/processor/nosentinel_processor.py +46 -62
  46. palimpzest/query/processor/query_processor.py +10 -20
  47. palimpzest/query/processor/query_processor_factory.py +12 -5
  48. palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
  49. palimpzest/query/processor/streaming_processor.py +11 -17
  50. palimpzest/sets.py +170 -94
  51. palimpzest/tools/pdfparser.py +5 -64
  52. palimpzest/utils/datareader_helpers.py +61 -0
  53. palimpzest/utils/field_helpers.py +69 -0
  54. palimpzest/utils/hash_helpers.py +3 -2
  55. palimpzest/utils/udfs.py +0 -28
  56. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/METADATA +49 -49
  57. palimpzest-0.6.0.dist-info/RECORD +87 -0
  58. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/top_level.txt +0 -1
  59. cli/README.md +0 -156
  60. cli/__init__.py +0 -0
  61. cli/cli_main.py +0 -390
  62. palimpzest/config.py +0 -89
  63. palimpzest/core/data/datasources.py +0 -369
  64. palimpzest/datamanager/__init__.py +0 -0
  65. palimpzest/datamanager/datamanager.py +0 -300
  66. palimpzest/prompts.py +0 -397
  67. palimpzest/query/operators/datasource.py +0 -202
  68. palimpzest-0.5.4.dist-info/RECORD +0 -83
  69. palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
  70. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/LICENSE +0 -0
  71. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/WHEEL +0 -0
palimpzest/utils/udfs.py CHANGED
@@ -3,17 +3,12 @@ This file collects a sample of useful UDFs to convert schemata.
3
3
  """
4
4
 
5
5
  import io
6
- import json
7
6
  from datetime import datetime
8
7
 
9
- import modal
10
8
  import pandas as pd
11
9
  import requests
12
- from papermage import Document
13
10
 
14
11
  from palimpzest.constants import MAX_ROWS
15
- from palimpzest.datamanager.datamanager import DataDirectory
16
- from palimpzest.tools.pdfparser import get_text_from_pdf
17
12
 
18
13
 
19
14
  def url_to_file(candidate: dict):
@@ -30,29 +25,6 @@ def url_to_file(candidate: dict):
30
25
  return {"filename": filename, "timestamp": timestamp, "contents": contents}
31
26
 
32
27
 
33
- def file_to_pdf(candidate: dict):
34
- pdfprocessor = DataDirectory().current_config.get("pdfprocessor")
35
- if pdfprocessor == "modal":
36
- print("handling PDF processing remotely")
37
- remote_func = modal.Function.lookup("palimpzest.tools", "processPapermagePdf")
38
- else:
39
- remote_func = None
40
-
41
- pdf_bytes = candidate["contents"]
42
- # generate text_content from PDF
43
- if remote_func is not None:
44
- doc_json_str = remote_func.remote([pdf_bytes])
45
- docdict = json.loads(doc_json_str[0])
46
- doc = Document.from_json(docdict)
47
- text_content = ""
48
- for p in doc.pages:
49
- text_content += p.text
50
- else:
51
- text_content = get_text_from_pdf(candidate["filename"], candidate["contents"])
52
-
53
- return {"text_contents": text_content[:10000]} # TODO Very hacky
54
-
55
-
56
28
  def file_to_xls(candidate: dict):
57
29
  """Function used to convert a DataRecord instance of File to a XLSFile DataRecord."""
58
30
  xls = pd.ExcelFile(io.BytesIO(candidate["contents"]), engine="openpyxl")
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: palimpzest
3
- Version: 0.5.4
3
+ Version: 0.6.0
4
4
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
5
5
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
6
- Project-URL: homepage, https://github.com/mitdbg/palimpzest/
6
+ Project-URL: homepage, https://palimpzest.org
7
7
  Project-URL: repository, https://github.com/mitdbg/palimpzest/
8
+ Project-URL: documentation, https://palimpzest.org
8
9
  Keywords: relational,optimization,llm,AI programming,extraction,tools,document,search,integration
9
10
  Classifier: Development Status :: 4 - Beta
10
11
  Classifier: Intended Audience :: Developers
@@ -28,6 +29,10 @@ Requires-Dist: opencv-python-headless<4.9.0,>=4.8.0
28
29
  Requires-Dist: jupyter
29
30
  Requires-Dist: layoutparser>=0.3.4
30
31
  Requires-Dist: lxml-html-clean>=0.1.1
32
+ Requires-Dist: mkdocs>=1.6.1
33
+ Requires-Dist: mkdocs-material>=9.6.3
34
+ Requires-Dist: mkdocs-material[imaging]
35
+ Requires-Dist: mkdocstrings-python>=1.15.0
31
36
  Requires-Dist: modal>=0.62.198
32
37
  Requires-Dist: ncls==0.0.68
33
38
  Requires-Dist: necessary>=0.3.2
@@ -38,7 +43,6 @@ Requires-Dist: pandas>=2.1.1
38
43
  Requires-Dist: papermage>=0.16.0
39
44
  Requires-Dist: pdf2image
40
45
  Requires-Dist: pytest>=8.2.2
41
- Requires-Dist: pypdf==4.3.1
42
46
  Requires-Dist: python-Levenshtein
43
47
  Requires-Dist: pdfplumber==0.7.4
44
48
  Requires-Dist: pillow>=10.2.0
@@ -46,6 +50,7 @@ Requires-Dist: prettytable>=3.9.0
46
50
  Requires-Dist: PyLD>=2.0.4
47
51
  Requires-Dist: pyarrow<15.0.0,>=13.0.0; python_version < "3.12"
48
52
  Requires-Dist: pyarrow<19.0.0,>=15.0.0; python_version >= "3.12"
53
+ Requires-Dist: pypdf>=5.1.0
49
54
  Requires-Dist: pytest-mock>=3.14.0
50
55
  Requires-Dist: python-Levenshtein>=0.25.1
51
56
  Requires-Dist: pyyaml>=6.0.1
@@ -66,12 +71,16 @@ Requires-Dist: sphinx>=8.1.3
66
71
  ![pz-banner](https://palimpzest-workloads.s3.us-east-1.amazonaws.com/palimpzest-cropped.png)
67
72
 
68
73
  # Palimpzest (PZ)
69
- [![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b?logo=arxiv)](https://arxiv.org/pdf/2405.14696)
70
- [![Blog Post](https://img.shields.io/badge/Blog-PZ-green)](https://dsg.csail.mit.edu/projects/palimpzest/)
74
+ [![Discord](https://img.shields.io/discord/1245561987480420445?logo=discord)](https://discord.gg/dN85JJ6jaH)
75
+ [![Docs](https://img.shields.io/badge/Read_the_Docs-purple?logo=readthedocs)](https://palimpzest.org/)
71
76
  [![Colab Demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zqOxnh_G6eZ8_xax6PvDr-EjMt7hp4R5?usp=sharing)
72
- [![Video](https://img.shields.io/badge/YouTube-Talk-red?logo=youtube)](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu)
73
77
  [![PyPI](https://img.shields.io/pypi/v/palimpzest)](https://pypi.org/project/palimpzest/)
74
- [![PyPI - Monthly Downloads](https://img.shields.io/pypi/dm/palimpzest)](https://pypi.org/project/palimpzest/)
78
+ [![PyPI - Monthly Downloads](https://img.shields.io/pypi/dm/palimpzest?color=teal)](https://pypi.org/project/palimpzest/)
79
+ <!-- [![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b?logo=arxiv)](https://arxiv.org/pdf/2405.14696) -->
80
+ <!-- [![Video](https://img.shields.io/badge/YouTube-Talk-red?logo=youtube)](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
81
+
82
+ ## Learn How to Use PZ
83
+ Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
75
84
 
76
85
  ## Getting started
77
86
  You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
@@ -86,6 +95,17 @@ $ cd palimpzest
86
95
  $ pip install .
87
96
  ```
88
97
 
98
+ ## Join the PZ Community
99
+ We are actively hacking on PZ and would love to have you join our community [![Discord](https://img.shields.io/discord/1245561987480420445?logo=discord)](https://discord.gg/dN85JJ6jaH)
100
+
101
+ [Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
102
+ - Get help with your PZ program(s)
103
+ - Give feedback to the maintainers
104
+ - Discuss the future direction(s) of the project
105
+ - Discuss anything related to data processing with LLMs!
106
+
107
+ We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
108
+
89
109
  ## Quick Start
90
110
  The easiest way to get started with Palimpzest is to run the `quickstart.ipynb` jupyter notebook. We demonstrate the full workflow of working with PZ, including registering a dataset, composing and executing a pipeline, and accessing the results.
91
111
  To run the notebook, you can use the following command:
@@ -97,48 +117,28 @@ And then access the notebook from the jupyter interface in your browser at `loca
97
117
  ### Even Quicker Start
98
118
  For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
99
119
  ```python
100
- import pandas as pd
101
- import palimpzest.datamanager.datamanager as pzdm
102
- from palimpzest.sets import Dataset
103
- from palimpzest.core.lib.fields import Field
104
- from palimpzest.core.lib.schemas import Schema, TextFile
105
- from palimpzest.policy import MinCost, MaxQuality
106
- from palimpzest.query.processor.config import QueryProcessorConfig
107
-
108
- # Dataset registration
109
- dataset_path = "testdata/enron-tiny"
110
- dataset_name = "enron-tiny"
111
- pzdm.DataDirectory().register_local_directory(dataset_path, dataset_name)
112
-
113
- # Dataset loading
114
- dataset = Dataset(dataset_name, schema=TextFile)
115
-
116
- # Schema definition for the fields we wish to compute
117
- class Email(Schema):
118
- """Represents an email, which in practice is usually from a text file"""
119
- sender = Field(desc="The email address of the sender")
120
- subject = Field(desc="The subject of the email")
121
- date = Field(desc="The date the email was sent")
122
-
123
- # Lazy construction of computation to filter for emails about holidays sent in July
124
- dataset = dataset.convert(Email, desc="An email from the Enron dataset")
125
- dataset = dataset.filter("The email was sent in July")
126
- dataset = dataset.filter("The email is about holidays")
127
-
128
- # Executing the computation
129
- policy = MinCost()
130
- config = QueryProcessorConfig(
131
- policy=policy,
132
- verbose=True,
133
- processing_strategy="no_sentinel",
134
- execution_strategy="sequential",
135
- optimizer_strategy="pareto",
136
- )
137
- results, execution_stats = dataset.run(config)
138
-
139
- # Writing output to disk
140
- output_df = pd.DataFrame([r.to_dict() for r in results])[["date","sender","subject"]]
141
- output_df.to_csv("july_holiday_emails.csv")
120
+ import palimpzest as pz
121
+
122
+ # define the fields we wish to compute
123
+ email_cols = [
124
+ {"name": "sender", "type": str, "desc": "The email address of the sender"},
125
+ {"name": "subject", "type": str, "desc": "The subject of the email"},
126
+ {"name": "date", "type": str, "desc": "The date the email was sent"},
127
+ ]
128
+
129
+ # lazily construct the computation to get emails about holidays sent in July
130
+ dataset = pz.Dataset("testdata/enron-tiny/")
131
+ dataset = dataset.sem_add_columns(email_cols)
132
+ dataset = dataset.sem_filter("The email was sent in July")
133
+ dataset = dataset.sem_filter("The email is about holidays")
134
+
135
+ # execute the computation w/the MinCost policy
136
+ config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
137
+ output = dataset.run(config)
138
+
139
+ # display output (if using Jupyter, otherwise use print(output_df))
140
+ output_df = output.to_df(cols=["date", "sender", "subject"])
141
+ display(output_df)
142
142
  ```
143
143
 
144
144
  ## Palimpzest CLI
@@ -0,0 +1,87 @@
1
+ palimpzest/__init__.py,sha256=ZcxM-zzT3sX7cDTcx1tN7-Udet0lqoiDgnvlOs7nMhY,766
2
+ palimpzest/constants.py,sha256=SumDHOKDot25Sld0tCzWF6rs3oeLp42DFrDFNJQ8uoM,14035
3
+ palimpzest/policy.py,sha256=2cMio_AUfZv6lksr_klfP747G4w1nsZJtfmt6zjeaMk,12656
4
+ palimpzest/sets.py,sha256=LPbYSXf0LhsFXHCR7xq9nyBER0DBzcWM95Q6vhd3RvA,14180
5
+ palimpzest/core/__init__.py,sha256=XJQxijqc68kWa44-me5ZIij6PKjOspxaeZKKrVhEjVo,1472
6
+ palimpzest/core/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ palimpzest/core/data/dataclasses.py,sha256=biRtaxwy1ALeo50jykKx2_J3bXE9qbu0cwM2RtoZlzo,19486
8
+ palimpzest/core/data/datareaders.py,sha256=9ZNLp2j7g8_f0YDTqavLpMhXtxh5O4efprSZNkwpDkA,17323
9
+ palimpzest/core/elements/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ palimpzest/core/elements/filters.py,sha256=zBBYRUnPfEYb85IfC04TJkS45GxgL5KfXASIHWwlwow,1554
11
+ palimpzest/core/elements/groupbysig.py,sha256=1qHuR2-fcW-E4rxPSieYGSXZYwvFaPwf1ld9VPWvWjw,2233
12
+ palimpzest/core/elements/records.py,sha256=wEQnk2-TygII-2h0j9ag91AUE2heDAD5ohEBCSjhNZ8,13801
13
+ palimpzest/core/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ palimpzest/core/lib/fields.py,sha256=QW99ya_VffHMnaef0LP97isYp8w59YI9FInGMy_M8M8,4072
15
+ palimpzest/core/lib/schemas.py,sha256=g3yH6RxgN0sbsW7YkklHAnrHjlbK_2Am-Mt6inw7f6U,16977
16
+ palimpzest/prompts/__init__.py,sha256=klF8bYcNZWTQyuX8ZaqUXmD8Syq-MOHSRKemiwyM4N4,680
17
+ palimpzest/prompts/code_synthesis_prompts.py,sha256=8mlMTPAI5WsoG0LVohoBFL-dnOro-mP3VJgEAiwgxnU,1472
18
+ palimpzest/prompts/convert_prompts.py,sha256=mUt2TkSerAYuYyDg7LC4AQ195Zz-zoZjA0AN_yMH9MQ,3595
19
+ palimpzest/prompts/critique_and_refine_convert_prompts.py,sha256=WoXExBxQ7twswd9VCCST26c-2ehZtpD2iQoBi7sqDnQ,7814
20
+ palimpzest/prompts/filter_prompts.py,sha256=iQjn-39h3L0E5wng_UPgAXRHrP1ok329TXpOgZ6Wn1w,2372
21
+ palimpzest/prompts/moa_aggregator_convert_prompts.py,sha256=BQRrtGdr53PTqvXzmFh8kfQ_w9KoKw-zTtmdo-8RFjo,2887
22
+ palimpzest/prompts/moa_proposer_convert_prompts.py,sha256=d_hOh0-0m6HWBDAxUu7W3WyQtSTlUvqio3nzpnX2bxM,3642
23
+ palimpzest/prompts/prompt_factory.py,sha256=VzZNH9kblFXYn4YKVKudJ21Y5Q-3tL6ZgFmNhBNTGjQ,31921
24
+ palimpzest/prompts/util_phrases.py,sha256=NWrcHfjJyiOY16Jyt7R50moVnlJDyvSBZ9kBqyX2WQo,751
25
+ palimpzest/query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ palimpzest/query/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
+ palimpzest/query/execution/execution_strategy.py,sha256=HdiZ0nhMON0FyCzNoPhUgZnJgRstUAIZ99a33k_04BI,2443
28
+ palimpzest/query/execution/parallel_execution_strategy.py,sha256=8_0B_MYoLERVhQqllLHOkQ6OTIJqc6VRHPftRLlx7_s,10974
29
+ palimpzest/query/execution/single_threaded_execution_strategy.py,sha256=7SWr-cOJARBkLeDE_0UF45XQNwetWZi76p_iIUXO0xU,13469
30
+ palimpzest/query/generators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ palimpzest/query/generators/generators.py,sha256=ktd-NDwaDf7W-t3d8qa3G2DX3YvuaPXVr9ZqGPr7AkE,18784
32
+ palimpzest/query/operators/__init__.py,sha256=a_Jk_1LzaoNQHY4b5qdHZmF6kG4g1xY8lO-ZxZHkrUQ,3285
33
+ palimpzest/query/operators/aggregate.py,sha256=nVLgJkB8oWj4Urclr8TV2w0fL9LZwU6x0Cf4dxMy19A,10212
34
+ palimpzest/query/operators/code_synthesis_convert.py,sha256=wWHAH83fpC99uleI3tVHPBGyrqAZax5jU7xJ5MgNntI,20948
35
+ palimpzest/query/operators/convert.py,sha256=aVZCo5R-nDs2n8J9h8x7JCk7WIYgx5zwldUlQZOyD3I,19905
36
+ palimpzest/query/operators/critique_and_refine_convert.py,sha256=nrNUX9rtZRQ40XdUXbqxS5_R3ThnpXN9d95Vy4XWGCI,5229
37
+ palimpzest/query/operators/filter.py,sha256=TXZAjNW9xiMV17Z4E83m5OTXT_0BOdLcfmFEbBOUqO4,10372
38
+ palimpzest/query/operators/limit.py,sha256=xnGC6zmHdPm_2YCtsVRBL2iwXcUB1lP_vsEkuHV4nmY,2103
39
+ palimpzest/query/operators/logical.py,sha256=Mx7Q12Cf2sr6Xr-PbtLlNLAVjC6lwV7WLk77pC3Gvs4,14708
40
+ palimpzest/query/operators/mixture_of_agents_convert.py,sha256=_RVmZ4gRzgM6bRgsSzDwU8JrGSuA1zotxKO1wRhUDhc,6628
41
+ palimpzest/query/operators/physical.py,sha256=2AOqj0D79-g3IVVnsyBH9g_F9NNHb1j-iOhogIEx8eY,8346
42
+ palimpzest/query/operators/project.py,sha256=djlKXCkz2b-h1phsD8tWqewcTLKBfWMgsyZ52oFN2MY,2084
43
+ palimpzest/query/operators/rag_convert.py,sha256=vUcmgNyrJO8KJYRbs15BVUNHDi8xF-n_UO5Pou9V5zc,8445
44
+ palimpzest/query/operators/retrieve.py,sha256=VrwtCT4Bgv4ipPmFoAgSLK93dilquKMi_hE6GKy1UNs,3972
45
+ palimpzest/query/operators/scan.py,sha256=z6wUVxuhr5VqPIeUxb3hxhkaljKpDc_exzGMe4NMgxY,5728
46
+ palimpzest/query/operators/token_reduction_convert.py,sha256=yy9GYMPt-LQxPdwIgVyhCb9hi_8FRorGU8XqK_3jq9g,8513
47
+ palimpzest/query/optimizer/__init__.py,sha256=pl1co0dCwDZkAQ-0oiwT81GjvB0Oc59WiwmmYF8k73s,3109
48
+ palimpzest/query/optimizer/cost_model.py,sha256=zSK2Nsya96pR5Zh67cr_O5q0qtPa08--Tchn0cYvE58,44837
49
+ palimpzest/query/optimizer/optimizer.py,sha256=lBfNYgbyyE_0bdZCGnz9oicyG3gFUdkRnzcyJ31_36o,20644
50
+ palimpzest/query/optimizer/optimizer_strategy.py,sha256=-1xx_cviSJw6PH8XiQQK9qe4YPnAmxZEAhNVKdxRgH4,12894
51
+ palimpzest/query/optimizer/plan.py,sha256=xlWB3sY5qDac3o6IHoWcuGK5Azv-4C2_zKKx4PzxEh4,5768
52
+ palimpzest/query/optimizer/primitives.py,sha256=ikaX8YcDM3IrxKt98OX-mYujRYQtdMlDgsFKyjchMMA,4061
53
+ palimpzest/query/optimizer/rules.py,sha256=jpwSI_xCzkdML4PNQScDNGzXExqont2AhQFfL4Eumdg,44059
54
+ palimpzest/query/optimizer/tasks.py,sha256=ORyPpAbbVAUjkxh3WyDYw2I8Z6RfQLUsLGOh5987zTI,28058
55
+ palimpzest/query/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
+ palimpzest/query/processor/config.py,sha256=kOhBxAZ3OeDDlQ2qMII1i2EorFpSSQbEFVFrNk-3F-o,3226
57
+ palimpzest/query/processor/mab_sentinel_processor.py,sha256=xShPVW8ejhNR_wQ8ofPF2hL7iGj8tlFhsc2wiiIPb9Y,45600
58
+ palimpzest/query/processor/nosentinel_processor.py,sha256=alPq1tnZvqxCSO5LYRCjlF4CB4v7NbzH_BHB-DSuehI,26478
59
+ palimpzest/query/processor/query_processor.py,sha256=aR0OBmaZZt4_KSBjHy_KCT6pIBI8WTfT8TTcIkgPBt4,11109
60
+ palimpzest/query/processor/query_processor_factory.py,sha256=663_V-AJK0VsBZNwgnqYu84g0rmtKf-U-xJWnps3XWs,8239
61
+ palimpzest/query/processor/random_sampling_sentinel_processor.py,sha256=VFC0HFY3OLxAjpdmBt41K0rw8C_1ylECFjbRYsui3rU,30440
62
+ palimpzest/query/processor/streaming_processor.py,sha256=4-XvgAjUTnO3Dgdxm9VSw4udREjNWTW526Rggy5Do7s,6501
63
+ palimpzest/schemabuilder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
+ palimpzest/schemabuilder/schema_builder.py,sha256=kGEv-Adba-FNziRrlG0zwx317IuD7rmzNl2GecvnbDw,8528
65
+ palimpzest/tools/README.md,sha256=56_6LPG80uc0CLVhTBP6I1wgIffNv9cyTr0TmVZqmrM,483
66
+ palimpzest/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
+ palimpzest/tools/allenpdf.py,sha256=fXMOmSDdSSLXDKAPYYJ8k4egtWEBf_Me9Lq9tM3iyoA,1690
68
+ palimpzest/tools/pdfparser.py,sha256=0DOVUZLxYfqjxM8WNEfYcyiXb1qW9BWVIHEB_B_YhWA,9570
69
+ palimpzest/tools/skema_tools.py,sha256=HXUFpjMhbVxZwKKkATeK-FwtlTCawaCbeP-uHntI1Kg,669
70
+ palimpzest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
+ palimpzest/utils/datareader_helpers.py,sha256=-tkIf9iOF7mr-gyrrUQqnHWwRig4OGnowU0Wpx5HKG0,2121
72
+ palimpzest/utils/demo_helpers.py,sha256=BcNgtTz4O9iGriefy6f26BtJd_G5SQPzD3oQg_qLUdU,2522
73
+ palimpzest/utils/env_helpers.py,sha256=n81KzoJ459pRxo7QmJA7duazwWsfoMGTHc71D2LatFk,334
74
+ palimpzest/utils/field_helpers.py,sha256=Op18ThAnDlALiAkquUQbelHodZZYg378Ct1I8eIkKio,2291
75
+ palimpzest/utils/generation_helpers.py,sha256=jveE9iQQtUQ94nuU6c1zuWoQMkwizr037S8si4n35jo,3230
76
+ palimpzest/utils/hash_helpers.py,sha256=3A8dA7SbXTwnnvZvPVNqqMLlVRhCKyKF_bjNNAu3Exk,334
77
+ palimpzest/utils/index_helpers.py,sha256=7webOjV2vYF7UJ_YsNdoX5OyR1zJ6lSLWO1mQSGWz0Q,123
78
+ palimpzest/utils/model_helpers.py,sha256=dZdMkZ6zOBqG3uBCkmzXG1yQAoaGL3wF6lNSgnvigEQ,2399
79
+ palimpzest/utils/progress.py,sha256=GYmPUBdG7xmqbqj1UiSNP-pWZKmRMLX797MBgrOPugM,7214
80
+ palimpzest/utils/sandbox.py,sha256=Ge96gmzqeOGlNkMCG9A95_PB8wRQbvTFua136of8FcA,6465
81
+ palimpzest/utils/token_reduction_helpers.py,sha256=Ob95PcqCsbGLiBdQ-4YQsWGWRppb2hvQyt0gi1fzL-Y,3855
82
+ palimpzest/utils/udfs.py,sha256=LjHic54B1az-rKgNLur0wOpaz2ko_UodjLEJrazkxvY,1854
83
+ palimpzest-0.6.0.dist-info/LICENSE,sha256=5GUlHy9lr-Py9kvV38FF1m3yy3NqM18fefuE9wkWumo,1079
84
+ palimpzest-0.6.0.dist-info/METADATA,sha256=AdFeZfrufXEoY-U2w9b2As8LqIaK2UpsEGN7nXERwVE,7837
85
+ palimpzest-0.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
86
+ palimpzest-0.6.0.dist-info/top_level.txt,sha256=raV06dJUgohefUn3ZyJS2uqp_Y76EOLA9Y2e_fxt8Ew,11
87
+ palimpzest-0.6.0.dist-info/RECORD,,
cli/README.md DELETED
@@ -1,156 +0,0 @@
1
- ## Palimpzest CLI
2
- Installing Palimpzest also installs its CLI tool `pz` which provides users with basic utilities for creating and managing their own Palimpzest system. Running `pz --help` displays an overview of the CLI's commands:
3
- ```bash
4
- $ pz --help
5
- Usage: pz [OPTIONS] COMMAND [ARGS]...
6
-
7
- The CLI tool for Palimpzest.
8
-
9
- Options:
10
- --help Show this message and exit.
11
-
12
- Commands:
13
- help (h) Print the help message for PZ.
14
- init (i) Initialize data directory for PZ.
15
- ls-data (ls,lsdata) Print a table listing the datasets
16
- registered with PZ.
17
- register-data (r,reg,register) Register a data file or data directory with
18
- PZ.
19
- rm-data (rm,rmdata) Remove a dataset that was registered with
20
- PZ.
21
- ```
22
-
23
- Users can initialize their own system by running `pz init`. This will create Palimpzest's working directory in `~/.palimpzest`:
24
- ```bash
25
- $ pz init
26
- Palimpzest system initialized in: /Users/matthewrusso/.palimpzest
27
- ```
28
-
29
- If we list the set of datasets registered with Palimpzest, we'll see there currently are none:
30
- ```bash
31
- $ pz ls
32
- +------+------+------+
33
- | Name | Type | Path |
34
- +------+------+------+
35
- +------+------+------+
36
-
37
- Total datasets: 0
38
- ```
39
-
40
- ### Registering Datasets
41
- To add (or "register") a dataset with Palimpzest, we can use the `pz register-data` command (also aliased as `pz reg`) to specify that a file or directory at a given `--path` should be registered as a dataset with the specified `--name`:
42
- ```bash
43
- $ pz reg --path README.md --name rdme
44
- Registered rdme
45
- ```
46
-
47
- If we list Palimpzest's datasets again we will see that `README.md` has been registered under the dataset named `rdme`:
48
- ```bash
49
- $ pz ls
50
- +------+------+------------------------------------------+
51
- | Name | Type | Path |
52
- +------+------+------------------------------------------+
53
- | rdme | file | /Users/matthewrusso/palimpzest/README.md |
54
- +------+------+------------------------------------------+
55
-
56
- Total datasets: 1
57
- ```
58
-
59
- To remove a dataset from Palimpzest, simply use the `pz rm-data` command (also aliased as `pz rm`) and specify the `--name` of the dataset you would like to remove:
60
- ```bash
61
- $ pz rm --name rdme
62
- Deleted rdme
63
- ```
64
-
65
- Finally, listing our datasets once more will show that the dataset has been deleted:
66
- ```bash
67
- $ pz ls
68
- +------+------+------+
69
- | Name | Type | Path |
70
- +------+------+------+
71
- +------+------+------+
72
-
73
- Total datasets: 0
74
- ```
75
-
76
- ### Cache Management
77
- Palimpzest will cache intermediate results by default. It can be useful to remove them from the cache when trying to evaluate the performance improvement(s) of code changes. We provide a utility command `pz clear-cache` (also aliased as `pz clr`) to clear the cache:
78
- ```bash
79
- $ pz clr
80
- Cache cleared
81
- ```
82
-
83
- ### Config Management
84
- You may wish to work with multiple configurations of Palimpzest in order to, e.g., evaluate the difference in performance between various LLM services for your data extraction task. To see the config Palimpzest is currently using, you can run the `pz print-config` command (also aliased as `pz config`):
85
- ```bash
86
- $ pz config
87
- --- default ---
88
- filecachedir: /some/local/filepath
89
- llmservice: openai
90
- name: default
91
- parallel: false
92
- ```
93
- By default, Palimpzest uses the configuration named `default`. As shown above, if you run a script using Palimpzest out-of-the-box, it will use OpenAI endpoints for all of its API calls.
94
-
95
- Now, let's say you wanted to try using [together.ai's](https://www.together.ai/) endpoints for your API calls. You could do this by creating a new config with the `pz create-config` command (also aliased as `pz cc`):
96
- ```bash
97
- $ pz cc --name together-conf --llmservice together --parallel True --set
98
- Created and set config: together-conf
99
- ```
100
- The `--name` parameter is required and specifies the unique name for your config. The `--llmservice` and `--parallel` options specify the service to use and whether or not to process files in parallel. Finally, if the `--set` flag is present, Palimpzest will update its current config to point to the newly created config.
101
-
102
- We can confirm that Palimpzest checked out our new config by running `pz config`:
103
- ```bash
104
- $ pz config
105
- --- together-conf ---
106
- filecachedir: /some/local/filepath
107
- llmservice: together
108
- name: together-conf
109
- parallel: true
110
- ```
111
-
112
- You can switch which config you are using at any time by using the `pz set-config` command (also aliased as `pz set`):
113
- ```bash
114
- $ pz set --name default
115
- Set config: default
116
-
117
- $ pz config
118
- --- default ---
119
- filecachedir: /some/local/filepath
120
- llmservice: openai
121
- name: default
122
- parallel: false
123
-
124
- $ pz set --name together-conf
125
- Set config: together-conf
126
-
127
- $ pz config
128
- --- together-conf ---
129
- filecachedir: /some/local/filepath
130
- llmservice: together
131
- name: together-conf
132
- parallel: true
133
- ```
134
-
135
- You can update an existing config using the `pz update` command (also aliased as `pz uc`):
136
- ```bash
137
- $ pz update --name default --settings parallel=true,pdfprocessor=pdfplumber
138
- Updated config: default
139
-
140
- $ pz config
141
- --- default ---
142
- filecachedir: /some/local/filepath
143
- llmservice: anthropic
144
- name: default
145
- parallel: true
146
- pdfprocessor: pdfplumber
147
- ```
148
-
149
- The `--name` parameter specifies which config to update. `--settings` specifies all the parameter name and value pairs in the format `param_name=param_value`, separated by commas.
150
-
151
- Finally, you can delete a config with the `pz rm-config` command (also aliased as `pz rmc`):
152
- ```bash
153
- $ pz rmc --name together-conf
154
- Deleted config: together-conf
155
- ```
156
- Note that you cannot delete the `default` config, and if you delete the config that you currently have set, Palimpzest will set the current config to be `default`.
cli/__init__.py DELETED
File without changes