palimpzest 0.5.4__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +7 -9
- palimpzest/constants.py +47 -7
- palimpzest/core/__init__.py +20 -26
- palimpzest/core/data/dataclasses.py +9 -2
- palimpzest/core/data/datareaders.py +497 -0
- palimpzest/core/elements/records.py +29 -37
- palimpzest/core/lib/fields.py +14 -12
- palimpzest/core/lib/schemas.py +80 -94
- palimpzest/policy.py +58 -0
- palimpzest/prompts/__init__.py +22 -0
- palimpzest/prompts/code_synthesis_prompts.py +28 -0
- palimpzest/prompts/convert_prompts.py +87 -0
- palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
- palimpzest/prompts/filter_prompts.py +69 -0
- palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
- palimpzest/prompts/prompt_factory.py +732 -0
- palimpzest/prompts/util_phrases.py +14 -0
- palimpzest/query/execution/execution_strategy.py +0 -3
- palimpzest/query/execution/parallel_execution_strategy.py +12 -25
- palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
- palimpzest/query/generators/generators.py +71 -347
- palimpzest/query/operators/__init__.py +5 -5
- palimpzest/query/operators/aggregate.py +10 -5
- palimpzest/query/operators/code_synthesis_convert.py +4 -48
- palimpzest/query/operators/convert.py +5 -2
- palimpzest/query/operators/critique_and_refine_convert.py +112 -0
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/logical.py +28 -27
- palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
- palimpzest/query/operators/physical.py +32 -20
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/rag_convert.py +6 -3
- palimpzest/query/operators/retrieve.py +13 -31
- palimpzest/query/operators/scan.py +150 -0
- palimpzest/query/optimizer/__init__.py +5 -1
- palimpzest/query/optimizer/cost_model.py +18 -34
- palimpzest/query/optimizer/optimizer.py +40 -25
- palimpzest/query/optimizer/optimizer_strategy.py +26 -0
- palimpzest/query/optimizer/plan.py +2 -2
- palimpzest/query/optimizer/rules.py +118 -27
- palimpzest/query/processor/config.py +12 -1
- palimpzest/query/processor/mab_sentinel_processor.py +125 -112
- palimpzest/query/processor/nosentinel_processor.py +46 -62
- palimpzest/query/processor/query_processor.py +10 -20
- palimpzest/query/processor/query_processor_factory.py +12 -5
- palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
- palimpzest/query/processor/streaming_processor.py +11 -17
- palimpzest/sets.py +170 -94
- palimpzest/tools/pdfparser.py +5 -64
- palimpzest/utils/datareader_helpers.py +61 -0
- palimpzest/utils/field_helpers.py +69 -0
- palimpzest/utils/hash_helpers.py +3 -2
- palimpzest/utils/udfs.py +0 -28
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/METADATA +49 -49
- palimpzest-0.6.1.dist-info/RECORD +87 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/top_level.txt +0 -1
- cli/README.md +0 -156
- cli/__init__.py +0 -0
- cli/cli_main.py +0 -390
- palimpzest/config.py +0 -89
- palimpzest/core/data/datasources.py +0 -369
- palimpzest/datamanager/__init__.py +0 -0
- palimpzest/datamanager/datamanager.py +0 -300
- palimpzest/prompts.py +0 -397
- palimpzest/query/operators/datasource.py +0 -202
- palimpzest-0.5.4.dist-info/RECORD +0 -83
- palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/LICENSE +0 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/WHEEL +0 -0
palimpzest/utils/udfs.py
CHANGED
|
@@ -3,17 +3,12 @@ This file collects a sample of useful UDFs to convert schemata.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import io
|
|
6
|
-
import json
|
|
7
6
|
from datetime import datetime
|
|
8
7
|
|
|
9
|
-
import modal
|
|
10
8
|
import pandas as pd
|
|
11
9
|
import requests
|
|
12
|
-
from papermage import Document
|
|
13
10
|
|
|
14
11
|
from palimpzest.constants import MAX_ROWS
|
|
15
|
-
from palimpzest.datamanager.datamanager import DataDirectory
|
|
16
|
-
from palimpzest.tools.pdfparser import get_text_from_pdf
|
|
17
12
|
|
|
18
13
|
|
|
19
14
|
def url_to_file(candidate: dict):
|
|
@@ -30,29 +25,6 @@ def url_to_file(candidate: dict):
|
|
|
30
25
|
return {"filename": filename, "timestamp": timestamp, "contents": contents}
|
|
31
26
|
|
|
32
27
|
|
|
33
|
-
def file_to_pdf(candidate: dict):
|
|
34
|
-
pdfprocessor = DataDirectory().current_config.get("pdfprocessor")
|
|
35
|
-
if pdfprocessor == "modal":
|
|
36
|
-
print("handling PDF processing remotely")
|
|
37
|
-
remote_func = modal.Function.lookup("palimpzest.tools", "processPapermagePdf")
|
|
38
|
-
else:
|
|
39
|
-
remote_func = None
|
|
40
|
-
|
|
41
|
-
pdf_bytes = candidate["contents"]
|
|
42
|
-
# generate text_content from PDF
|
|
43
|
-
if remote_func is not None:
|
|
44
|
-
doc_json_str = remote_func.remote([pdf_bytes])
|
|
45
|
-
docdict = json.loads(doc_json_str[0])
|
|
46
|
-
doc = Document.from_json(docdict)
|
|
47
|
-
text_content = ""
|
|
48
|
-
for p in doc.pages:
|
|
49
|
-
text_content += p.text
|
|
50
|
-
else:
|
|
51
|
-
text_content = get_text_from_pdf(candidate["filename"], candidate["contents"])
|
|
52
|
-
|
|
53
|
-
return {"text_contents": text_content[:10000]} # TODO Very hacky
|
|
54
|
-
|
|
55
|
-
|
|
56
28
|
def file_to_xls(candidate: dict):
|
|
57
29
|
"""Function used to convert a DataRecord instance of File to a XLSFile DataRecord."""
|
|
58
30
|
xls = pd.ExcelFile(io.BytesIO(candidate["contents"]), engine="openpyxl")
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: palimpzest
|
|
3
|
-
Version: 0.5.4
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
|
|
5
5
|
Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
|
|
6
|
-
Project-URL: homepage, https://
|
|
6
|
+
Project-URL: homepage, https://palimpzest.org
|
|
7
7
|
Project-URL: repository, https://github.com/mitdbg/palimpzest/
|
|
8
|
+
Project-URL: documentation, https://palimpzest.org
|
|
8
9
|
Keywords: relational,optimization,llm,AI programming,extraction,tools,document,search,integration
|
|
9
10
|
Classifier: Development Status :: 4 - Beta
|
|
10
11
|
Classifier: Intended Audience :: Developers
|
|
@@ -28,6 +29,10 @@ Requires-Dist: opencv-python-headless<4.9.0,>=4.8.0
|
|
|
28
29
|
Requires-Dist: jupyter
|
|
29
30
|
Requires-Dist: layoutparser>=0.3.4
|
|
30
31
|
Requires-Dist: lxml-html-clean>=0.1.1
|
|
32
|
+
Requires-Dist: mkdocs>=1.6.1
|
|
33
|
+
Requires-Dist: mkdocs-material>=9.6.3
|
|
34
|
+
Requires-Dist: mkdocs-material[imaging]
|
|
35
|
+
Requires-Dist: mkdocstrings-python>=1.15.0
|
|
31
36
|
Requires-Dist: modal>=0.62.198
|
|
32
37
|
Requires-Dist: ncls==0.0.68
|
|
33
38
|
Requires-Dist: necessary>=0.3.2
|
|
@@ -38,7 +43,6 @@ Requires-Dist: pandas>=2.1.1
|
|
|
38
43
|
Requires-Dist: papermage>=0.16.0
|
|
39
44
|
Requires-Dist: pdf2image
|
|
40
45
|
Requires-Dist: pytest>=8.2.2
|
|
41
|
-
Requires-Dist: pypdf==4.3.1
|
|
42
46
|
Requires-Dist: python-Levenshtein
|
|
43
47
|
Requires-Dist: pdfplumber==0.7.4
|
|
44
48
|
Requires-Dist: pillow>=10.2.0
|
|
@@ -46,6 +50,7 @@ Requires-Dist: prettytable>=3.9.0
|
|
|
46
50
|
Requires-Dist: PyLD>=2.0.4
|
|
47
51
|
Requires-Dist: pyarrow<15.0.0,>=13.0.0; python_version < "3.12"
|
|
48
52
|
Requires-Dist: pyarrow<19.0.0,>=15.0.0; python_version >= "3.12"
|
|
53
|
+
Requires-Dist: pypdf>=5.1.0
|
|
49
54
|
Requires-Dist: pytest-mock>=3.14.0
|
|
50
55
|
Requires-Dist: python-Levenshtein>=0.25.1
|
|
51
56
|
Requires-Dist: pyyaml>=6.0.1
|
|
@@ -66,12 +71,16 @@ Requires-Dist: sphinx>=8.1.3
|
|
|
66
71
|

|
|
67
72
|
|
|
68
73
|
# Palimpzest (PZ)
|
|
69
|
-
[](https://discord.gg/dN85JJ6jaH)
|
|
75
|
+
[](https://palimpzest.org/)
|
|
71
76
|
[](https://colab.research.google.com/drive/1zqOxnh_G6eZ8_xax6PvDr-EjMt7hp4R5?usp=sharing)
|
|
72
|
-
[](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu)
|
|
73
77
|
[](https://pypi.org/project/palimpzest/)
|
|
74
|
-
[](https://pypi.org/project/palimpzest/)
|
|
78
|
+
[](https://pypi.org/project/palimpzest/)
|
|
79
|
+
<!-- [](https://arxiv.org/pdf/2405.14696) -->
|
|
80
|
+
<!-- [](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
|
|
81
|
+
|
|
82
|
+
## Learn How to Use PZ
|
|
83
|
+
Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
|
|
75
84
|
|
|
76
85
|
## Getting started
|
|
77
86
|
You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
|
|
@@ -86,6 +95,17 @@ $ cd palimpzest
|
|
|
86
95
|
$ pip install .
|
|
87
96
|
```
|
|
88
97
|
|
|
98
|
+
## Join the PZ Community
|
|
99
|
+
We are actively hacking on PZ and would love to have you join our community [](https://discord.gg/dN85JJ6jaH)
|
|
100
|
+
|
|
101
|
+
[Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
|
|
102
|
+
- Get help with your PZ program(s)
|
|
103
|
+
- Give feedback to the maintainers
|
|
104
|
+
- Discuss the future direction(s) of the project
|
|
105
|
+
- Discuss anything related to data processing with LLMs!
|
|
106
|
+
|
|
107
|
+
We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
|
|
108
|
+
|
|
89
109
|
## Quick Start
|
|
90
110
|
The easiest way to get started with Palimpzest is to run the `quickstart.ipynb` jupyter notebook. We demonstrate the full workflow of working with PZ, including registering a dataset, composing and executing a pipeline, and accessing the results.
|
|
91
111
|
To run the notebook, you can use the following command:
|
|
@@ -97,48 +117,28 @@ And then access the notebook from the jupyter interface in your browser at `loca
|
|
|
97
117
|
### Even Quicker Start
|
|
98
118
|
For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
|
|
99
119
|
```python
|
|
100
|
-
import
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
# Lazy construction of computation to filter for emails about holidays sent in July
|
|
124
|
-
dataset = dataset.convert(Email, desc="An email from the Enron dataset")
|
|
125
|
-
dataset = dataset.filter("The email was sent in July")
|
|
126
|
-
dataset = dataset.filter("The email is about holidays")
|
|
127
|
-
|
|
128
|
-
# Executing the computation
|
|
129
|
-
policy = MinCost()
|
|
130
|
-
config = QueryProcessorConfig(
|
|
131
|
-
policy=policy,
|
|
132
|
-
verbose=True,
|
|
133
|
-
processing_strategy="no_sentinel",
|
|
134
|
-
execution_strategy="sequential",
|
|
135
|
-
optimizer_strategy="pareto",
|
|
136
|
-
)
|
|
137
|
-
results, execution_stats = dataset.run(config)
|
|
138
|
-
|
|
139
|
-
# Writing output to disk
|
|
140
|
-
output_df = pd.DataFrame([r.to_dict() for r in results])[["date","sender","subject"]]
|
|
141
|
-
output_df.to_csv("july_holiday_emails.csv")
|
|
120
|
+
import palimpzest as pz
|
|
121
|
+
|
|
122
|
+
# define the fields we wish to compute
|
|
123
|
+
email_cols = [
|
|
124
|
+
{"name": "sender", "type": str, "desc": "The email address of the sender"},
|
|
125
|
+
{"name": "subject", "type": str, "desc": "The subject of the email"},
|
|
126
|
+
{"name": "date", "type": str, "desc": "The date the email was sent"},
|
|
127
|
+
]
|
|
128
|
+
|
|
129
|
+
# lazily construct the computation to get emails about holidays sent in July
|
|
130
|
+
dataset = pz.Dataset("testdata/enron-tiny/")
|
|
131
|
+
dataset = dataset.sem_add_columns(email_cols)
|
|
132
|
+
dataset = dataset.sem_filter("The email was sent in July")
|
|
133
|
+
dataset = dataset.sem_filter("The email is about holidays")
|
|
134
|
+
|
|
135
|
+
# execute the computation w/the MinCost policy
|
|
136
|
+
config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
|
|
137
|
+
output = dataset.run(config)
|
|
138
|
+
|
|
139
|
+
# display output (if using Jupyter, otherwise use print(output_df))
|
|
140
|
+
output_df = output.to_df(cols=["date", "sender", "subject"])
|
|
141
|
+
display(output_df)
|
|
142
142
|
```
|
|
143
143
|
|
|
144
144
|
## Palimpzest CLI
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
palimpzest/__init__.py,sha256=ZcxM-zzT3sX7cDTcx1tN7-Udet0lqoiDgnvlOs7nMhY,766
|
|
2
|
+
palimpzest/constants.py,sha256=SumDHOKDot25Sld0tCzWF6rs3oeLp42DFrDFNJQ8uoM,14035
|
|
3
|
+
palimpzest/policy.py,sha256=2cMio_AUfZv6lksr_klfP747G4w1nsZJtfmt6zjeaMk,12656
|
|
4
|
+
palimpzest/sets.py,sha256=LPbYSXf0LhsFXHCR7xq9nyBER0DBzcWM95Q6vhd3RvA,14180
|
|
5
|
+
palimpzest/core/__init__.py,sha256=XJQxijqc68kWa44-me5ZIij6PKjOspxaeZKKrVhEjVo,1472
|
|
6
|
+
palimpzest/core/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
palimpzest/core/data/dataclasses.py,sha256=biRtaxwy1ALeo50jykKx2_J3bXE9qbu0cwM2RtoZlzo,19486
|
|
8
|
+
palimpzest/core/data/datareaders.py,sha256=9ZNLp2j7g8_f0YDTqavLpMhXtxh5O4efprSZNkwpDkA,17323
|
|
9
|
+
palimpzest/core/elements/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
palimpzest/core/elements/filters.py,sha256=zBBYRUnPfEYb85IfC04TJkS45GxgL5KfXASIHWwlwow,1554
|
|
11
|
+
palimpzest/core/elements/groupbysig.py,sha256=1qHuR2-fcW-E4rxPSieYGSXZYwvFaPwf1ld9VPWvWjw,2233
|
|
12
|
+
palimpzest/core/elements/records.py,sha256=wEQnk2-TygII-2h0j9ag91AUE2heDAD5ohEBCSjhNZ8,13801
|
|
13
|
+
palimpzest/core/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
palimpzest/core/lib/fields.py,sha256=QW99ya_VffHMnaef0LP97isYp8w59YI9FInGMy_M8M8,4072
|
|
15
|
+
palimpzest/core/lib/schemas.py,sha256=g3yH6RxgN0sbsW7YkklHAnrHjlbK_2Am-Mt6inw7f6U,16977
|
|
16
|
+
palimpzest/prompts/__init__.py,sha256=klF8bYcNZWTQyuX8ZaqUXmD8Syq-MOHSRKemiwyM4N4,680
|
|
17
|
+
palimpzest/prompts/code_synthesis_prompts.py,sha256=8mlMTPAI5WsoG0LVohoBFL-dnOro-mP3VJgEAiwgxnU,1472
|
|
18
|
+
palimpzest/prompts/convert_prompts.py,sha256=mUt2TkSerAYuYyDg7LC4AQ195Zz-zoZjA0AN_yMH9MQ,3595
|
|
19
|
+
palimpzest/prompts/critique_and_refine_convert_prompts.py,sha256=WoXExBxQ7twswd9VCCST26c-2ehZtpD2iQoBi7sqDnQ,7814
|
|
20
|
+
palimpzest/prompts/filter_prompts.py,sha256=iQjn-39h3L0E5wng_UPgAXRHrP1ok329TXpOgZ6Wn1w,2372
|
|
21
|
+
palimpzest/prompts/moa_aggregator_convert_prompts.py,sha256=BQRrtGdr53PTqvXzmFh8kfQ_w9KoKw-zTtmdo-8RFjo,2887
|
|
22
|
+
palimpzest/prompts/moa_proposer_convert_prompts.py,sha256=d_hOh0-0m6HWBDAxUu7W3WyQtSTlUvqio3nzpnX2bxM,3642
|
|
23
|
+
palimpzest/prompts/prompt_factory.py,sha256=VzZNH9kblFXYn4YKVKudJ21Y5Q-3tL6ZgFmNhBNTGjQ,31921
|
|
24
|
+
palimpzest/prompts/util_phrases.py,sha256=NWrcHfjJyiOY16Jyt7R50moVnlJDyvSBZ9kBqyX2WQo,751
|
|
25
|
+
palimpzest/query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
+
palimpzest/query/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
|
+
palimpzest/query/execution/execution_strategy.py,sha256=HdiZ0nhMON0FyCzNoPhUgZnJgRstUAIZ99a33k_04BI,2443
|
|
28
|
+
palimpzest/query/execution/parallel_execution_strategy.py,sha256=8_0B_MYoLERVhQqllLHOkQ6OTIJqc6VRHPftRLlx7_s,10974
|
|
29
|
+
palimpzest/query/execution/single_threaded_execution_strategy.py,sha256=7SWr-cOJARBkLeDE_0UF45XQNwetWZi76p_iIUXO0xU,13469
|
|
30
|
+
palimpzest/query/generators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
+
palimpzest/query/generators/generators.py,sha256=ktd-NDwaDf7W-t3d8qa3G2DX3YvuaPXVr9ZqGPr7AkE,18784
|
|
32
|
+
palimpzest/query/operators/__init__.py,sha256=a_Jk_1LzaoNQHY4b5qdHZmF6kG4g1xY8lO-ZxZHkrUQ,3285
|
|
33
|
+
palimpzest/query/operators/aggregate.py,sha256=nVLgJkB8oWj4Urclr8TV2w0fL9LZwU6x0Cf4dxMy19A,10212
|
|
34
|
+
palimpzest/query/operators/code_synthesis_convert.py,sha256=wWHAH83fpC99uleI3tVHPBGyrqAZax5jU7xJ5MgNntI,20948
|
|
35
|
+
palimpzest/query/operators/convert.py,sha256=aVZCo5R-nDs2n8J9h8x7JCk7WIYgx5zwldUlQZOyD3I,19905
|
|
36
|
+
palimpzest/query/operators/critique_and_refine_convert.py,sha256=nrNUX9rtZRQ40XdUXbqxS5_R3ThnpXN9d95Vy4XWGCI,5229
|
|
37
|
+
palimpzest/query/operators/filter.py,sha256=TXZAjNW9xiMV17Z4E83m5OTXT_0BOdLcfmFEbBOUqO4,10372
|
|
38
|
+
palimpzest/query/operators/limit.py,sha256=xnGC6zmHdPm_2YCtsVRBL2iwXcUB1lP_vsEkuHV4nmY,2103
|
|
39
|
+
palimpzest/query/operators/logical.py,sha256=Mx7Q12Cf2sr6Xr-PbtLlNLAVjC6lwV7WLk77pC3Gvs4,14708
|
|
40
|
+
palimpzest/query/operators/mixture_of_agents_convert.py,sha256=_RVmZ4gRzgM6bRgsSzDwU8JrGSuA1zotxKO1wRhUDhc,6628
|
|
41
|
+
palimpzest/query/operators/physical.py,sha256=2AOqj0D79-g3IVVnsyBH9g_F9NNHb1j-iOhogIEx8eY,8346
|
|
42
|
+
palimpzest/query/operators/project.py,sha256=djlKXCkz2b-h1phsD8tWqewcTLKBfWMgsyZ52oFN2MY,2084
|
|
43
|
+
palimpzest/query/operators/rag_convert.py,sha256=vUcmgNyrJO8KJYRbs15BVUNHDi8xF-n_UO5Pou9V5zc,8445
|
|
44
|
+
palimpzest/query/operators/retrieve.py,sha256=VrwtCT4Bgv4ipPmFoAgSLK93dilquKMi_hE6GKy1UNs,3972
|
|
45
|
+
palimpzest/query/operators/scan.py,sha256=z6wUVxuhr5VqPIeUxb3hxhkaljKpDc_exzGMe4NMgxY,5728
|
|
46
|
+
palimpzest/query/operators/token_reduction_convert.py,sha256=yy9GYMPt-LQxPdwIgVyhCb9hi_8FRorGU8XqK_3jq9g,8513
|
|
47
|
+
palimpzest/query/optimizer/__init__.py,sha256=pl1co0dCwDZkAQ-0oiwT81GjvB0Oc59WiwmmYF8k73s,3109
|
|
48
|
+
palimpzest/query/optimizer/cost_model.py,sha256=zSK2Nsya96pR5Zh67cr_O5q0qtPa08--Tchn0cYvE58,44837
|
|
49
|
+
palimpzest/query/optimizer/optimizer.py,sha256=lBfNYgbyyE_0bdZCGnz9oicyG3gFUdkRnzcyJ31_36o,20644
|
|
50
|
+
palimpzest/query/optimizer/optimizer_strategy.py,sha256=-1xx_cviSJw6PH8XiQQK9qe4YPnAmxZEAhNVKdxRgH4,12894
|
|
51
|
+
palimpzest/query/optimizer/plan.py,sha256=xlWB3sY5qDac3o6IHoWcuGK5Azv-4C2_zKKx4PzxEh4,5768
|
|
52
|
+
palimpzest/query/optimizer/primitives.py,sha256=ikaX8YcDM3IrxKt98OX-mYujRYQtdMlDgsFKyjchMMA,4061
|
|
53
|
+
palimpzest/query/optimizer/rules.py,sha256=jpwSI_xCzkdML4PNQScDNGzXExqont2AhQFfL4Eumdg,44059
|
|
54
|
+
palimpzest/query/optimizer/tasks.py,sha256=ORyPpAbbVAUjkxh3WyDYw2I8Z6RfQLUsLGOh5987zTI,28058
|
|
55
|
+
palimpzest/query/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
56
|
+
palimpzest/query/processor/config.py,sha256=kOhBxAZ3OeDDlQ2qMII1i2EorFpSSQbEFVFrNk-3F-o,3226
|
|
57
|
+
palimpzest/query/processor/mab_sentinel_processor.py,sha256=xShPVW8ejhNR_wQ8ofPF2hL7iGj8tlFhsc2wiiIPb9Y,45600
|
|
58
|
+
palimpzest/query/processor/nosentinel_processor.py,sha256=alPq1tnZvqxCSO5LYRCjlF4CB4v7NbzH_BHB-DSuehI,26478
|
|
59
|
+
palimpzest/query/processor/query_processor.py,sha256=aR0OBmaZZt4_KSBjHy_KCT6pIBI8WTfT8TTcIkgPBt4,11109
|
|
60
|
+
palimpzest/query/processor/query_processor_factory.py,sha256=663_V-AJK0VsBZNwgnqYu84g0rmtKf-U-xJWnps3XWs,8239
|
|
61
|
+
palimpzest/query/processor/random_sampling_sentinel_processor.py,sha256=VFC0HFY3OLxAjpdmBt41K0rw8C_1ylECFjbRYsui3rU,30440
|
|
62
|
+
palimpzest/query/processor/streaming_processor.py,sha256=4-XvgAjUTnO3Dgdxm9VSw4udREjNWTW526Rggy5Do7s,6501
|
|
63
|
+
palimpzest/schemabuilder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
64
|
+
palimpzest/schemabuilder/schema_builder.py,sha256=kGEv-Adba-FNziRrlG0zwx317IuD7rmzNl2GecvnbDw,8528
|
|
65
|
+
palimpzest/tools/README.md,sha256=56_6LPG80uc0CLVhTBP6I1wgIffNv9cyTr0TmVZqmrM,483
|
|
66
|
+
palimpzest/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
67
|
+
palimpzest/tools/allenpdf.py,sha256=fXMOmSDdSSLXDKAPYYJ8k4egtWEBf_Me9Lq9tM3iyoA,1690
|
|
68
|
+
palimpzest/tools/pdfparser.py,sha256=0DOVUZLxYfqjxM8WNEfYcyiXb1qW9BWVIHEB_B_YhWA,9570
|
|
69
|
+
palimpzest/tools/skema_tools.py,sha256=HXUFpjMhbVxZwKKkATeK-FwtlTCawaCbeP-uHntI1Kg,669
|
|
70
|
+
palimpzest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
+
palimpzest/utils/datareader_helpers.py,sha256=-tkIf9iOF7mr-gyrrUQqnHWwRig4OGnowU0Wpx5HKG0,2121
|
|
72
|
+
palimpzest/utils/demo_helpers.py,sha256=BcNgtTz4O9iGriefy6f26BtJd_G5SQPzD3oQg_qLUdU,2522
|
|
73
|
+
palimpzest/utils/env_helpers.py,sha256=n81KzoJ459pRxo7QmJA7duazwWsfoMGTHc71D2LatFk,334
|
|
74
|
+
palimpzest/utils/field_helpers.py,sha256=Op18ThAnDlALiAkquUQbelHodZZYg378Ct1I8eIkKio,2291
|
|
75
|
+
palimpzest/utils/generation_helpers.py,sha256=jveE9iQQtUQ94nuU6c1zuWoQMkwizr037S8si4n35jo,3230
|
|
76
|
+
palimpzest/utils/hash_helpers.py,sha256=3A8dA7SbXTwnnvZvPVNqqMLlVRhCKyKF_bjNNAu3Exk,334
|
|
77
|
+
palimpzest/utils/index_helpers.py,sha256=7webOjV2vYF7UJ_YsNdoX5OyR1zJ6lSLWO1mQSGWz0Q,123
|
|
78
|
+
palimpzest/utils/model_helpers.py,sha256=dZdMkZ6zOBqG3uBCkmzXG1yQAoaGL3wF6lNSgnvigEQ,2399
|
|
79
|
+
palimpzest/utils/progress.py,sha256=GYmPUBdG7xmqbqj1UiSNP-pWZKmRMLX797MBgrOPugM,7214
|
|
80
|
+
palimpzest/utils/sandbox.py,sha256=Ge96gmzqeOGlNkMCG9A95_PB8wRQbvTFua136of8FcA,6465
|
|
81
|
+
palimpzest/utils/token_reduction_helpers.py,sha256=Ob95PcqCsbGLiBdQ-4YQsWGWRppb2hvQyt0gi1fzL-Y,3855
|
|
82
|
+
palimpzest/utils/udfs.py,sha256=LjHic54B1az-rKgNLur0wOpaz2ko_UodjLEJrazkxvY,1854
|
|
83
|
+
palimpzest-0.6.1.dist-info/LICENSE,sha256=5GUlHy9lr-Py9kvV38FF1m3yy3NqM18fefuE9wkWumo,1079
|
|
84
|
+
palimpzest-0.6.1.dist-info/METADATA,sha256=VxPI4-vfq3Fm3l3PjxTpdHGbDclIQNHo1Ag1enfAyMU,7837
|
|
85
|
+
palimpzest-0.6.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
86
|
+
palimpzest-0.6.1.dist-info/top_level.txt,sha256=raV06dJUgohefUn3ZyJS2uqp_Y76EOLA9Y2e_fxt8Ew,11
|
|
87
|
+
palimpzest-0.6.1.dist-info/RECORD,,
|
cli/README.md
DELETED
|
@@ -1,156 +0,0 @@
|
|
|
1
|
-
## Palimpzest CLI
|
|
2
|
-
Installing Palimpzest also installs its CLI tool `pz`, which provides users with basic utilities for creating and managing their own Palimpzest system. Running `pz --help` displays an overview of the CLI's commands:
|
|
3
|
-
```bash
|
|
4
|
-
$ pz --help
|
|
5
|
-
Usage: pz [OPTIONS] COMMAND [ARGS]...
|
|
6
|
-
|
|
7
|
-
The CLI tool for Palimpzest.
|
|
8
|
-
|
|
9
|
-
Options:
|
|
10
|
-
--help Show this message and exit.
|
|
11
|
-
|
|
12
|
-
Commands:
|
|
13
|
-
help (h) Print the help message for PZ.
|
|
14
|
-
init (i) Initialize data directory for PZ.
|
|
15
|
-
ls-data (ls,lsdata) Print a table listing the datasets
|
|
16
|
-
registered with PZ.
|
|
17
|
-
register-data (r,reg,register) Register a data file or data directory with
|
|
18
|
-
PZ.
|
|
19
|
-
rm-data (rm,rmdata) Remove a dataset that was registered with
|
|
20
|
-
PZ.
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
Users can initialize their own system by running `pz init`. This will create Palimpzest's working directory in `~/.palimpzest`:
|
|
24
|
-
```bash
|
|
25
|
-
$ pz init
|
|
26
|
-
Palimpzest system initialized in: /Users/matthewrusso/.palimpzest
|
|
27
|
-
```
|
|
28
|
-
|
|
29
|
-
If we list the set of datasets registered with Palimpzest, we'll see there currently are none:
|
|
30
|
-
```bash
|
|
31
|
-
$ pz ls
|
|
32
|
-
+------+------+------+
|
|
33
|
-
| Name | Type | Path |
|
|
34
|
-
+------+------+------+
|
|
35
|
-
+------+------+------+
|
|
36
|
-
|
|
37
|
-
Total datasets: 0
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
### Registering Datasets
|
|
41
|
-
To add (or "register") a dataset with Palimpzest, we can use the `pz register-data` command (also aliased as `pz reg`) to specify that a file or directory at a given `--path` should be registered as a dataset with the specified `--name`:
|
|
42
|
-
```bash
|
|
43
|
-
$ pz reg --path README.md --name rdme
|
|
44
|
-
Registered rdme
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
If we list Palimpzest's datasets again we will see that `README.md` has been registered under the dataset named `rdme`:
|
|
48
|
-
```bash
|
|
49
|
-
$ pz ls
|
|
50
|
-
+------+------+------------------------------------------+
|
|
51
|
-
| Name | Type | Path |
|
|
52
|
-
+------+------+------------------------------------------+
|
|
53
|
-
| rdme | file | /Users/matthewrusso/palimpzest/README.md |
|
|
54
|
-
+------+------+------------------------------------------+
|
|
55
|
-
|
|
56
|
-
Total datasets: 1
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
To remove a dataset from Palimpzest, simply use the `pz rm-data` command (also aliased as `pz rm`) and specify the `--name` of the dataset you would like to remove:
|
|
60
|
-
```bash
|
|
61
|
-
$ pz rm --name rdme
|
|
62
|
-
Deleted rdme
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
Finally, listing our datasets once more will show that the dataset has been deleted:
|
|
66
|
-
```bash
|
|
67
|
-
$ pz ls
|
|
68
|
-
+------+------+------+
|
|
69
|
-
| Name | Type | Path |
|
|
70
|
-
+------+------+------+
|
|
71
|
-
+------+------+------+
|
|
72
|
-
|
|
73
|
-
Total datasets: 0
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
### Cache Management
|
|
77
|
-
Palimpzest will cache intermediate results by default. It can be useful to remove them from the cache when trying to evaluate the performance improvement(s) of code changes. We provide a utility command `pz clear-cache` (also aliased as `pz clr`) to clear the cache:
|
|
78
|
-
```bash
|
|
79
|
-
$ pz clr
|
|
80
|
-
Cache cleared
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
### Config Management
|
|
84
|
-
You may wish to work with multiple configurations of Palimpzest in order to, e.g., evaluate the difference in performance between various LLM services for your data extraction task. To see the config Palimpzest is currently using, you can run the `pz print-config` command (also aliased as `pz config`):
|
|
85
|
-
```bash
|
|
86
|
-
$ pz config
|
|
87
|
-
--- default ---
|
|
88
|
-
filecachedir: /some/local/filepath
|
|
89
|
-
llmservice: openai
|
|
90
|
-
name: default
|
|
91
|
-
parallel: false
|
|
92
|
-
```
|
|
93
|
-
By default, Palimpzest uses the configuration named `default`. As shown above, if you run a script using Palimpzest out-of-the-box, it will use OpenAI endpoints for all of its API calls.
|
|
94
|
-
|
|
95
|
-
Now, let's say you wanted to try using [together.ai](https://www.together.ai/) for your API calls. You could do this by creating a new config with the `pz create-config` command (also aliased as `pz cc`):
|
|
96
|
-
```bash
|
|
97
|
-
$ pz cc --name together-conf --llmservice together --parallel True --set
|
|
98
|
-
Created and set config: together-conf
|
|
99
|
-
```
|
|
100
|
-
The `--name` parameter is required and specifies the unique name for your config. The `--llmservice` and `--parallel` options specify the service to use and whether or not to process files in parallel. Finally, if the `--set` flag is present, Palimpzest will update its current config to point to the newly created config.
|
|
101
|
-
|
|
102
|
-
We can confirm that Palimpzest checked out our new config by running `pz config`:
|
|
103
|
-
```bash
|
|
104
|
-
$ pz config
|
|
105
|
-
--- together-conf ---
|
|
106
|
-
filecachedir: /some/local/filepath
|
|
107
|
-
llmservice: together
|
|
108
|
-
name: together-conf
|
|
109
|
-
parallel: true
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
You can switch which config you are using at any time by using the `pz set-config` command (also aliased as `pz set`):
|
|
113
|
-
```bash
|
|
114
|
-
$ pz set --name default
|
|
115
|
-
Set config: default
|
|
116
|
-
|
|
117
|
-
$ pz config
|
|
118
|
-
--- default ---
|
|
119
|
-
filecachedir: /some/local/filepath
|
|
120
|
-
llmservice: openai
|
|
121
|
-
name: default
|
|
122
|
-
parallel: false
|
|
123
|
-
|
|
124
|
-
$ pz set --name together-conf
|
|
125
|
-
Set config: together-conf
|
|
126
|
-
|
|
127
|
-
$ pz config
|
|
128
|
-
--- together-conf ---
|
|
129
|
-
filecachedir: /some/local/filepath
|
|
130
|
-
llmservice: together
|
|
131
|
-
name: together-conf
|
|
132
|
-
parallel: true
|
|
133
|
-
```
|
|
134
|
-
|
|
135
|
-
You can update an existing config using the `pz update` command (also aliased as `pz uc`):
|
|
136
|
-
```bash
|
|
137
|
-
$ pz update --name default --settings parallel=true,pdfprocessor=pdfplumber
|
|
138
|
-
Updated config: default
|
|
139
|
-
|
|
140
|
-
$ pz config
|
|
141
|
-
--- default ---
|
|
142
|
-
filecachedir: /some/local/filepath
|
|
143
|
-
llmservice: anthropic
|
|
144
|
-
name: default
|
|
145
|
-
parallel: true
|
|
146
|
-
pdfprocessor: pdfplumber
|
|
147
|
-
```
|
|
148
|
-
|
|
149
|
-
The `--name` parameter specifies which config to update. `--settings` specifies all the parameter name and value pairs in the format `param_name=param_value`, separated by commas.
|
|
150
|
-
|
|
151
|
-
Finally, you can delete a config with the `pz rm-config` command (also aliased as `pz rmc`):
|
|
152
|
-
```bash
|
|
153
|
-
$ pz rmc --name together-conf
|
|
154
|
-
Deleted config: together-conf
|
|
155
|
-
```
|
|
156
|
-
Note that you cannot delete the `default` config, and if you delete the config that you currently have set, Palimpzest will set the current config to be `default`.
|
cli/__init__.py
DELETED
|
File without changes
|