palimpzest 0.8.7__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {palimpzest-0.8.7/src/palimpzest.egg-info → palimpzest-1.0.0}/PKG-INFO +26 -66
- {palimpzest-0.8.7 → palimpzest-1.0.0}/README.md +25 -65
- {palimpzest-0.8.7 → palimpzest-1.0.0}/pyproject.toml +2 -2
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/constants.py +13 -4
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/data/dataset.py +75 -5
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/elements/groupbysig.py +5 -1
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/elements/records.py +16 -7
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/lib/schemas.py +26 -3
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/models.py +4 -4
- palimpzest-1.0.0/src/palimpzest/prompts/aggregate_prompts.py +99 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/prompt_factory.py +162 -75
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/utils.py +38 -1
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/validator.py +24 -24
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/execution/execution_strategy.py +8 -8
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/execution/mab_execution_strategy.py +30 -11
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/execution/parallel_execution_strategy.py +31 -7
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/generators/generators.py +9 -7
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/__init__.py +10 -6
- palimpzest-1.0.0/src/palimpzest/query/operators/aggregate.py +666 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/convert.py +1 -1
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/join.py +279 -23
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/logical.py +36 -11
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/mixture_of_agents.py +3 -1
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/physical.py +5 -2
- palimpzest-0.8.7/src/palimpzest/query/operators/retrieve.py → palimpzest-1.0.0/src/palimpzest/query/operators/topk.py +10 -10
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/__init__.py +11 -3
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/cost_model.py +5 -5
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/optimizer.py +3 -2
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/plan.py +2 -3
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/rules.py +73 -13
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/tasks.py +4 -4
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/utils/progress.py +19 -17
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/validator/validator.py +7 -7
- {palimpzest-0.8.7 → palimpzest-1.0.0/src/palimpzest.egg-info}/PKG-INFO +26 -66
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest.egg-info/SOURCES.txt +2 -1
- palimpzest-0.8.7/src/palimpzest/query/operators/aggregate.py +0 -282
- {palimpzest-0.8.7 → palimpzest-1.0.0}/LICENSE +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/setup.cfg +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/agents/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/agents/compute_agents.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/agents/search_agents.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/data/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/data/context.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/data/context_manager.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/data/index_dataset.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/data/iter_dataset.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/elements/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/elements/filters.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/core/lib/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/policy.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/agent_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/context_search.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/convert_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/critique_and_refine_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/filter_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/join_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/moa_aggregator_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/moa_proposer_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/split_merge_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/prompts/split_proposer_prompts.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/execution/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/generators/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/compute.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/critique_and_refine.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/distinct.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/filter.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/limit.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/project.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/rag.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/scan.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/search.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/operators/split.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/primitives.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/processor/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/processor/config.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/processor/query_processor.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/query/processor/query_processor_factory.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/schemabuilder/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/tools/README.md +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/tools/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/tools/allenpdf.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/tools/pdfparser.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/tools/skema_tools.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/utils/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/utils/env_helpers.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/utils/hash_helpers.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/utils/model_helpers.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/utils/udfs.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest/validator/__init__.py +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest.egg-info/dependency_links.txt +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest.egg-info/requires.txt +0 -0
- {palimpzest-0.8.7 → palimpzest-1.0.0}/src/palimpzest.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: palimpzest
|
|
3
|
-
Version: 0.8.7
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
|
|
5
5
|
Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
|
|
6
6
|
Project-URL: homepage, https://palimpzest.org
|
|
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
|
|
|
12
12
|
Classifier: License :: OSI Approved :: MIT License
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
-
Requires-Python: >=3.
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
License-File: LICENSE
|
|
18
18
|
Requires-Dist: anthropic>=0.55.0
|
|
@@ -59,15 +59,20 @@ Dynamic: license-file
|
|
|
59
59
|
<!-- [](https://arxiv.org/pdf/2405.14696) -->
|
|
60
60
|
<!-- [](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
|
|
61
61
|
|
|
62
|
-
## Learn How to Use PZ
|
|
63
|
-
Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
|
|
62
|
+
## 📚 Learn How to Use PZ
|
|
63
|
+
Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.
|
|
64
64
|
|
|
65
|
-
## Getting started
|
|
65
|
+
## 🚀 Getting started
|
|
66
66
|
You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
|
|
67
67
|
```bash
|
|
68
68
|
$ pip install palimpzest
|
|
69
69
|
```
|
|
70
70
|
|
|
71
|
+
You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
|
|
72
|
+
```bash
|
|
73
|
+
$ uv pip install palimpzest
|
|
74
|
+
```
|
|
75
|
+
|
|
71
76
|
Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
|
|
72
77
|
```bash
|
|
73
78
|
$ git clone git@github.com:mitdbg/palimpzest.git
|
|
@@ -75,7 +80,7 @@ $ cd palimpzest
|
|
|
75
80
|
$ pip install .
|
|
76
81
|
```
|
|
77
82
|
|
|
78
|
-
## Join the PZ Community
|
|
83
|
+
## 🙋🏽 Join the PZ Community
|
|
79
84
|
We are actively hacking on PZ and would love to have you join our community [](https://discord.gg/dN85JJ6jaH)
|
|
80
85
|
|
|
81
86
|
[Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
|
|
@@ -86,66 +91,8 @@ We are actively hacking on PZ and would love to have you join our community [![D
|
|
|
86
91
|
|
|
87
92
|
We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
|
|
88
93
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
To run the notebook, you can use the following command:
|
|
92
|
-
```bash
|
|
93
|
-
$ jupyter notebook
|
|
94
|
-
```
|
|
95
|
-
And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
|
|
96
|
-
|
|
97
|
-
### Even Quicker Start
|
|
98
|
-
For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
|
|
99
|
-
```python
|
|
100
|
-
import palimpzest as pz
|
|
101
|
-
|
|
102
|
-
# define the fields we wish to compute
|
|
103
|
-
email_cols = [
|
|
104
|
-
{"name": "sender", "type": str, "desc": "The email address of the sender"},
|
|
105
|
-
{"name": "subject", "type": str, "desc": "The subject of the email"},
|
|
106
|
-
{"name": "date", "type": str, "desc": "The date the email was sent"},
|
|
107
|
-
]
|
|
108
|
-
|
|
109
|
-
# lazily construct the computation to get emails about holidays sent in July
|
|
110
|
-
dataset = pz.Dataset("testdata/enron-tiny/")
|
|
111
|
-
dataset = dataset.sem_add_columns(email_cols)
|
|
112
|
-
dataset = dataset.sem_filter("The email was sent in July")
|
|
113
|
-
dataset = dataset.sem_filter("The email is about holidays")
|
|
114
|
-
|
|
115
|
-
# execute the computation w/the MinCost policy
|
|
116
|
-
config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
|
|
117
|
-
output = dataset.run(config)
|
|
118
|
-
|
|
119
|
-
# display output (if using Jupyter, otherwise use print(output_df))
|
|
120
|
-
output_df = output.to_df(cols=["date", "sender", "subject"])
|
|
121
|
-
display(output_df)
|
|
122
|
-
```
|
|
123
|
-
|
|
124
|
-
## Python Demos
|
|
125
|
-
Below are simple instructions to run PZ on a test data set of enron emails that is included with the system.
|
|
126
|
-
|
|
127
|
-
### Downloading test data
|
|
128
|
-
To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
|
|
129
|
-
```
|
|
130
|
-
chmod +x testdata/download-testdata.sh
|
|
131
|
-
./testdata/download-testdata.sh
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
### Running the Demos
|
|
135
|
-
Set your OpenAI (or Together.ai) api key at the command line:
|
|
136
|
-
```bash
|
|
137
|
-
# set one (or both) of the following:
|
|
138
|
-
export OPENAI_API_KEY=<your-api-key>
|
|
139
|
-
export TOGETHER_API_KEY=<your-api-key>
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
Now you can run the simple test program with:
|
|
143
|
-
```bash
|
|
144
|
-
$ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
|
|
145
|
-
```
|
|
146
|
-
|
|
147
|
-
### Citation
|
|
148
|
-
If you would like to cite our work, please use the following citation:
|
|
94
|
+
### 📓 Citation
|
|
95
|
+
If you would like to cite our original paper on Palimpzest, please use the following citation:
|
|
149
96
|
```
|
|
150
97
|
@inproceedings{palimpzestCIDR,
|
|
151
98
|
title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
|
|
@@ -154,3 +101,16 @@ If you would like to cite our work, please use the following citation:
|
|
|
154
101
|
date = 2025,
|
|
155
102
|
}
|
|
156
103
|
```
|
|
104
|
+
|
|
105
|
+
If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
|
|
106
|
+
```
|
|
107
|
+
@misc{russo2025abacuscostbasedoptimizersemantic,
|
|
108
|
+
title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
|
|
109
|
+
author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
|
|
110
|
+
year={2025},
|
|
111
|
+
eprint={2505.14661},
|
|
112
|
+
archivePrefix={arXiv},
|
|
113
|
+
primaryClass={cs.DB},
|
|
114
|
+
url={https://arxiv.org/abs/2505.14661},
|
|
115
|
+
}
|
|
116
|
+
```
|
|
@@ -9,15 +9,20 @@
|
|
|
9
9
|
<!-- [](https://arxiv.org/pdf/2405.14696) -->
|
|
10
10
|
<!-- [](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
|
|
11
11
|
|
|
12
|
-
## Learn How to Use PZ
|
|
13
|
-
Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
|
|
12
|
+
## 📚 Learn How to Use PZ
|
|
13
|
+
Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.
|
|
14
14
|
|
|
15
|
-
## Getting started
|
|
15
|
+
## 🚀 Getting started
|
|
16
16
|
You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
|
|
17
17
|
```bash
|
|
18
18
|
$ pip install palimpzest
|
|
19
19
|
```
|
|
20
20
|
|
|
21
|
+
You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
|
|
22
|
+
```bash
|
|
23
|
+
$ uv pip install palimpzest
|
|
24
|
+
```
|
|
25
|
+
|
|
21
26
|
Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
|
|
22
27
|
```bash
|
|
23
28
|
$ git clone git@github.com:mitdbg/palimpzest.git
|
|
@@ -25,7 +30,7 @@ $ cd palimpzest
|
|
|
25
30
|
$ pip install .
|
|
26
31
|
```
|
|
27
32
|
|
|
28
|
-
## Join the PZ Community
|
|
33
|
+
## 🙋🏽 Join the PZ Community
|
|
29
34
|
We are actively hacking on PZ and would love to have you join our community [](https://discord.gg/dN85JJ6jaH)
|
|
30
35
|
|
|
31
36
|
[Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
|
|
@@ -36,66 +41,8 @@ We are actively hacking on PZ and would love to have you join our community [![D
|
|
|
36
41
|
|
|
37
42
|
We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
|
|
38
43
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
To run the notebook, you can use the following command:
|
|
42
|
-
```bash
|
|
43
|
-
$ jupyter notebook
|
|
44
|
-
```
|
|
45
|
-
And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
|
|
46
|
-
|
|
47
|
-
### Even Quicker Start
|
|
48
|
-
For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
|
|
49
|
-
```python
|
|
50
|
-
import palimpzest as pz
|
|
51
|
-
|
|
52
|
-
# define the fields we wish to compute
|
|
53
|
-
email_cols = [
|
|
54
|
-
{"name": "sender", "type": str, "desc": "The email address of the sender"},
|
|
55
|
-
{"name": "subject", "type": str, "desc": "The subject of the email"},
|
|
56
|
-
{"name": "date", "type": str, "desc": "The date the email was sent"},
|
|
57
|
-
]
|
|
58
|
-
|
|
59
|
-
# lazily construct the computation to get emails about holidays sent in July
|
|
60
|
-
dataset = pz.Dataset("testdata/enron-tiny/")
|
|
61
|
-
dataset = dataset.sem_add_columns(email_cols)
|
|
62
|
-
dataset = dataset.sem_filter("The email was sent in July")
|
|
63
|
-
dataset = dataset.sem_filter("The email is about holidays")
|
|
64
|
-
|
|
65
|
-
# execute the computation w/the MinCost policy
|
|
66
|
-
config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
|
|
67
|
-
output = dataset.run(config)
|
|
68
|
-
|
|
69
|
-
# display output (if using Jupyter, otherwise use print(output_df))
|
|
70
|
-
output_df = output.to_df(cols=["date", "sender", "subject"])
|
|
71
|
-
display(output_df)
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
## Python Demos
|
|
75
|
-
Below are simple instructions to run PZ on a test data set of enron emails that is included with the system.
|
|
76
|
-
|
|
77
|
-
### Downloading test data
|
|
78
|
-
To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
|
|
79
|
-
```
|
|
80
|
-
chmod +x testdata/download-testdata.sh
|
|
81
|
-
./testdata/download-testdata.sh
|
|
82
|
-
```
|
|
83
|
-
|
|
84
|
-
### Running the Demos
|
|
85
|
-
Set your OpenAI (or Together.ai) api key at the command line:
|
|
86
|
-
```bash
|
|
87
|
-
# set one (or both) of the following:
|
|
88
|
-
export OPENAI_API_KEY=<your-api-key>
|
|
89
|
-
export TOGETHER_API_KEY=<your-api-key>
|
|
90
|
-
```
|
|
91
|
-
|
|
92
|
-
Now you can run the simple test program with:
|
|
93
|
-
```bash
|
|
94
|
-
$ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
|
|
95
|
-
```
|
|
96
|
-
|
|
97
|
-
### Citation
|
|
98
|
-
If you would like to cite our work, please use the following citation:
|
|
44
|
+
### 📓 Citation
|
|
45
|
+
If you would like to cite our original paper on Palimpzest, please use the following citation:
|
|
99
46
|
```
|
|
100
47
|
@inproceedings{palimpzestCIDR,
|
|
101
48
|
title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
|
|
@@ -103,4 +50,17 @@ If you would like to cite our work, please use the following citation:
|
|
|
103
50
|
booktitle = {Proceedings of the {{Conference}} on {{Innovative Database Research}} ({{CIDR}})},
|
|
104
51
|
date = 2025,
|
|
105
52
|
}
|
|
106
|
-
```
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
|
|
56
|
+
```
|
|
57
|
+
@misc{russo2025abacuscostbasedoptimizersemantic,
|
|
58
|
+
title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
|
|
59
|
+
author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
|
|
60
|
+
year={2025},
|
|
61
|
+
eprint={2505.14661},
|
|
62
|
+
archivePrefix={arXiv},
|
|
63
|
+
primaryClass={cs.DB},
|
|
64
|
+
url={https://arxiv.org/abs/2505.14661},
|
|
65
|
+
}
|
|
66
|
+
```
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "palimpzest"
|
|
3
|
-
version = "0.8.7"
|
|
3
|
+
version = "1.0.0"
|
|
4
4
|
description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
|
|
5
5
|
readme = "README.md"
|
|
6
|
-
requires-python = ">=3.
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
7
|
keywords = ["relational", "optimization", "llm", "AI programming", "extraction", "tools", "document", "search", "integration"]
|
|
8
8
|
authors = [
|
|
9
9
|
{name="MIT DSG Semantic Management Lab", email="michjc@csail.mit.edu"},
|
|
@@ -136,13 +136,17 @@ class PromptStrategy(str, Enum):
|
|
|
136
136
|
performing some task with a specified Model.
|
|
137
137
|
"""
|
|
138
138
|
|
|
139
|
+
# aggregation prompt strategies
|
|
140
|
+
AGG = "aggregation"
|
|
141
|
+
AGG_NO_REASONING = "aggregation-no-reasoning"
|
|
142
|
+
|
|
139
143
|
# filter prompt strategies
|
|
140
144
|
FILTER = "filter"
|
|
141
145
|
FILTER_NO_REASONING = "filter-no-reasoning"
|
|
142
146
|
FILTER_CRITIC = "filter-critic"
|
|
143
147
|
FILTER_REFINE = "filter-refine"
|
|
144
148
|
FILTER_MOA_PROPOSER = "filter-mixture-of-agents-proposer"
|
|
145
|
-
FILTER_MOA_AGG = "filter-mixture-of-agents-
|
|
149
|
+
FILTER_MOA_AGG = "filter-mixture-of-agents-aggregator"
|
|
146
150
|
FILTER_SPLIT_PROPOSER = "filter-split-proposer"
|
|
147
151
|
FILTER_SPLIT_MERGER = "filter-split-merger"
|
|
148
152
|
|
|
@@ -156,10 +160,13 @@ class PromptStrategy(str, Enum):
|
|
|
156
160
|
MAP_CRITIC = "map-critic"
|
|
157
161
|
MAP_REFINE = "map-refine"
|
|
158
162
|
MAP_MOA_PROPOSER = "map-mixture-of-agents-proposer"
|
|
159
|
-
MAP_MOA_AGG = "map-mixture-of-agents-
|
|
163
|
+
MAP_MOA_AGG = "map-mixture-of-agents-aggregator"
|
|
160
164
|
MAP_SPLIT_PROPOSER = "map-split-proposer"
|
|
161
165
|
MAP_SPLIT_MERGER = "map-split-merger"
|
|
162
166
|
|
|
167
|
+
def is_agg_prompt(self):
|
|
168
|
+
return "aggregation" in self.value
|
|
169
|
+
|
|
163
170
|
def is_filter_prompt(self):
|
|
164
171
|
return "filter" in self.value
|
|
165
172
|
|
|
@@ -179,7 +186,7 @@ class PromptStrategy(str, Enum):
|
|
|
179
186
|
return "mixture-of-agents-proposer" in self.value
|
|
180
187
|
|
|
181
188
|
def is_moa_aggregator_prompt(self):
|
|
182
|
-
return "mixture-of-agents-
|
|
189
|
+
return "mixture-of-agents-aggregator" in self.value
|
|
183
190
|
|
|
184
191
|
def is_split_proposer_prompt(self):
|
|
185
192
|
return "split-proposer" in self.value
|
|
@@ -200,7 +207,9 @@ class Modality(str, Enum):
|
|
|
200
207
|
class AggFunc(str, Enum):
|
|
201
208
|
COUNT = "count"
|
|
202
209
|
AVERAGE = "average"
|
|
203
|
-
|
|
210
|
+
SUM = "sum"
|
|
211
|
+
MIN = "min"
|
|
212
|
+
MAX = "max"
|
|
204
213
|
|
|
205
214
|
class Cardinality(str, Enum):
|
|
206
215
|
ONE_TO_ONE = "one-to-one"
|
|
@@ -22,7 +22,7 @@ from palimpzest.query.operators.logical import (
|
|
|
22
22
|
LimitScan,
|
|
23
23
|
LogicalOperator,
|
|
24
24
|
Project,
|
|
25
|
-
|
|
25
|
+
TopKScan,
|
|
26
26
|
)
|
|
27
27
|
from palimpzest.query.processor.config import QueryProcessorConfig
|
|
28
28
|
from palimpzest.utils.hash_helpers import hash_for_serialized_dict
|
|
@@ -243,7 +243,30 @@ class Dataset:
|
|
|
243
243
|
id=self.id,
|
|
244
244
|
)
|
|
245
245
|
|
|
246
|
-
def
|
|
246
|
+
def join(self, other: Dataset, on: str | list[str], how: str = "inner") -> Dataset:
|
|
247
|
+
"""
|
|
248
|
+
Perform the specified join on the specified (list of) column(s)
|
|
249
|
+
"""
|
|
250
|
+
# enforce type for on
|
|
251
|
+
if isinstance(on, str):
|
|
252
|
+
on = [on]
|
|
253
|
+
|
|
254
|
+
# construct new output schema
|
|
255
|
+
combined_schema = union_schemas([self.schema, other.schema], join=True, on=on)
|
|
256
|
+
|
|
257
|
+
# construct logical operator
|
|
258
|
+
operator = JoinOp(
|
|
259
|
+
input_schema=combined_schema,
|
|
260
|
+
output_schema=combined_schema,
|
|
261
|
+
condition="",
|
|
262
|
+
on=on,
|
|
263
|
+
how=how,
|
|
264
|
+
depends_on=on,
|
|
265
|
+
)
|
|
266
|
+
|
|
267
|
+
return Dataset(sources=[self, other], operator=operator, schema=combined_schema)
|
|
268
|
+
|
|
269
|
+
def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None, how: str = "inner") -> Dataset:
|
|
247
270
|
"""
|
|
248
271
|
Perform a semantic (inner) join on the specified join predicate
|
|
249
272
|
"""
|
|
@@ -259,6 +282,7 @@ class Dataset:
|
|
|
259
282
|
input_schema=combined_schema,
|
|
260
283
|
output_schema=combined_schema,
|
|
261
284
|
condition=condition,
|
|
285
|
+
how=how,
|
|
262
286
|
desc=desc,
|
|
263
287
|
depends_on=depends_on,
|
|
264
288
|
)
|
|
@@ -346,7 +370,6 @@ class Dataset:
|
|
|
346
370
|
|
|
347
371
|
return Dataset(sources=[self], operator=operator, schema=new_output_schema)
|
|
348
372
|
|
|
349
|
-
|
|
350
373
|
def sem_add_columns(self, cols: list[dict] | type[BaseModel],
|
|
351
374
|
cardinality: Cardinality = Cardinality.ONE_TO_ONE,
|
|
352
375
|
desc: str | None = None,
|
|
@@ -534,12 +557,59 @@ class Dataset:
|
|
|
534
557
|
operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.AVERAGE)
|
|
535
558
|
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
536
559
|
|
|
560
|
+
def sum(self) -> Dataset:
|
|
561
|
+
"""Apply a summation to this set"""
|
|
562
|
+
operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.SUM)
|
|
563
|
+
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
564
|
+
|
|
565
|
+
def min(self) -> Dataset:
|
|
566
|
+
"""Apply an min operator to this set"""
|
|
567
|
+
operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MIN)
|
|
568
|
+
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
569
|
+
|
|
570
|
+
def max(self) -> Dataset:
|
|
571
|
+
"""Apply an max operator to this set"""
|
|
572
|
+
operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MAX)
|
|
573
|
+
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
574
|
+
|
|
537
575
|
def groupby(self, groupby: GroupBySig) -> Dataset:
|
|
538
576
|
output_schema = groupby.output_schema()
|
|
539
577
|
operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, group_by_sig=groupby)
|
|
540
578
|
return Dataset(sources=[self], operator=operator, schema=output_schema)
|
|
541
579
|
|
|
542
|
-
def
|
|
580
|
+
def sem_agg(self, col: dict | type[BaseModel], agg: str, depends_on: str | list[str] | None = None) -> Dataset:
|
|
581
|
+
"""
|
|
582
|
+
Apply a semantic aggregation to this set. The `agg` string will be applied using an LLM
|
|
583
|
+
over the entire set of inputs' fields specified in `depends_on` to generate the output `col`.
|
|
584
|
+
|
|
585
|
+
Example:
|
|
586
|
+
sem_agg(
|
|
587
|
+
col={'name': 'overall_sentiment', 'desc': 'The overall sentiment of the reviews', 'type': str},
|
|
588
|
+
agg="Compute the overall sentiment of the reviews as POSITIVE or NEGATIVE.",
|
|
589
|
+
depends_on="review_text",
|
|
590
|
+
)
|
|
591
|
+
"""
|
|
592
|
+
# construct new output schema
|
|
593
|
+
new_output_schema = None
|
|
594
|
+
if isinstance(col, dict):
|
|
595
|
+
col_schema = create_schema_from_fields([col])
|
|
596
|
+
new_output_schema = union_schemas([self.schema, col_schema])
|
|
597
|
+
elif issubclass(col, BaseModel):
|
|
598
|
+
assert len(col.model_fields) == 1, "For semantic aggregation, when passing a BaseModel to `col` it must have exactly one field."
|
|
599
|
+
new_output_schema = union_schemas([self.schema, col])
|
|
600
|
+
else:
|
|
601
|
+
raise ValueError("`col` must be a dictionary or a single-field BaseModel.")
|
|
602
|
+
|
|
603
|
+
# enforce type for depends_on
|
|
604
|
+
if isinstance(depends_on, str):
|
|
605
|
+
depends_on = [depends_on]
|
|
606
|
+
|
|
607
|
+
# construct logical operator
|
|
608
|
+
operator = Aggregate(input_schema=self.schema, output_schema=new_output_schema, agg_str=agg, depends_on=depends_on)
|
|
609
|
+
|
|
610
|
+
return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
|
|
611
|
+
|
|
612
|
+
def sem_topk(
|
|
543
613
|
self,
|
|
544
614
|
index: Collection,
|
|
545
615
|
search_attr: str,
|
|
@@ -566,7 +636,7 @@ class Dataset:
|
|
|
566
636
|
# index = index_factory(index)
|
|
567
637
|
|
|
568
638
|
# construct logical operator
|
|
569
|
-
operator =
|
|
639
|
+
operator = TopKScan(
|
|
570
640
|
input_schema=self.schema,
|
|
571
641
|
output_schema=new_output_schema,
|
|
572
642
|
index=index,
|
|
@@ -6,8 +6,11 @@ from pydantic import BaseModel
|
|
|
6
6
|
|
|
7
7
|
from palimpzest.core.lib.schemas import create_schema_from_fields
|
|
8
8
|
|
|
9
|
+
# TODO:
|
|
10
|
+
# - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator
|
|
11
|
+
# - construct the correct output schema using the input schema and the group by and aggregation fields
|
|
12
|
+
# - remove/update all other references to GroupBySig in the codebase
|
|
9
13
|
|
|
10
|
-
# TODO: need to rethink how group bys work
|
|
11
14
|
# signature for a group by aggregate that applies
|
|
12
15
|
# group and aggregation to an input tuple
|
|
13
16
|
class GroupBySig:
|
|
@@ -50,6 +53,7 @@ class GroupBySig:
|
|
|
50
53
|
ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")")
|
|
51
54
|
return ops
|
|
52
55
|
|
|
56
|
+
# TODO: output schema needs to account for input schema types and create new output schema types
|
|
53
57
|
def output_schema(self) -> type[BaseModel]:
|
|
54
58
|
# the output class varies depending on the group by, so here
|
|
55
59
|
# we dynamically construct this output
|
|
@@ -140,7 +140,7 @@ class DataRecord:
|
|
|
140
140
|
def schema(self) -> type[BaseModel]:
|
|
141
141
|
return type(self._data_item)
|
|
142
142
|
|
|
143
|
-
def copy(self):
|
|
143
|
+
def copy(self) -> DataRecord:
|
|
144
144
|
# get the set of fields to copy from the parent record
|
|
145
145
|
copy_field_names = [field.split(".")[-1] for field in self.get_field_names()]
|
|
146
146
|
|
|
@@ -228,18 +228,18 @@ class DataRecord:
|
|
|
228
228
|
@staticmethod
|
|
229
229
|
def from_join_parents(
|
|
230
230
|
schema: type[BaseModel],
|
|
231
|
-
left_parent_record: DataRecord,
|
|
232
|
-
right_parent_record: DataRecord,
|
|
231
|
+
left_parent_record: DataRecord | None,
|
|
232
|
+
right_parent_record: DataRecord | None,
|
|
233
233
|
project_cols: list[str] | None = None,
|
|
234
234
|
cardinality_idx: int = None,
|
|
235
235
|
) -> DataRecord:
|
|
236
236
|
# get the set of fields and field descriptions to copy from the parent record(s)
|
|
237
|
-
left_copy_field_names = (
|
|
237
|
+
left_copy_field_names = [] if left_parent_record is None else (
|
|
238
238
|
left_parent_record.get_field_names()
|
|
239
239
|
if project_cols is None
|
|
240
240
|
else [col for col in project_cols if col in left_parent_record.get_field_names()]
|
|
241
241
|
)
|
|
242
|
-
right_copy_field_names = (
|
|
242
|
+
right_copy_field_names = [] if right_parent_record is None else (
|
|
243
243
|
right_parent_record.get_field_names()
|
|
244
244
|
if project_cols is None
|
|
245
245
|
else [col for col in project_cols if col in right_parent_record.get_field_names()]
|
|
@@ -255,11 +255,20 @@ class DataRecord:
|
|
|
255
255
|
new_field_name = f"{field_name}_right"
|
|
256
256
|
data_item[new_field_name] = right_parent_record[field_name]
|
|
257
257
|
|
|
258
|
+
# for any missing fields in the schema, set them to None
|
|
259
|
+
for field_name in schema.model_fields:
|
|
260
|
+
if field_name not in data_item:
|
|
261
|
+
data_item[field_name] = None
|
|
262
|
+
|
|
258
263
|
# make new record which has left and right parent record as its parents
|
|
264
|
+
left_parent_source_indices = [] if left_parent_record is None else list(left_parent_record._source_indices)
|
|
265
|
+
right_parent_source_indices = [] if right_parent_record is None else list(right_parent_record._source_indices)
|
|
266
|
+
left_parent_record_id = [] if left_parent_record is None else [left_parent_record._id]
|
|
267
|
+
right_parent_record_id = [] if right_parent_record is None else [right_parent_record._id]
|
|
259
268
|
new_dr = DataRecord(
|
|
260
269
|
schema(**data_item),
|
|
261
|
-
source_indices=
|
|
262
|
-
parent_ids=
|
|
270
|
+
source_indices=left_parent_source_indices + right_parent_source_indices,
|
|
271
|
+
parent_ids=left_parent_record_id + right_parent_record_id,
|
|
263
272
|
cardinality_idx=cardinality_idx,
|
|
264
273
|
)
|
|
265
274
|
|
|
@@ -142,16 +142,30 @@ def create_schema_from_df(df: pd.DataFrame) -> type[BaseModel]:
|
|
|
142
142
|
return _create_pickleable_model(fields)
|
|
143
143
|
|
|
144
144
|
|
|
145
|
-
def union_schemas(models: list[type[BaseModel]], join: bool = False) -> type[BaseModel]:
|
|
145
|
+
def union_schemas(models: list[type[BaseModel]], join: bool = False, on: list[str] | None = None) -> type[BaseModel]:
|
|
146
146
|
"""Union multiple Pydantic models into a single model."""
|
|
147
|
+
# convert on to empty list if None
|
|
148
|
+
if on is None:
|
|
149
|
+
on = []
|
|
150
|
+
|
|
151
|
+
# build up the fields for the new schema
|
|
147
152
|
fields = {}
|
|
148
153
|
for model in models:
|
|
149
154
|
for field_name, field in model.model_fields.items():
|
|
150
|
-
|
|
155
|
+
# for non-join unions, make sure duplicate fields have the same type
|
|
156
|
+
if not join and field_name in fields:
|
|
151
157
|
assert fields[field_name][0] == field.annotation, f"Field {field_name} has different types in different models"
|
|
152
|
-
|
|
158
|
+
|
|
159
|
+
# for joins with "on" specified, no need to rename fields in "on"
|
|
160
|
+
elif join and field_name in on and field_name in fields:
|
|
161
|
+
continue
|
|
162
|
+
|
|
163
|
+
# otherwise, rename duplicate fields by appending _right
|
|
164
|
+
elif join and field_name in fields:
|
|
153
165
|
while field_name in fields:
|
|
154
166
|
field_name = f"{field_name}_right"
|
|
167
|
+
|
|
168
|
+
# add the field to the new schema
|
|
155
169
|
fields[field_name] = (field.annotation, field)
|
|
156
170
|
|
|
157
171
|
# create and return the new schema
|
|
@@ -194,6 +208,15 @@ class Average(BaseModel):
|
|
|
194
208
|
class Count(BaseModel):
|
|
195
209
|
count: int = Field(description="The count of items in the dataset")
|
|
196
210
|
|
|
211
|
+
class Sum(BaseModel):
|
|
212
|
+
sum: int = Field(description="The summation of items in the dataset")
|
|
213
|
+
|
|
214
|
+
class Min(BaseModel):
|
|
215
|
+
min: int | float = Field(description="The minimum value of some items in the dataset")
|
|
216
|
+
|
|
217
|
+
class Max(BaseModel):
|
|
218
|
+
max: int | float = Field(description="The maximum value of some items in the dataset")
|
|
219
|
+
|
|
197
220
|
class OperatorDerivedSchema(BaseModel):
|
|
198
221
|
"""Schema defined by an operator, e.g., a join or a group by"""
|
|
199
222
|
|
|
@@ -51,10 +51,10 @@ class GenerationStats(BaseModel):
|
|
|
51
51
|
fn_call_duration_secs: float = 0.0
|
|
52
52
|
|
|
53
53
|
# (if applicable) the total number of LLM calls made by this operator
|
|
54
|
-
total_llm_calls:
|
|
54
|
+
total_llm_calls: float = 0
|
|
55
55
|
|
|
56
56
|
# (if applicable) the total number of embedding LLM calls made by this operator
|
|
57
|
-
total_embedding_llm_calls:
|
|
57
|
+
total_embedding_llm_calls: float = 0
|
|
58
58
|
|
|
59
59
|
def __iadd__(self, other: GenerationStats) -> GenerationStats:
|
|
60
60
|
# self.raw_answers.extend(other.raw_answers)
|
|
@@ -243,10 +243,10 @@ class RecordOpStats(BaseModel):
|
|
|
243
243
|
fn_call_duration_secs: float = 0.0
|
|
244
244
|
|
|
245
245
|
# (if applicable) the total number of LLM calls made by this operator
|
|
246
|
-
total_llm_calls:
|
|
246
|
+
total_llm_calls: float = 0
|
|
247
247
|
|
|
248
248
|
# (if applicable) the total number of embedding LLM calls made by this operator
|
|
249
|
-
total_embedding_llm_calls:
|
|
249
|
+
total_embedding_llm_calls: float = 0
|
|
250
250
|
|
|
251
251
|
# (if applicable) a boolean indicating whether this is the statistics captured from a failed convert operation
|
|
252
252
|
failed_convert: bool | None = None
|