palimpzest 0.9.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. {palimpzest-0.9.0/src/palimpzest.egg-info → palimpzest-1.0.0}/PKG-INFO +26 -66
  2. {palimpzest-0.9.0 → palimpzest-1.0.0}/README.md +25 -65
  3. {palimpzest-0.9.0 → palimpzest-1.0.0}/pyproject.toml +2 -2
  4. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/constants.py +1 -0
  5. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/data/dataset.py +33 -5
  6. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/elements/groupbysig.py +5 -1
  7. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/elements/records.py +16 -7
  8. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/lib/schemas.py +20 -3
  9. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/models.py +4 -4
  10. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  11. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/execution/execution_strategy.py +8 -8
  12. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/execution/mab_execution_strategy.py +30 -11
  13. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/execution/parallel_execution_strategy.py +31 -7
  14. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
  15. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/__init__.py +7 -6
  16. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/aggregate.py +110 -5
  17. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/convert.py +1 -1
  18. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/join.py +279 -23
  19. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/logical.py +20 -8
  20. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/mixture_of_agents.py +3 -1
  21. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/physical.py +5 -2
  22. palimpzest-0.9.0/src/palimpzest/query/operators/retrieve.py → palimpzest-1.0.0/src/palimpzest/query/operators/topk.py +10 -10
  23. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/__init__.py +7 -3
  24. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/cost_model.py +5 -5
  25. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/optimizer.py +3 -2
  26. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/plan.py +2 -3
  27. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/rules.py +31 -11
  28. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/tasks.py +4 -4
  29. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/utils/progress.py +19 -17
  30. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/validator/validator.py +7 -7
  31. {palimpzest-0.9.0 → palimpzest-1.0.0/src/palimpzest.egg-info}/PKG-INFO +26 -66
  32. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest.egg-info/SOURCES.txt +1 -1
  33. {palimpzest-0.9.0 → palimpzest-1.0.0}/LICENSE +0 -0
  34. {palimpzest-0.9.0 → palimpzest-1.0.0}/setup.cfg +0 -0
  35. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/__init__.py +0 -0
  36. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/agents/__init__.py +0 -0
  37. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/agents/compute_agents.py +0 -0
  38. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/agents/search_agents.py +0 -0
  39. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/__init__.py +0 -0
  40. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/data/__init__.py +0 -0
  41. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/data/context.py +0 -0
  42. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/data/context_manager.py +0 -0
  43. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/data/index_dataset.py +0 -0
  44. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/data/iter_dataset.py +0 -0
  45. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/elements/__init__.py +0 -0
  46. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/elements/filters.py +0 -0
  47. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/core/lib/__init__.py +0 -0
  48. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/policy.py +0 -0
  49. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/__init__.py +0 -0
  50. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/agent_prompts.py +0 -0
  51. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/aggregate_prompts.py +0 -0
  52. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/context_search.py +0 -0
  53. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/convert_prompts.py +0 -0
  54. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/critique_and_refine_prompts.py +0 -0
  55. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/filter_prompts.py +0 -0
  56. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/join_prompts.py +0 -0
  57. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/moa_aggregator_prompts.py +0 -0
  58. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/moa_proposer_prompts.py +0 -0
  59. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/prompt_factory.py +0 -0
  60. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/split_merge_prompts.py +0 -0
  61. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/split_proposer_prompts.py +0 -0
  62. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/utils.py +0 -0
  63. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/prompts/validator.py +0 -0
  64. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/__init__.py +0 -0
  65. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/execution/__init__.py +0 -0
  66. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
  67. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/generators/__init__.py +0 -0
  68. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/generators/generators.py +0 -0
  69. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/compute.py +0 -0
  70. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/critique_and_refine.py +0 -0
  71. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/distinct.py +0 -0
  72. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/filter.py +0 -0
  73. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/limit.py +0 -0
  74. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/project.py +0 -0
  75. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/rag.py +0 -0
  76. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/scan.py +0 -0
  77. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/search.py +0 -0
  78. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/operators/split.py +0 -0
  79. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
  80. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
  81. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/optimizer/primitives.py +0 -0
  82. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/processor/__init__.py +0 -0
  83. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/processor/config.py +0 -0
  84. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/processor/query_processor.py +0 -0
  85. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/query/processor/query_processor_factory.py +0 -0
  86. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/schemabuilder/__init__.py +0 -0
  87. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
  88. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/tools/README.md +0 -0
  89. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/tools/__init__.py +0 -0
  90. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/tools/allenpdf.py +0 -0
  91. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/tools/pdfparser.py +0 -0
  92. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/tools/skema_tools.py +0 -0
  93. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/utils/__init__.py +0 -0
  94. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/utils/env_helpers.py +0 -0
  95. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/utils/hash_helpers.py +0 -0
  96. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/utils/model_helpers.py +0 -0
  97. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/utils/udfs.py +0 -0
  98. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest/validator/__init__.py +0 -0
  99. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest.egg-info/dependency_links.txt +0 -0
  100. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest.egg-info/requires.txt +0 -0
  101. {palimpzest-0.9.0 → palimpzest-1.0.0}/src/palimpzest.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: palimpzest
3
- Version: 0.9.0
3
+ Version: 1.0.0
4
4
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
5
5
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
6
6
  Project-URL: homepage, https://palimpzest.org
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
13
13
  Classifier: Programming Language :: Python :: 3
14
14
  Classifier: Programming Language :: Python :: 3.8
15
- Requires-Python: >=3.10
15
+ Requires-Python: >=3.12
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: anthropic>=0.55.0
@@ -59,15 +59,20 @@ Dynamic: license-file
59
59
  <!-- [![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b?logo=arxiv)](https://arxiv.org/pdf/2405.14696) -->
60
60
  <!-- [![Video](https://img.shields.io/badge/YouTube-Talk-red?logo=youtube)](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
61
61
 
62
- ## Learn How to Use PZ
63
- Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
62
+ ## 📚 Learn How to Use PZ
63
+ Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.
64
64
 
65
- ## Getting started
65
+ ## 🚀 Getting started
66
66
  You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
67
67
  ```bash
68
68
  $ pip install palimpzest
69
69
  ```
70
70
 
71
+ You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
72
+ ```bash
73
+ $ uv pip install palimpzest
74
+ ```
75
+
71
76
  Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
72
77
  ```bash
73
78
  $ git clone git@github.com:mitdbg/palimpzest.git
@@ -75,7 +80,7 @@ $ cd palimpzest
75
80
  $ pip install .
76
81
  ```
77
82
 
78
- ## Join the PZ Community
83
+ ## 🙋🏽 Join the PZ Community
79
84
  We are actively hacking on PZ and would love to have you join our community [![Discord](https://img.shields.io/discord/1245561987480420445?logo=discord)](https://discord.gg/dN85JJ6jaH)
80
85
 
81
86
  [Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
@@ -86,66 +91,8 @@ We are actively hacking on PZ and would love to have you join our community [![D
86
91
 
87
92
  We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
88
93
 
89
- ## Quick Start
90
- The easiest way to get started with Palimpzest is to run the `quickstart.ipynb` jupyter notebook. We demonstrate the full workflow of working with PZ, including registering a dataset, composing and executing a pipeline, and accessing the results.
91
- To run the notebook, you can use the following command:
92
- ```bash
93
- $ jupyter notebook
94
- ```
95
- And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
96
-
97
- ### Even Quicker Start
98
- For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
99
- ```python
100
- import palimpzest as pz
101
-
102
- # define the fields we wish to compute
103
- email_cols = [
104
- {"name": "sender", "type": str, "desc": "The email address of the sender"},
105
- {"name": "subject", "type": str, "desc": "The subject of the email"},
106
- {"name": "date", "type": str, "desc": "The date the email was sent"},
107
- ]
108
-
109
- # lazily construct the computation to get emails about holidays sent in July
110
- dataset = pz.Dataset("testdata/enron-tiny/")
111
- dataset = dataset.sem_add_columns(email_cols)
112
- dataset = dataset.sem_filter("The email was sent in July")
113
- dataset = dataset.sem_filter("The email is about holidays")
114
-
115
- # execute the computation w/the MinCost policy
116
- config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
117
- output = dataset.run(config)
118
-
119
- # display output (if using Jupyter, otherwise use print(output_df))
120
- output_df = output.to_df(cols=["date", "sender", "subject"])
121
- display(output_df)
122
- ```
123
-
124
- ## Python Demos
125
- Below are simple instructions to run PZ on a test data set of enron emails that is included with the system.
126
-
127
- ### Downloading test data
128
- To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
129
- ```
130
- chmod +x testdata/download-testdata.sh
131
- ./testdata/download-testdata.sh
132
- ```
133
-
134
- ### Running the Demos
135
- Set your OpenAI (or Together.ai) api key at the command line:
136
- ```bash
137
- # set one (or both) of the following:
138
- export OPENAI_API_KEY=<your-api-key>
139
- export TOGETHER_API_KEY=<your-api-key>
140
- ```
141
-
142
- Now you can run the simple test program with:
143
- ```bash
144
- $ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
145
- ```
146
-
147
- ### Citation
148
- If you would like to cite our work, please use the following citation:
94
+ ### 📓 Citation
95
+ If you would like to cite our original paper on Palimpzest, please use the following citation:
149
96
  ```
150
97
  @inproceedings{palimpzestCIDR,
151
98
  title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
@@ -154,3 +101,16 @@ If you would like to cite our work, please use the following citation:
154
101
  date = 2025,
155
102
  }
156
103
  ```
104
+
105
+ If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
106
+ ```
107
+ @misc{russo2025abacuscostbasedoptimizersemantic,
108
+ title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
109
+ author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
110
+ year={2025},
111
+ eprint={2505.14661},
112
+ archivePrefix={arXiv},
113
+ primaryClass={cs.DB},
114
+ url={https://arxiv.org/abs/2505.14661},
115
+ }
116
+ ```
@@ -9,15 +9,20 @@
9
9
  <!-- [![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b?logo=arxiv)](https://arxiv.org/pdf/2405.14696) -->
10
10
  <!-- [![Video](https://img.shields.io/badge/YouTube-Talk-red?logo=youtube)](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
11
11
 
12
- ## Learn How to Use PZ
13
- Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
12
+ ## 📚 Learn How to Use PZ
13
+ Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.
14
14
 
15
- ## Getting started
15
+ ## 🚀 Getting started
16
16
  You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
17
17
  ```bash
18
18
  $ pip install palimpzest
19
19
  ```
20
20
 
21
+ You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
22
+ ```bash
23
+ $ uv pip install palimpzest
24
+ ```
25
+
21
26
  Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
22
27
  ```bash
23
28
  $ git clone git@github.com:mitdbg/palimpzest.git
@@ -25,7 +30,7 @@ $ cd palimpzest
25
30
  $ pip install .
26
31
  ```
27
32
 
28
- ## Join the PZ Community
33
+ ## 🙋🏽 Join the PZ Community
29
34
  We are actively hacking on PZ and would love to have you join our community [![Discord](https://img.shields.io/discord/1245561987480420445?logo=discord)](https://discord.gg/dN85JJ6jaH)
30
35
 
31
36
  [Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
@@ -36,66 +41,8 @@ We are actively hacking on PZ and would love to have you join our community [![D
36
41
 
37
42
  We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
38
43
 
39
- ## Quick Start
40
- The easiest way to get started with Palimpzest is to run the `quickstart.ipynb` jupyter notebook. We demonstrate the full workflow of working with PZ, including registering a dataset, composing and executing a pipeline, and accessing the results.
41
- To run the notebook, you can use the following command:
42
- ```bash
43
- $ jupyter notebook
44
- ```
45
- And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
46
-
47
- ### Even Quicker Start
48
- For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
49
- ```python
50
- import palimpzest as pz
51
-
52
- # define the fields we wish to compute
53
- email_cols = [
54
- {"name": "sender", "type": str, "desc": "The email address of the sender"},
55
- {"name": "subject", "type": str, "desc": "The subject of the email"},
56
- {"name": "date", "type": str, "desc": "The date the email was sent"},
57
- ]
58
-
59
- # lazily construct the computation to get emails about holidays sent in July
60
- dataset = pz.Dataset("testdata/enron-tiny/")
61
- dataset = dataset.sem_add_columns(email_cols)
62
- dataset = dataset.sem_filter("The email was sent in July")
63
- dataset = dataset.sem_filter("The email is about holidays")
64
-
65
- # execute the computation w/the MinCost policy
66
- config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
67
- output = dataset.run(config)
68
-
69
- # display output (if using Jupyter, otherwise use print(output_df))
70
- output_df = output.to_df(cols=["date", "sender", "subject"])
71
- display(output_df)
72
- ```
73
-
74
- ## Python Demos
75
- Below are simple instructions to run PZ on a test data set of enron emails that is included with the system.
76
-
77
- ### Downloading test data
78
- To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
79
- ```
80
- chmod +x testdata/download-testdata.sh
81
- ./testdata/download-testdata.sh
82
- ```
83
-
84
- ### Running the Demos
85
- Set your OpenAI (or Together.ai) api key at the command line:
86
- ```bash
87
- # set one (or both) of the following:
88
- export OPENAI_API_KEY=<your-api-key>
89
- export TOGETHER_API_KEY=<your-api-key>
90
- ```
91
-
92
- Now you can run the simple test program with:
93
- ```bash
94
- $ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
95
- ```
96
-
97
- ### Citation
98
- If you would like to cite our work, please use the following citation:
44
+ ### 📓 Citation
45
+ If you would like to cite our original paper on Palimpzest, please use the following citation:
99
46
  ```
100
47
  @inproceedings{palimpzestCIDR,
101
48
  title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
@@ -103,4 +50,17 @@ If you would like to cite our work, please use the following citation:
103
50
  booktitle = {Proceedings of the {{Conference}} on {{Innovative Database Research}} ({{CIDR}})},
104
51
  date = 2025,
105
52
  }
106
- ```
53
+ ```
54
+
55
+ If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
56
+ ```
57
+ @misc{russo2025abacuscostbasedoptimizersemantic,
58
+ title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
59
+ author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
60
+ year={2025},
61
+ eprint={2505.14661},
62
+ archivePrefix={arXiv},
63
+ primaryClass={cs.DB},
64
+ url={https://arxiv.org/abs/2505.14661},
65
+ }
66
+ ```
@@ -1,9 +1,9 @@
1
1
  [project]
2
2
  name = "palimpzest"
3
- version = "0.9.0"
3
+ version = "1.0.0"
4
4
  description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
5
5
  readme = "README.md"
6
- requires-python = ">=3.10"
6
+ requires-python = ">=3.12"
7
7
  keywords = ["relational", "optimization", "llm", "AI programming", "extraction", "tools", "document", "search", "integration"]
8
8
  authors = [
9
9
  {name="MIT DSG Semantic Management Lab", email="michjc@csail.mit.edu"},
@@ -207,6 +207,7 @@ class Modality(str, Enum):
207
207
  class AggFunc(str, Enum):
208
208
  COUNT = "count"
209
209
  AVERAGE = "average"
210
+ SUM = "sum"
210
211
  MIN = "min"
211
212
  MAX = "max"
212
213
 
@@ -22,7 +22,7 @@ from palimpzest.query.operators.logical import (
22
22
  LimitScan,
23
23
  LogicalOperator,
24
24
  Project,
25
- RetrieveScan,
25
+ TopKScan,
26
26
  )
27
27
  from palimpzest.query.processor.config import QueryProcessorConfig
28
28
  from palimpzest.utils.hash_helpers import hash_for_serialized_dict
@@ -243,7 +243,30 @@ class Dataset:
243
243
  id=self.id,
244
244
  )
245
245
 
246
- def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
246
+ def join(self, other: Dataset, on: str | list[str], how: str = "inner") -> Dataset:
247
+ """
248
+ Perform the specified join on the specified (list of) column(s)
249
+ """
250
+ # enforce type for on
251
+ if isinstance(on, str):
252
+ on = [on]
253
+
254
+ # construct new output schema
255
+ combined_schema = union_schemas([self.schema, other.schema], join=True, on=on)
256
+
257
+ # construct logical operator
258
+ operator = JoinOp(
259
+ input_schema=combined_schema,
260
+ output_schema=combined_schema,
261
+ condition="",
262
+ on=on,
263
+ how=how,
264
+ depends_on=on,
265
+ )
266
+
267
+ return Dataset(sources=[self, other], operator=operator, schema=combined_schema)
268
+
269
+ def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None, how: str = "inner") -> Dataset:
247
270
  """
248
271
  Perform a semantic (inner) join on the specified join predicate
249
272
  """
@@ -259,6 +282,7 @@ class Dataset:
259
282
  input_schema=combined_schema,
260
283
  output_schema=combined_schema,
261
284
  condition=condition,
285
+ how=how,
262
286
  desc=desc,
263
287
  depends_on=depends_on,
264
288
  )
@@ -346,7 +370,6 @@ class Dataset:
346
370
 
347
371
  return Dataset(sources=[self], operator=operator, schema=new_output_schema)
348
372
 
349
-
350
373
  def sem_add_columns(self, cols: list[dict] | type[BaseModel],
351
374
  cardinality: Cardinality = Cardinality.ONE_TO_ONE,
352
375
  desc: str | None = None,
@@ -534,6 +557,11 @@ class Dataset:
534
557
  operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.AVERAGE)
535
558
  return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
536
559
 
560
+ def sum(self) -> Dataset:
561
+ """Apply a summation to this set"""
562
+ operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.SUM)
563
+ return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
564
+
537
565
  def min(self) -> Dataset:
538
566
  """Apply an min operator to this set"""
539
567
  operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MIN)
@@ -581,7 +609,7 @@ class Dataset:
581
609
 
582
610
  return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
583
611
 
584
- def retrieve(
612
+ def sem_topk(
585
613
  self,
586
614
  index: Collection,
587
615
  search_attr: str,
@@ -608,7 +636,7 @@ class Dataset:
608
636
  # index = index_factory(index)
609
637
 
610
638
  # construct logical operator
611
- operator = RetrieveScan(
639
+ operator = TopKScan(
612
640
  input_schema=self.schema,
613
641
  output_schema=new_output_schema,
614
642
  index=index,
@@ -6,8 +6,11 @@ from pydantic import BaseModel
6
6
 
7
7
  from palimpzest.core.lib.schemas import create_schema_from_fields
8
8
 
9
+ # TODO:
10
+ # - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator
11
+ # - construct the correct output schema using the input schema and the group by and aggregation fields
12
+ # - remove/update all other references to GroupBySig in the codebase
9
13
 
10
- # TODO: need to rethink how group bys work
11
14
  # signature for a group by aggregate that applies
12
15
  # group and aggregation to an input tuple
13
16
  class GroupBySig:
@@ -50,6 +53,7 @@ class GroupBySig:
50
53
  ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")")
51
54
  return ops
52
55
 
56
+ # TODO: output schema needs to account for input schema types and create new output schema types
53
57
  def output_schema(self) -> type[BaseModel]:
54
58
  # the output class varies depending on the group by, so here
55
59
  # we dynamically construct this output
@@ -140,7 +140,7 @@ class DataRecord:
140
140
  def schema(self) -> type[BaseModel]:
141
141
  return type(self._data_item)
142
142
 
143
- def copy(self):
143
+ def copy(self) -> DataRecord:
144
144
  # get the set of fields to copy from the parent record
145
145
  copy_field_names = [field.split(".")[-1] for field in self.get_field_names()]
146
146
 
@@ -228,18 +228,18 @@ class DataRecord:
228
228
  @staticmethod
229
229
  def from_join_parents(
230
230
  schema: type[BaseModel],
231
- left_parent_record: DataRecord,
232
- right_parent_record: DataRecord,
231
+ left_parent_record: DataRecord | None,
232
+ right_parent_record: DataRecord | None,
233
233
  project_cols: list[str] | None = None,
234
234
  cardinality_idx: int = None,
235
235
  ) -> DataRecord:
236
236
  # get the set of fields and field descriptions to copy from the parent record(s)
237
- left_copy_field_names = (
237
+ left_copy_field_names = [] if left_parent_record is None else (
238
238
  left_parent_record.get_field_names()
239
239
  if project_cols is None
240
240
  else [col for col in project_cols if col in left_parent_record.get_field_names()]
241
241
  )
242
- right_copy_field_names = (
242
+ right_copy_field_names = [] if right_parent_record is None else (
243
243
  right_parent_record.get_field_names()
244
244
  if project_cols is None
245
245
  else [col for col in project_cols if col in right_parent_record.get_field_names()]
@@ -255,11 +255,20 @@ class DataRecord:
255
255
  new_field_name = f"{field_name}_right"
256
256
  data_item[new_field_name] = right_parent_record[field_name]
257
257
 
258
+ # for any missing fields in the schema, set them to None
259
+ for field_name in schema.model_fields:
260
+ if field_name not in data_item:
261
+ data_item[field_name] = None
262
+
258
263
  # make new record which has left and right parent record as its parents
264
+ left_parent_source_indices = [] if left_parent_record is None else list(left_parent_record._source_indices)
265
+ right_parent_source_indices = [] if right_parent_record is None else list(right_parent_record._source_indices)
266
+ left_parent_record_id = [] if left_parent_record is None else [left_parent_record._id]
267
+ right_parent_record_id = [] if right_parent_record is None else [right_parent_record._id]
259
268
  new_dr = DataRecord(
260
269
  schema(**data_item),
261
- source_indices=list(left_parent_record._source_indices) + list(right_parent_record._source_indices),
262
- parent_ids=[left_parent_record._id, right_parent_record._id],
270
+ source_indices=left_parent_source_indices + right_parent_source_indices,
271
+ parent_ids=left_parent_record_id + right_parent_record_id,
263
272
  cardinality_idx=cardinality_idx,
264
273
  )
265
274
 
@@ -142,16 +142,30 @@ def create_schema_from_df(df: pd.DataFrame) -> type[BaseModel]:
142
142
  return _create_pickleable_model(fields)
143
143
 
144
144
 
145
- def union_schemas(models: list[type[BaseModel]], join: bool = False) -> type[BaseModel]:
145
+ def union_schemas(models: list[type[BaseModel]], join: bool = False, on: list[str] | None = None) -> type[BaseModel]:
146
146
  """Union multiple Pydantic models into a single model."""
147
+ # convert on to empty list if None
148
+ if on is None:
149
+ on = []
150
+
151
+ # build up the fields for the new schema
147
152
  fields = {}
148
153
  for model in models:
149
154
  for field_name, field in model.model_fields.items():
150
- if field_name in fields and not join:
155
+ # for non-join unions, make sure duplicate fields have the same type
156
+ if not join and field_name in fields:
151
157
  assert fields[field_name][0] == field.annotation, f"Field {field_name} has different types in different models"
152
- elif field_name in fields and join:
158
+
159
+ # for joins with "on" specified, no need to rename fields in "on"
160
+ elif join and field_name in on and field_name in fields:
161
+ continue
162
+
163
+ # otherwise, rename duplicate fields by appending _right
164
+ elif join and field_name in fields:
153
165
  while field_name in fields:
154
166
  field_name = f"{field_name}_right"
167
+
168
+ # add the field to the new schema
155
169
  fields[field_name] = (field.annotation, field)
156
170
 
157
171
  # create and return the new schema
@@ -194,6 +208,9 @@ class Average(BaseModel):
194
208
  class Count(BaseModel):
195
209
  count: int = Field(description="The count of items in the dataset")
196
210
 
211
+ class Sum(BaseModel):
212
+ sum: int = Field(description="The summation of items in the dataset")
213
+
197
214
  class Min(BaseModel):
198
215
  min: int | float = Field(description="The minimum value of some items in the dataset")
199
216
 
@@ -51,10 +51,10 @@ class GenerationStats(BaseModel):
51
51
  fn_call_duration_secs: float = 0.0
52
52
 
53
53
  # (if applicable) the total number of LLM calls made by this operator
54
- total_llm_calls: int = 0
54
+ total_llm_calls: float = 0
55
55
 
56
56
  # (if applicable) the total number of embedding LLM calls made by this operator
57
- total_embedding_llm_calls: int = 0
57
+ total_embedding_llm_calls: float = 0
58
58
 
59
59
  def __iadd__(self, other: GenerationStats) -> GenerationStats:
60
60
  # self.raw_answers.extend(other.raw_answers)
@@ -243,10 +243,10 @@ class RecordOpStats(BaseModel):
243
243
  fn_call_duration_secs: float = 0.0
244
244
 
245
245
  # (if applicable) the total number of LLM calls made by this operator
246
- total_llm_calls: int = 0
246
+ total_llm_calls: float = 0
247
247
 
248
248
  # (if applicable) the total number of embedding LLM calls made by this operator
249
- total_embedding_llm_calls: int = 0
249
+ total_embedding_llm_calls: float = 0
250
250
 
251
251
  # (if applicable) a boolean indicating whether this is the statistics captured from a failed convert operation
252
252
  failed_convert: bool | None = None
@@ -225,7 +225,7 @@ class AllSamplingExecutionStrategy(SentinelExecutionStrategy):
225
225
  dataset_id_to_source_indices = {}
226
226
  for dataset_id, dataset in train_dataset.items():
227
227
  total_num_samples = len(dataset)
228
- source_indices = [f"{dataset_id}-{int(idx)}" for idx in np.arange(total_num_samples)]
228
+ source_indices = [f"{dataset_id}---{int(idx)}" for idx in np.arange(total_num_samples)]
229
229
  dataset_id_to_source_indices[dataset_id] = source_indices
230
230
 
231
231
  # initialize set of physical operators for each logical operator
@@ -14,8 +14,8 @@ from palimpzest.query.operators.convert import LLMConvert
14
14
  from palimpzest.query.operators.filter import LLMFilter
15
15
  from palimpzest.query.operators.join import JoinOp
16
16
  from palimpzest.query.operators.physical import PhysicalOperator
17
- from palimpzest.query.operators.retrieve import RetrieveOp
18
17
  from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
18
+ from palimpzest.query.operators.topk import TopKOp
19
19
  from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
20
20
  from palimpzest.utils.progress import PZSentinelProgressManager
21
21
  from palimpzest.validator.validator import Validator
@@ -123,7 +123,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
123
123
  return (
124
124
  not isinstance(op, LLMConvert)
125
125
  and not isinstance(op, LLMFilter)
126
- and not isinstance(op, RetrieveOp)
126
+ and not isinstance(op, TopKOp)
127
127
  and not isinstance(op, JoinOp)
128
128
  )
129
129
 
@@ -167,8 +167,8 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
167
167
  full_hashes.add(full_hash)
168
168
  futures.append(executor.submit(validator._score_flat_map, op, fields, input_record, output, full_hash))
169
169
 
170
- # create future for retrieve
171
- elif isinstance(op, RetrieveOp):
170
+ # create future for top-k
171
+ elif isinstance(op, TopKOp):
172
172
  fields = op.generated_fields
173
173
  input_record: DataRecord = record_set.input
174
174
  output = record_set.data_records[0].to_dict(project_cols=fields)
@@ -176,7 +176,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
176
176
  full_hash = f"{hash(input_record)}{hash(output_str)}"
177
177
  if full_hash not in full_hashes:
178
178
  full_hashes.add(full_hash)
179
- futures.append(executor.submit(validator._score_retrieve, op, fields, input_record, output, full_hash))
179
+ futures.append(executor.submit(validator._score_topk, op, fields, input_record, output, full_hash))
180
180
 
181
181
  # create future for filter
182
182
  elif isinstance(op, LLMFilter):
@@ -235,7 +235,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
235
235
 
236
236
  # TODO: this scoring function will (likely) bias towards small values of k since it
237
237
  # measures precision and not recall / F1; will need to revisit this in the future
238
- elif isinstance(op, RetrieveOp):
238
+ elif isinstance(op, TopKOp):
239
239
  fields = op.generated_fields
240
240
  input_record: DataRecord = record_set.input
241
241
  output_str = record_set.data_records[0].to_json_str(project_cols=fields, bytes_to_str=True, sorted=True)
@@ -341,9 +341,9 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
341
341
  def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
342
342
  is_llm_convert = isinstance(physical_op, LLMConvert)
343
343
  is_llm_filter = isinstance(physical_op, LLMFilter)
344
- is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
344
+ is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
345
345
  is_llm_join = isinstance(physical_op, JoinOp)
346
- return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join
346
+ return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join
347
347
 
348
348
  @abstractmethod
349
349
  def execute_sentinel_plan(self, sentinel_plan: SentinelPlan, train_dataset: dict[str, Dataset], validator: Validator) -> SentinelPlanStats: