ppef 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ppef might be problematic. Click here for more details.

ppef-1.0.2/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Alec Heckert
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
ppef-1.0.2/MANIFEST.in ADDED
@@ -0,0 +1 @@
1
+ include include/ppef.h
ppef-1.0.2/PKG-INFO ADDED
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.4
2
+ Name: ppef
3
+ Version: 1.0.2
4
+ Summary: Partitioned Elias-Fano encoding for sequences of nondecreasing integers
5
+ Author-email: Alec Heckert <alecheckert@gmail.com>
6
+ License-File: LICENSE
7
+ Requires-Dist: numpy
8
+ Dynamic: license-file
ppef-1.0.2/README.md ADDED
@@ -0,0 +1,106 @@
1
+ # ppef: Partitioned Elias-Fano encoding
2
+
3
+ Compact C++11/Python implementation of the partitioned Elias-Fano (PEF) encoding from Ottaviano & Venturini (https://doi.org/10.1145/2600428.2609615).
4
+
5
+ Partly for fun (it's a neat method), partly because I needed a Python-facing implementation that was simple/hackable but still reasonably performant.
6
+
7
+ The main interface is a `Sequence` object that provides a compressed in-memory representation of a nondecreasing sequence of unsigned integers. Following Ottaviano & Venturini, we divide this sequence into "blocks" that are each independently encoded with Elias-Fano using adaptive high/low bit ratios. This partitioning scheme increases compression efficiency: for large sets, we're usually able to get compression ratios of 10-20X or better. Apart from that, the scheme has some other benefits that we exploit here:
8
+ - $O(1)$ random access _without decompression_
9
+ - $O(n+m)$ intersections and unions _without decompression_
10
+ - $O(\log (n))$ set membership tests _without decompression_
11
+ - Trivial $O(n)$ serialization and deserialization
12
+
13
+ These properties make the `Sequence` well-suited to storing large inverted indices in search algorithms. All operations preserve sorted order: the result of an intersection or union is itself a sorted `Sequence`.
14
+
15
+ "Without decompression" above means without decompressing the entire `Sequence`. We still need to decompress individual partitions, but we can do all of the above while only holding a single decompressed partition in memory at a time.
16
+
17
+ Our implementation also has some other benefits:
18
+ - No external C/C++ dependencies
19
+ - Thin Python bindings
20
+ - Pickleable
21
+
22
+ Limitations include:
23
+ - No insert operation; adding elements to an existing `Sequence` requires full decompression
24
+ - Currently doesn't support mmap (this is a future improvement)
25
+
26
+ ## Python example
27
+
28
+ ```
29
+ import numpy as np
30
+ from ppef import Sequence, deserialize
31
+
32
+ # Sample a sequence of integers. These are uniformly distributed, which is
33
+ # a worst-case situation for Elias-Fano encoding.
34
+ values = np.random.randint(0, 1<<16, size=1<<22)
35
+ values.sort()
36
+
37
+ # Encode
38
+ seq = Sequence(values)
39
+
40
+ # Show some info
41
+ seq.info()
42
+
43
+ # Total number of compressed elements
44
+ n_elements = len(seq)
45
+ assert n_elements == len(values)
46
+
47
+ # Random access: get the i^th element without decompressing
48
+ idx = 5000
49
+ val: int = seq[idx]
50
+ assert val == values[idx]
51
+
52
+ # Set membership testing
53
+ val_is_present = val in seq
54
+ assert val_is_present
55
+
56
+ # Decode the entire sequence
57
+ values: list[int] = seq.decode()
58
+
59
+ # Decode only the 50th partition block
60
+ chunk: list[int] = seq.decode_block(50)
61
+
62
+ # Total number of partition blocks
63
+ print(seq.n_blocks)
64
+
65
+ # Serialize to a file
66
+ seq.save("myfile.ppef")
67
+
68
+ # Deserialize from a file
69
+ seq2 = Sequence("myfile.ppef")
70
+
71
+ # Serialize to a bytestring
72
+ serialized: bytes = seq.serialize()
73
+
74
+ # Deserialize from a bytestring
75
+ seq2: Sequence = deserialize(serialized)
76
+
77
+ # Define another Sequence for testing intersections and unions
78
+ values2 = np.random.randint(0, 1<<16, size=1<<22)
79
+ values2.sort()
80
+ seq2 = Sequence(values2)
81
+
82
+ # Get the intersection between two Sequences (without decompressing)
83
+ new_seq: Sequence = seq & seq2
84
+
85
+ # Get the union between two Sequences (without decompressing)
86
+ new_seq: Sequence = seq | seq2
87
+ ```
88
+
89
+ ## Building, testing
90
+
91
+ Compile the Python package:
92
+ ```
93
+ pip install .
94
+ ```
95
+
96
+ Build and run the C++ tests:
97
+ ```
98
+ cd tests
99
+ make
100
+ ./test_driver
101
+ ```
102
+
103
+ Run the Python tests:
104
+ ```
105
+ pytest tests
106
+ ```
@@ -0,0 +1,267 @@
1
+ #pragma once
2
+ /* ppef - Partitioned Elias-Fano encoding of a sequence of integers. */
3
+
4
+ #include <algorithm>
5
+ #include <chrono>
6
+ #include <fstream>
7
+ #include <iomanip>
8
+ #include <iostream>
9
+ #include <random>
10
+ #include <sstream>
11
+ #include <string>
12
+ #include <unordered_set>
13
+ #include <vector>
14
+ #include <cassert>
15
+ #include <cstdint>
16
+ #include <cstring>
17
+
18
+ #if defined(_MSC_VER) && !defined(__clang__)
19
+ #include <intrin.h>
20
+ #endif
21
+
22
+ namespace ppef {
23
+
24
+ // Returns floor(log2(x)). NOTE(review): this and the other helpers marked `inline` below are declared without a definition in this header, yet an inline function must be defined in every translation unit that uses it -- confirm they are only used where they are defined. The result for x == 0 is also not visible here.
25
+ inline uint32_t floor_log2_u64(uint64_t x);
26
+
27
+ // Returns ceil(a / b). NOTE(review): behavior for b == 0 cannot be determined from this declaration -- confirm.
28
+ inline uint64_t ceil_div_u64(uint64_t a, uint64_t b);
29
+
30
+ // Number of trailing zero bits in a 64-bit unsigned integer - that is, the
31
+ // number of least significant zero bits before the first 1. NOTE(review): result for x == 0 is not visible here -- confirm.
32
+ inline uint32_t ctz64(uint64_t x);
33
+
34
+ // Bit position of the first set ('1') bit at or after the position *pos*
35
+ // in a bitarray *H*. NOTE(review): the value returned when no set bit remains is not visible here -- confirm.
36
+ inline uint64_t next_one_at_or_after(
37
+ const uint64_t *H, // bitarray storage; holds n_words 64-bit words
38
+ size_t n_words,
39
+ uint64_t pos
40
+ );
41
+
42
+ // Return the index in a sorted vector *v* of the smallest element that is
43
+ // greater than or equal to *q*. NOTE(review): the result when every element of *v* is < q is not visible here -- confirm.
44
+ size_t supremum_index(const std::vector<uint64_t>& v, const uint64_t q);
45
+
46
+ /*
47
+ * Struct: BitReader
48
+ * -----------------
49
+ * Read integers out of a densely-encoded bitarray. The reader keeps a
50
+ * cursor (word index + bits consumed) into a caller-supplied word buffer.
+ */
51
+ struct BitReader {
52
+ // Words to read. Raw pointer; this struct declares no destructor, so the buffer is not released here.
53
+ const uint64_t* words = nullptr;
54
+ // Number of words available in *words*
55
+ size_t n_words = 0;
56
+ // Index (into *words*) of the word we're currently on
57
+ size_t idx = 0;
58
+ // Number of bits already consumed from the current word
59
+ unsigned consumed = 0;
60
+ // Copy of the current word (first *consumed* bits will have already been read)
61
+ uint64_t cur = 0;
62
+
63
+ // Construct from a raw bitstream, represented as a sequence of
64
+ // *n_words* 64-bit "words" starting at *words*.
65
+ BitReader(const uint64_t* words, size_t n_words);
66
+
67
+ // Read *w* bits from the input words, and return those bits
68
+ // packed into the least-significant positions of a uint64_t.
69
+ // Doesn't make sense to use w > 64 (the return value only holds 64 bits).
70
+ uint64_t get(unsigned w);
71
+
72
+ // Scan to a particular bit position *pos*. NOTE(review): whether *pos* is absolute or relative to the cursor is not visible here -- confirm.
73
+ void scan(uint64_t pos);
74
+ };
75
+
76
+ /*
77
+ * Struct: BitWriter
78
+ * -----------------
79
+ * Pack integers densely into a bitarray. Completed 64-bit words accumulate
80
+ * in *words*; bits not yet forming a complete word are staged in *cur*.
+ */
81
+ struct BitWriter {
82
+ // Finished (completely written) 64-bit words
83
+ std::vector<uint64_t> words;
84
+ // Current word we're writing to (not yet appended to *words*)
85
+ uint64_t cur = 0;
86
+ // Number of bits already used in *cur*
87
+ unsigned filled = 0;
88
+
89
+ // Write the *w* least-significant bits from *val* into
90
+ // *words*, creating a new word if necessary.
91
+ void put(uint64_t val, unsigned w);
92
+
93
+ // Start a new word, regardless of how many bits we've used in
94
+ // the current word. NOTE(review): presumably pads the unused bits of *cur* with zeros -- confirm in the definition.
95
+ void flush();
96
+ };
97
+
98
+ /*
99
+ * Struct: EFBlockMetadata
100
+ * -----------------------
101
+ * Metadata for a PEF-compressed block of integers. Packed to 1-byte
102
+ * alignment so the field sizes alone determine the 40-byte layout.
+ */
103
+ #pragma pack(push, 1)
104
+ struct EFBlockMetadata {
105
+ uint32_t n_elem; // total number of integers ("elements") in this block.
106
+ uint8_t l; // number of least significant bits of each element stored in the "low" bits.
107
+ uint8_t pad[3]; // explicit padding: 4 + 1 + 3 + 4 * 8 = 40 bytes, so the whole struct stays divisible by 8 bytes.
108
+ uint64_t floor; // the least element.
109
+ uint64_t low_words; // total 8-byte words in the low bit representation.
110
+ uint64_t high_words; // total 8-byte words in the high bit representation.
111
+ uint64_t high_bits_len; // total number of bits in the high bit representation.
112
+ // note: high_bits_len <= high_words * 64 (the last word may be only partly used).
113
+ };
114
+ #pragma pack(pop)
115
+ // Pins the packed size at compile time; desirable for maintaining byte alignment.
116
+ static_assert(
117
+ sizeof(EFBlockMetadata) == 40,
118
+ "EFBlockMetadata must be 40 bytes"
119
+ );
120
+
121
+ /*
122
+ * Struct: EFBlock
123
+ * ---------------
124
+ * Elias-Fano encoding of a non-decreasing sequence of integers: a metadata
125
+ * header plus the packed "low" and unary-coded "high" bit arrays.
+ */
126
+ struct EFBlock {
127
+ EFBlockMetadata meta {}; // value-initialized header describing *low* and *high*
128
+ // Packed low bits, stored as 64-bit words
129
+ std::vector<uint64_t> low;
130
+ // Unary-encoded high bits, stored as 64-bit words
131
+ std::vector<uint64_t> high;
132
+
133
+ // Construct from already-encoded parts. Takes its arguments by value and tries to move meta, low, and high into the members.
134
+ EFBlock(EFBlockMetadata meta, std::vector<uint64_t> low, std::vector<uint64_t> high);
135
+
136
+ // Choose how many bits from each integer to encode in the "low" vs.
137
+ // "high" parts. This optimizes the compression ratio for *n*
138
+ // integers uniformly distributed between 0 and *range*. NOTE(review): declared inline but not defined in this header -- confirm it is only used where it is defined.
139
+ static inline uint32_t choose_l(uint64_t range, uint32_t n);
140
+
141
+ // Print out everything in this block to stdout (debugging aid).
142
+ void show() const;
143
+
144
+ // Construct (encode) from a raw sequence of *n_elem* integers starting at *values*. Per the struct description the input is expected to be non-decreasing.
145
+ EFBlock(const uint64_t* values, uint32_t n_elem);
146
+
147
+ // Decode to the original sequence of integers.
148
+ std::vector<uint64_t> decode() const;
149
+ };
150
+
151
+ /*
152
+ * Struct: SequenceMetadata
153
+ * ------------------------
154
+ * Metadata relevant for a PPEF-compressed file. We write this once in
155
+ * the first 40 bytes of the file. Packed to 1-byte alignment.
156
+ */
157
+ #pragma pack(push, 1)
158
+ struct SequenceMetadata {
159
+ char magic[4]; // file magic ("PPEF"); 4 bytes, so no room for a NUL terminator
160
+ uint32_t version; // format version; currently 1
161
+ uint64_t n_elem; // total number of compressed elements
162
+ uint32_t block_size; // compression block size (in # elements)
163
+ uint32_t reserved; // always 0
164
+ uint64_t n_blocks; // ceil(n_elem / block_size)
165
+ uint64_t payload_offset; // byte offset of actual data part of file
166
+ };
167
+ #pragma pack(pop)
168
+ // Pins the packed size (4 + 4 + 8 + 4 + 4 + 8 + 8 bytes); desirable for byte-level alignment.
169
+ static_assert(sizeof(SequenceMetadata) == 40, "SequenceMetadata must be 40 bytes");
170
+
171
+ /*
172
+ * Class: Sequence
173
+ * ---------------
174
+ * A nondecreasing sequence of integers in partitioned Elias-Fano (PEF)
175
+ * format. Provides methods to serialize the sequence to a string or file.
176
+ */
177
+ class Sequence {
178
+ public:
179
+ // Construct an empty sequence with the given block size.
180
+ explicit Sequence(uint32_t block_size = 256);
181
+
182
+ // Construct (encode) from a raw sequence of nondecreasing integers.
183
+ explicit Sequence(
184
+ const std::vector<uint64_t>& values, // must be sorted!
185
+ uint32_t block_size = 256
186
+ );
187
+
188
+ // Construct from a serialized representation read from a stream.
189
+ explicit Sequence(std::istream& in);
190
+
191
+ // Construct from a compressed PPEF file at *path*.
192
+ explicit Sequence(const std::string& path);
193
+
194
+ // Copy constructor.
195
+ Sequence(const Sequence& other);
196
+
197
+ // Move constructor. NOTE(review): no assignment operators are declared in this class; because a move constructor is user-declared, the implicit copy assignment operator is deleted, so `a = b` does not compile -- confirm this is intended.
198
+ Sequence(Sequence&&) noexcept;
199
+
200
+ // Serialize this Sequence in its compressed state to a string (used as a byte buffer).
201
+ std::string serialize() const;
202
+
203
+ // Save serialized Sequence to a file at *path*.
204
+ void save(const std::string& path) const;
205
+
206
+ // Decode the i^th EFBlock, returning its original integers.
207
+ std::vector<uint64_t> decode_block(uint64_t i) const;
208
+
209
+ // Decode the entire original sequence.
210
+ std::vector<uint64_t> decode() const;
211
+
212
+ // Decode the i^th value in the sequence (get and operator[] share this description). NOTE(review): out-of-range handling is not visible here -- confirm.
213
+ uint64_t get(uint64_t i) const;
214
+ uint64_t operator[](uint64_t i) const;
215
+
216
+ // Get the bi^th EFBlock in the sequence (without decoding); returned by value.
217
+ EFBlock get_efblock(uint64_t bi) const;
218
+
219
+ // Check if the value *val* exists in the sequence.
220
+ bool contains(uint64_t val) const;
221
+
222
+ // Intersect with another Sequence, returning a new Sequence (a named method; no operator& is declared).
223
+ Sequence intersect(const Sequence& other) const;
224
+
225
+ // Take union with another Sequence, returning a new Sequence
226
+ Sequence operator|(const Sequence& other) const;
227
+
228
+ // Number of integers encoded in this Sequence.
229
+ uint64_t n_elem() const;
230
+
231
+ // Maximum number of integers per EFBlock.
232
+ uint32_t block_size() const;
233
+
234
+ // Total number of EFBlocks.
235
+ uint64_t n_blocks() const;
236
+
237
+ // Print all SequenceMetadata to stdout.
238
+ void info() const;
239
+
240
+ // Get a copy of the SequenceMetadata.
241
+ SequenceMetadata get_meta() const;
242
+
243
+ private:
244
+ SequenceMetadata meta {}; // value-initialized header for the whole sequence
245
+ // Highest element in each block (one entry per block, i.e. n_blocks() entries).
246
+ std::vector<uint64_t> block_last_;
247
+ // Byte offset of the start of each block in the file (one entry per block, i.e. n_blocks() entries).
248
+ std::vector<uint64_t> block_offs_;
249
+ // All EFBlocks written end-to-end as raw bytes:
250
+ // header0, low0, high0, header1, low1, high1, ...
251
+ std::vector<uint8_t> payload_;
252
+
253
+ // Append *n* bytes starting at *src* to the end of *payload_*.
254
+ void append_bytes(const void* src, size_t n) {
255
+ size_t old = payload_.size(); // write position: the current end of the payload
256
+ payload_.resize(old + n); // grow first so the destination range exists
257
+ std::memcpy(payload_.data() + old, src, n); // raw byte copy into the new tail
258
+ }
259
+
260
+ // Serialize this Sequence to an arbitrary std::ostream
261
+ void serialize_to_stream(std::ostream&) const;
262
+
263
+ // Initialize from a serialized representation in a stream.
264
+ void init_from_stream(std::istream& in);
265
+ };
266
+
267
+ } // end namespace ppef
@@ -0,0 +1,19 @@
1
+ [build-system]
2
+ requires = [
3
+ "setuptools>=42",
4
+ "pybind11>=2.6.0",
5
+ "tomli",
6
+ ]
7
+ build-backend = "setuptools.build_meta"
8
+
9
+ [project]
10
+ name = "ppef"
11
+ version = "1.0.2"
12
+ description = "Partitioned Elias-Fano encoding for sequences of nondecreasing integers"
13
+ license-files = ["LICENSE"]
14
+ authors = [
15
+ { name = "Alec Heckert", email = "alecheckert@gmail.com" },
16
+ ]
17
+ dependencies = [
18
+ "numpy",
19
+ ]
ppef-1.0.2/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
ppef-1.0.2/setup.py ADDED
@@ -0,0 +1,33 @@
1
+ from setuptools import setup
2
+ from pybind11.setup_helpers import Pybind11Extension, build_ext
3
+ from pathlib import Path
4
+ from glob import glob
5
+ import os
6
+ import sys
7
+ import tomli
8
+
9
+
10
+ # SSOT __version__ from pyproject.toml
11
+ version = tomli.loads(Path("pyproject.toml").read_text(encoding="utf-8"))["project"][
12
+ "version"
13
+ ]
14
+
15
+
16
+ SRC_FILES = list(glob(os.path.join("src", "*.cpp")))
17
+ INCLUDE_DIRS = ["include"]
18
+ CXX_STD = 17 # can also be >=c++11
19
+
20
+ ext_modules = [
21
+ Pybind11Extension(
22
+ "ppef",
23
+ SRC_FILES,
24
+ include_dirs=INCLUDE_DIRS,
25
+ cxx_std=CXX_STD,
26
+ define_macros=[("VERSION_INFO", f'"{version}"')],
27
+ )
28
+ ]
29
+
30
+ setup(
31
+ ext_modules=ext_modules,
32
+ cmdclass={"build_ext": build_ext},
33
+ )
@@ -0,0 +1,61 @@
1
+ #include <pybind11/pybind11.h>
2
+ #include <pybind11/stl.h>
3
+ #include <pybind11/numpy.h>
4
+ #include "ppef.h"
5
+
6
+ namespace py = pybind11;
7
+
8
+ ppef::Sequence deserialize(const std::string& s) { // Rebuild a Sequence from a serialized byte string *s*.
9
+ std::istringstream in(s); // wrap (a copy of) the bytes in a stream, since Sequence has a std::istream& constructor
10
+ return ppef::Sequence(in); // explicit constructor call: Sequence(std::istream&) is declared explicit
11
+ }
12
+
13
+ PYBIND11_MODULE(ppef, m) { // Defines the Python extension module `ppef`.
14
+ m.attr("__version__") = VERSION_INFO; // VERSION_INFO is a string-literal macro supplied by setup.py (define_macros)
15
+ py::class_<ppef::SequenceMetadata>(m, "SequenceMetadata", py::module_local()); // registered with no fields or methods; presumably only so get_meta() has a convertible return type
16
+ py::class_<ppef::Sequence>(m, "Sequence", py::module_local())
17
+ .def( // Sequence(filepath): load from a PPEF file
18
+ py::init<const std::string&>(),
19
+ py::arg("filepath")
20
+ )
21
+ .def( // Sequence(values, block_size=256): encode a sequence of integers
22
+ py::init<const std::vector<uint64_t>&, uint32_t>(),
23
+ py::arg("values"),
24
+ py::arg("block_size") = 256
25
+ )
26
+ .def(
27
+ py::pickle(
28
+ // __getstate__: the pickled state is the serialized byte string
29
+ [](const ppef::Sequence& s) {
30
+ std::string o = s.serialize();
31
+ return py::bytes(o.data(), o.size());
32
+ },
33
+ // __setstate__: rebuild the Sequence from those bytes via its std::istream constructor
34
+ [](const py::bytes& b) {
35
+ std::istringstream in(b);
36
+ return ppef::Sequence(in);
37
+ }
38
+ )
39
+ )
40
+ .def_property_readonly("n_elem", &ppef::Sequence::n_elem)
41
+ .def_property_readonly("block_size", &ppef::Sequence::block_size)
42
+ .def_property_readonly("n_blocks", &ppef::Sequence::n_blocks)
43
+ .def("get_meta", &ppef::Sequence::get_meta)
44
+ .def("info", &ppef::Sequence::info)
45
+ .def("save", &ppef::Sequence::save, py::arg("filepath"))
46
+ .def("decode_block", &ppef::Sequence::decode_block, py::arg("block_idx"))
47
+ .def("decode", &ppef::Sequence::decode)
48
+ .def("__getitem__", &ppef::Sequence::get, py::arg("i")) // index is a uint64_t, so negative (from-the-end) indices are not supported
49
+ .def("__contains__", &ppef::Sequence::contains, py::arg("q"))
50
+ .def("__len__", &ppef::Sequence::n_elem)
51
+ .def("__and__", &ppef::Sequence::intersect, py::arg("other")) // seq & other
52
+ .def("__or__", &ppef::Sequence::operator|, py::arg("other")) // seq | other
53
+ .def(
54
+ "serialize",
55
+ [](const ppef::Sequence& s) { // return Python bytes rather than str, since the payload is binary
56
+ std::string o = s.serialize();
57
+ return py::bytes(o.data(), o.size());
58
+ }
59
+ );
60
+ m.def("deserialize", &deserialize, py::arg("serialized")); // module-level inverse of Sequence.serialize
61
+ }