PyPI - ppef - Versions diffs - 1.0.2__tar.gz - Mend

ppef 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ppef might be problematic. Click here for more details.

Files changed (16) hide show

ppef-1.0.2/LICENSE +21 -0
ppef-1.0.2/MANIFEST.in +1 -0
ppef-1.0.2/PKG-INFO +8 -0
ppef-1.0.2/README.md +106 -0
ppef-1.0.2/include/ppef.h +267 -0
ppef-1.0.2/pyproject.toml +19 -0
ppef-1.0.2/setup.cfg +4 -0
ppef-1.0.2/setup.py +33 -0
ppef-1.0.2/src/bindings.cpp +61 -0
ppef-1.0.2/src/ppef.cpp +986 -0
ppef-1.0.2/src/ppef.egg-info/PKG-INFO +8 -0
ppef-1.0.2/src/ppef.egg-info/SOURCES.txt +14 -0
ppef-1.0.2/src/ppef.egg-info/dependency_links.txt +1 -0
ppef-1.0.2/src/ppef.egg-info/requires.txt +1 -0
ppef-1.0.2/src/ppef.egg-info/top_level.txt +1 -0
ppef-1.0.2/tests/test_bindings.py +97 -0

ppef-1.0.2/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Alec Heckert
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

ppef-1.0.2/MANIFEST.in ADDED Viewed

	@@ -0,0 +1 @@
1	+ include include/ppef.h

ppef-1.0.2/PKG-INFO ADDED Viewed

@@ -0,0 +1,8 @@
+Metadata-Version: 2.4
+Name: ppef
+Version: 1.0.2
+Summary: Partitioned Elias-Fano encoding for sequences of nondecreasing integers
+Author-email: Alec Heckert <alecheckert@gmail.com>
+License-File: LICENSE
+Requires-Dist: numpy
+Dynamic: license-file

ppef-1.0.2/README.md ADDED Viewed

@@ -0,0 +1,106 @@
+# ppef: Partitioned Elias-Fano encoding
+Compact C++11/Python implementation of the partitioned Elias-Fano (PEF) encoding from Ottaviano & Venturini (https://doi.org/10.1145/2600428.2609615).
+Partly for fun (it's a neat method), partly because I needed a Python-facing implementation that was simple/hackable but still reasonably performant.
+The main interface is a `Sequence` object that provides a compressed in-memory representation of a nondecreasing sequence of unsigned integers. Following Ottaviano & Venturini, we divide this sequence into "blocks" that are each independently encoded with Elias-Fano using adaptive high/low bit ratios. This partitioning scheme increases compression efficiency: for large sets, we're usually able to get compression ratios of >=10-20X. Apart from that, the scheme has some other benefits that we exploit here:
+ - $O(1)$ random access _without decompression_
+ - $O(n+m)$ intersections and unions _without decompression_
+ - $O(\log (n))$ set membership tests _without decompression_
+ - Trivial $O(n)$ serialization and deserialization
+These properties make the `Sequence` well-suited to storing large inverted indices in search algorithms. All operations maintain sorting on the input sequence.
+"Without decompression" above means without decompressing the entire `Sequence`. We still need to decompress individual partitions, but we can do all of the above while only holding a single decompressed partition in memory at a time.
+Our implementation also has some other benefits:
+ - No external C/C++ dependencies
+ - Thin Python bindings
+ - Pickleable
+Limitations include:
+ - No insert operation; requires full decompression
+ - Currently doesn't support mmap (this is a future improvement)
+## Python example
+```
+import numpy as np
+from ppef import Sequence, deserialize
+# Sample a sequence of integers. These are uniformly distributed, which is
+# a worst-case situation for Elias-Fano encoding.
+values = np.random.randint(0, 1<<16, size=1<<22)
+values.sort()
+# Encode
+seq = Sequence(values)
+# Show some info
+seq.info()
+# Total number of compressed elements
+n_elements = len(seq)
+assert n_elements == len(values)
+# Random access: get the i^th element without decompressing
+idx = 5000
+val: int = seq[idx]
+assert val == values[idx]
+# Set membership testing
+val_is_present = val in seq
+assert val_is_present
+# Decode the entire sequence
+values: list[int] = seq.decode()
+# Decode only the 50th partition block
+chunk: list[int] = seq.decode_block(50)
+# Total number of partition blocks
+print(seq.n_blocks)
+# Serialize to a file
+seq.save("myfile.ppef")
+# Deserialize from a file
+seq2 = Sequence("myfile.ppef")
+# Serialize to a bytestring
+serialized: bytes = seq.serialize()
+# Deserialize from a bytestring
+seq2: Sequence = deserialize(serialized)
+# Define another Sequence for testing intersections and unions
+values2 = np.random.randint(0, 1<<16, size=1<<22)
+values2.sort()
+seq2 = Sequence(values2)
+# Get the intersection between two Sequences (without decompressing)
+new_seq: Sequence = seq & seq2
+# Get the union between two Sequences (without decompressing)
+new_seq: Sequence = seq | seq2
+```
+## Building, testing
+Compile the Python package:
+```
+pip install .
+```
+Build and run the C++ tests:
+```
+cd tests
+make
+./test_driver
+```
+Run the Python tests:
+```
+pytest tests
+```

ppef-1.0.2/include/ppef.h ADDED Viewed

@@ -0,0 +1,267 @@
+#pragma once
+/* ppef - Partitioned Elias-Fano encoding of a sequence of integers. */
+#include <algorithm>
+#include <chrono>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#if defined(_MSC_VER) && !defined(__clang__)
+  #include <intrin.h>
+#endif
+namespace ppef {
+// floor(log2(x)). NOTE(review): result for x == 0 is not documented here - confirm.
+inline uint32_t floor_log2_u64(uint64_t x);
+// ceil(a / b). NOTE(review): presumably requires b != 0 - confirm.
+inline uint64_t ceil_div_u64(uint64_t a, uint64_t b);
+// number of trailing zero bits in a 64-bit unsigned integer - that is,
+// the number of least significant zero bits before the first 1.
+inline uint32_t ctz64(uint64_t x);
+// bit position of the next set ('1') bit at or after position *pos*
+// in a bitarray *H*.
+inline uint64_t next_one_at_or_after(
+    const uint64_t *H, // bitarray words; size n_words
+    size_t n_words,
+    uint64_t pos
+);
+// return the index of the smallest element in a sorted vector *v*
+// that is greater than or equal to *q*
+size_t supremum_index(const std::vector<uint64_t>& v, const uint64_t q);
+/*
+ * Class: BitReader
+ * ----------------
+ * Read integers out of a densely-encoded bitarray.
+ * The bitarray is supplied as a pointer to 64-bit words plus a word count;
+ * the reader tracks its position with *idx*, *consumed* and *cur*.
+*/
+struct BitReader {
+    // Words to read
+    const uint64_t* words = nullptr;
+    // Number of words available in *words*
+    size_t n_words = 0;
+    // Index of the current word we're on
+    size_t idx = 0;
+    // Bits already consumed from the current word
+    unsigned consumed = 0;
+    // Current word (first *consumed* bits will have already been read)
+    uint64_t cur = 0;
+    // Construct from a raw bitstream, represented as a sequence of
+    // *n_words* 64-bit "words".
+    BitReader(const uint64_t* words, size_t n_words);
+    // Read *w* bits from the input words, and return those bits
+    // packed into the least-significant positions of a uint64_t.
+    // Doesn't make sense to use w > 64 (the result is a single uint64_t).
+    uint64_t get(unsigned w);
+    // Scan to a particular bit position *pos*.
+    void scan(uint64_t pos);
+};
+/*
+ * Class: BitWriter
+ * ----------------
+ * Pack integers densely into a bitarray.
+ * `put` appends the low bits of a value; `flush` forces the start of a
+ * new 64-bit word.
+*/
+struct BitWriter {
+    // Finished 64-bit words
+    std::vector<uint64_t> words;
+    // Current word we're writing to
+    uint64_t cur = 0;
+    // Number of bits already used in *cur*
+    unsigned filled = 0;
+    // Write the *w* least-significant bits from *val* into
+    // *words*, creating a new word if necessary.
+    // NOTE(review): presumably w <= 64, since *val* is a uint64_t - confirm.
+    void put(uint64_t val, unsigned w);
+    // Start a new word, regardless of how many bits we've used in
+    // the current word.
+    void flush();
+};
+/*
+ * Struct: EFBlockMetadata
+ * -----------------------
+ * Metadata for a PEF-compressed block of integers.
+ * Declared under #pragma pack(1) with explicit padding so that its size
+ * is exactly 40 bytes (enforced by the static_assert that follows).
+*/
+#pragma pack(push, 1)
+struct EFBlockMetadata {
+    uint32_t n_elem;         // total number of integers ("elements") in this block.
+    uint8_t  l;              // number of least significant bits stored in the "low" part.
+    uint8_t  pad[3];         // explicit padding so the struct size stays divisible by 8 bytes.
+    uint64_t floor;          // the least element.
+    uint64_t low_words;      // total 8-byte words in the low bit representation.
+    uint64_t high_words;     // total 8-byte words in the high bit representation.
+    uint64_t high_bits_len;  // total number of bits in the high bit representation.
+                             // note: high_bits_len <= high_words * 64.
+};
+#pragma pack(pop)
+// Desirable for maintaining byte alignment (40 is a multiple of 8).
+static_assert(
+    sizeof(EFBlockMetadata) == 40,
+    "EFBlockMetadata must be 40 bytes"
+);
+/*
+ * Struct: EFBlock
+ * ---------------
+ * Elias-Fano encoding of a non-decreasing sequence of integers.
+ * Holds the block metadata plus the packed "low" and unary-encoded "high"
+ * bit arrays, each stored as a vector of 64-bit words.
+*/
+struct EFBlock {
+    EFBlockMetadata meta {};
+    // Packed low bits
+    std::vector<uint64_t> low;
+    // Unary-encoded high bits
+    std::vector<uint64_t> high;
+    // Construct from already-encoded parts. Arguments are taken by value,
+    // so callers can std::move *low* and *high* in to avoid copies.
+    EFBlock(EFBlockMetadata meta, std::vector<uint64_t> low, std::vector<uint64_t> high);
+    // Choose how many bits from each integer to encode in the "low" vs.
+    // "high" parts. This optimizes the compression ratio for *n*
+    // integers uniformly distributed between 0 and *range*.
+    static inline uint32_t choose_l(uint64_t range, uint32_t n);
+    // Print out everything in this block to stdout.
+    void show() const;
+    // Construct (encode) from a raw array of *n_elem* integers.
+    // NOTE(review): presumably *values* must be non-decreasing - confirm.
+    EFBlock(const uint64_t* values, uint32_t n_elem);
+    // Decode to the original sequence of integers.
+    std::vector<uint64_t> decode() const;
+};
+/*
+ * Struct: SequenceMetadata
+ * ------------------------
+ * Metadata relevant for a PPEF-compressed file. We write this once in
+ * the first 40 bytes of the file.
+ * Declared under #pragma pack(1); the static_assert that follows pins
+ * its size to 40 bytes.
+*/
+#pragma pack(push, 1)
+struct SequenceMetadata {
+    char     magic[4];       // file magic ("PPEF"); 4 bytes, no null terminator
+    uint32_t version;        // format version: 1
+    uint64_t n_elem;         // total number of compressed elements
+    uint32_t block_size;     // compression block size (in # elements)
+    uint32_t reserved;       // always 0
+    uint64_t n_blocks;       // ceil(n_elem / block_size)
+    uint64_t payload_offset; // byte offset of actual data part of file
+};
+#pragma pack(pop)
+// Desirable for byte-level alignment (40 is a multiple of 8).
+static_assert(sizeof(SequenceMetadata) == 40, "SequenceMetadata must be 40 bytes");
+/*
+ * Class: Sequence
+ * ---------------
+ * A nondecreasing sequence of integers in partitioned Elias-Fano (PEF)
+ * format. Provides methods to serialize the sequence to a file.
+ *
+ * The compressed blocks are kept end-to-end in a single byte buffer
+ * (*payload_*), with per-block lookup tables (*block_last_*,
+ * *block_offs_*) alongside.
+*/
+class Sequence {
+public:
+    // Construct an empty sequence.
+    explicit Sequence(uint32_t block_size = 256);
+    // Construct from a raw sequence of nondecreasing integers.
+    explicit Sequence(
+        const std::vector<uint64_t>& values, // must be sorted!
+        uint32_t block_size = 256
+    );
+    // Construct from a serialized representation.
+    explicit Sequence(std::istream& in);
+    // Construct from a compressed PPEF file.
+    explicit Sequence(const std::string& path);
+    // Copy constructor.
+    Sequence(const Sequence& other);
+    // Move constructor
+    Sequence(Sequence&&) noexcept;
+    // Copy assignment. Must be declared explicitly: because a move
+    // constructor is user-declared, the implicit copy assignment would
+    // otherwise be defined as deleted. Memberwise copy is sufficient since
+    // every data member is a plain struct or a std::vector.
+    Sequence& operator=(const Sequence& other) = default;
+    // Move assignment (memberwise).
+    Sequence& operator=(Sequence&& other) = default;
+    // Serialize this Sequence in its compressed state to a string.
+    std::string serialize() const;
+    // Save serialized Sequence to a file.
+    void save(const std::string& path) const;
+    // Decode the i^th EFBlock, returning its original integers.
+    std::vector<uint64_t> decode_block(uint64_t i) const;
+    // Decode the entire original sequence.
+    std::vector<uint64_t> decode() const;
+    // Decode the i^th value in the sequence.
+    uint64_t get(uint64_t i) const;
+    uint64_t operator[](uint64_t i) const;
+    // Get the bi^th EFBlock in the sequence (without decoding).
+    EFBlock get_efblock(uint64_t bi) const;
+    // Check if an element exists
+    bool contains(uint64_t val) const;
+    // Intersect with another Sequence, returning a new Sequence
+    Sequence intersect(const Sequence& other) const;
+    // Take union with another Sequence, returning a new Sequence
+    Sequence operator|(const Sequence& other) const;
+    // Number of integers encoded in this Sequence.
+    uint64_t n_elem() const;
+    // Maximum number of integers per EFBlock.
+    uint32_t block_size() const;
+    // Total number of EFBlocks.
+    uint64_t n_blocks() const;
+    // Print all SequenceMetadata to stdout.
+    void info() const;
+    // Get a copy of the SequenceMetadata.
+    SequenceMetadata get_meta() const;
+private:
+    SequenceMetadata meta {};
+    // Highest element in each block (one entry per block, i.e. size *meta.n_blocks*).
+    std::vector<uint64_t> block_last_;
+    // Byte offset of the start of each block in the file (one entry per
+    // block, i.e. size *meta.n_blocks*).
+    std::vector<uint64_t> block_offs_;
+    // All EFBlocks written end-to-end:
+    // header0, low0, high0, header1, low1, high1, ...
+    std::vector<uint8_t> payload_;
+    // Append *n* raw bytes starting at *src* to the end of *payload_*.
+    void append_bytes(const void* src, size_t n) {
+        size_t old = payload_.size();
+        payload_.resize(old + n);
+        std::memcpy(payload_.data() + old, src, n);
+    }
+    // Serialize this Sequence to an arbitrary output stream.
+    void serialize_to_stream(std::ostream&) const;
+    // Initialize from a serialized representation in a stream.
+    void init_from_stream(std::istream& in);
+};
+} // end namespace ppef

ppef-1.0.2/pyproject.toml ADDED Viewed

@@ -0,0 +1,19 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "pybind11>=2.6.0",
+    "tomli",
+]
+build-backend = "setuptools.build_meta"
+[project]
+name = "ppef"
+version = "1.0.2"
+description = "Partitioned Elias-Fano encoding for sequences of nondecreasing integers"
+license-files = ["LICENSE"]
+authors = [
+    { name = "Alec Heckert", email = "alecheckert@gmail.com" },
+]
+dependencies = [
+    "numpy",
+]

ppef-1.0.2/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

ppef-1.0.2/setup.py ADDED Viewed

@@ -0,0 +1,33 @@
+from setuptools import setup
+from pybind11.setup_helpers import Pybind11Extension, build_ext
+from pathlib import Path
+from glob import glob
+import os
+import sys
+import tomli
+# SSOT __version__ from pyproject.toml
+version = tomli.loads(Path("pyproject.toml").read_text(encoding="utf-8"))["project"][
+    "version"
+]
+SRC_FILES = list(glob(os.path.join("src", "*.cpp")))
+INCLUDE_DIRS = ["include"]
+CXX_STD = 17  # can also be >=c++11
+ext_modules = [
+    Pybind11Extension(
+        "ppef",
+        SRC_FILES,
+        include_dirs=INCLUDE_DIRS,
+        cxx_std=CXX_STD,
+        define_macros=[("VERSION_INFO", f'"{version}"')],
+    )
+]
+setup(
+    ext_modules=ext_modules,
+    cmdclass={"build_ext": build_ext},
+)

ppef-1.0.2/src/bindings.cpp ADDED Viewed

@@ -0,0 +1,61 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/numpy.h>
+#include "ppef.h"
+namespace py = pybind11;
+// Rebuild a Sequence from a serialized byte string by wrapping the bytes
+// in an input stream and using the stream constructor.
+ppef::Sequence deserialize(const std::string& s) {
+    std::istringstream serialized_stream(s);
+    return ppef::Sequence(serialized_stream);
+}
+// Python module definition: exposes ppef::Sequence (plus an opaque
+// SequenceMetadata type and a free `deserialize` function) to Python.
+PYBIND11_MODULE(ppef, m) {
+    m.attr("__version__") = VERSION_INFO; // macro injected by setup.py (define_macros)
+    // Registered without any fields or methods so that get_meta() has a
+    // Python type to return.
+    py::class_<ppef::SequenceMetadata>(m, "SequenceMetadata", py::module_local());
+    py::class_<ppef::Sequence>(m, "Sequence", py::module_local())
+        // Sequence(filepath): load from a compressed file.
+        .def(
+            py::init<const std::string&>(),
+            py::arg("filepath")
+        )
+        // Sequence(values, block_size=256): encode a sequence of integers.
+        .def(
+            py::init<const std::vector<uint64_t>&, uint32_t>(),
+            py::arg("values"),
+            py::arg("block_size") = 256
+        )
+        .def(
+            py::pickle(
+                // __getstate__: the pickled state is the serialized byte string
+                [](const ppef::Sequence& s) {
+                    std::string o = s.serialize();
+                    return py::bytes(o.data(), o.size());
+                },
+                // __setstate__: rebuild from those bytes via the stream constructor
+                [](const py::bytes& b) {
+                    std::istringstream in(b);
+                    return ppef::Sequence(in);
+                }
+            )
+        )
+        .def_property_readonly("n_elem", &ppef::Sequence::n_elem)
+        .def_property_readonly("block_size", &ppef::Sequence::block_size)
+        .def_property_readonly("n_blocks", &ppef::Sequence::n_blocks)
+        .def("get_meta", &ppef::Sequence::get_meta)
+        .def("info", &ppef::Sequence::info)
+        .def("save", &ppef::Sequence::save, py::arg("filepath"))
+        .def("decode_block", &ppef::Sequence::decode_block, py::arg("block_idx"))
+        .def("decode", &ppef::Sequence::decode)
+        .def("__getitem__", &ppef::Sequence::get, py::arg("i"))
+        .def("__contains__", &ppef::Sequence::contains, py::arg("q"))
+        .def("__len__", &ppef::Sequence::n_elem)
+        // py::is_operator() makes these return NotImplemented for operands
+        // that are not a Sequence, so Python's normal binary-operator
+        // fallback applies instead of a pybind11 overload-resolution error.
+        .def("__and__", &ppef::Sequence::intersect, py::arg("other"), py::is_operator())
+        .def("__or__", &ppef::Sequence::operator|, py::arg("other"), py::is_operator())
+        // Return bytes (not str) so the binary payload is not decoded as UTF-8.
+        .def(
+            "serialize",
+            [](const ppef::Sequence& s) {
+                std::string o = s.serialize();
+                return py::bytes(o.data(), o.size());
+            }
+        );
+    m.def("deserialize", &deserialize, py::arg("serialized"));
+}