PyPI - plocate2 - Versions diffs - 0.1.0__py3-none-any.whl - Mend

plocate2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

plocate/__init__.py +21 -0
plocate/binary_reader.py +58 -0
plocate/config.py +53 -0
plocate/constants.py +3 -0
plocate/database.py +284 -0
plocate/directory_data.py +115 -0
plocate/entrypoint/__init__.py +0 -0
plocate/entrypoint/export.py +68 -0
plocate/entrypoint/search.py +136 -0
plocate/entrypoint/stats.py +63 -0
plocate/errors.py +8 -0
plocate/export.py +120 -0
plocate/filename_index.py +73 -0
plocate/formatting.py +92 -0
plocate/header.py +78 -0
plocate/indexed_entry.py +19 -0
plocate/indexed_search.py +98 -0
plocate/patterns.py +166 -0
plocate/posting_list.py +436 -0
plocate/search.py +219 -0
plocate/stats.py +102 -0
plocate/trigram_index.py +152 -0
plocate/trigram_patterns.py +163 -0
plocate2-0.1.0.dist-info/METADATA +297 -0
plocate2-0.1.0.dist-info/RECORD +28 -0
plocate2-0.1.0.dist-info/WHEEL +4 -0
plocate2-0.1.0.dist-info/entry_points.txt +4 -0
plocate2-0.1.0.dist-info/licenses/LICENSE +21 -0

plocate/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""Read and search plocate.db index files."""
+import plocate.database
+import plocate.export
+import plocate.search
+PlocateDatabase = plocate.database.PlocateDatabase
+ExportOptions = plocate.export.ExportOptions
+SearchOptions = plocate.search.SearchOptions
+iter_export_records = plocate.export.iter_export_records
+search_database = plocate.search.search_database
+__all__ = [
+    "ExportOptions",
+    "PlocateDatabase",
+    "SearchOptions",
+    "iter_export_records",
+    "search_database",
+]

plocate/binary_reader.py ADDED Viewed

@@ -0,0 +1,58 @@
+"""Binary file access for plocate databases."""
+import mmap
+import os
+import typing
+import plocate.errors
+class BinaryReader:
+    """Read byte ranges from an open database file, optionally via mmap."""
+    def __init__(self, file_object: typing.BinaryIO) -> None:
+        """Attach to an open binary file and optionally map it read-only."""
+        self._file_object = file_object
+        # Measure file size and rewind to the start.
+        file_object.seek(0, os.SEEK_END)
+        self.file_size = file_object.tell()
+        file_object.seek(0)
+        self._mmap: mmap.mmap | None = None
+        # Prefer mmap when the file object supports fileno().
+        if hasattr(file_object, "fileno"):
+            try:
+                self._mmap = mmap.mmap(file_object.fileno(), 0, access=mmap.ACCESS_READ)
+            except (OSError, ValueError, BufferError):
+                self._mmap = None
+    def close(self) -> None:
+        """Release mmap and close the underlying file object."""
+        if self._mmap is not None:
+            self._mmap.close()
+            self._mmap = None
+        self._file_object.close()
+    def read_bytes(self, offset: int, length: int) -> bytes:
+        """Read length bytes starting at offset, using mmap when available."""
+        if length == 0:
+            return b""
+        if self._mmap is not None:
+            return self._mmap[offset : offset + length]
+        self._file_object.seek(offset)
+        data = self._file_object.read(length)
+        if len(data) != length:
+            message = "unexpected end of file while reading {length} bytes at offset {offset}".format(
+                length=length,
+                offset=offset,
+            )
+            raise plocate.errors.PlocateFormatError(message)
+        return data

plocate/config.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""Configuration block parsing."""
+import dataclasses
+@dataclasses.dataclass(frozen=True, slots=True)
+class ConfigurationEntry:
+    """One updatedb configuration variable and its ordered values.
+
+    Instances are frozen, so the fields cannot be rebound; the ``values`` list
+    itself remains a mutable object.
+    """
+    # Variable name as stored in the configuration block.
+    name: str
+    # Values in the order they appear; empty when the variable has no values.
+    values: list[str]
+def parse_configuration_block(block_bytes: bytes) -> list[ConfigurationEntry]:
+    """Parse the NUL-delimited configuration block from a plocate database."""
+    entries: list[ConfigurationEntry] = []
+    current_name: str | None = None
+    current_values: list[str] = []
+    index = 0
+    # Walk NUL-terminated strings: name, values..., empty string ends each entry.
+    while index < len(block_bytes):
+        end = block_bytes.find(b"\0", index)
+        if end == -1:
+            break
+        value = block_bytes[index:end].decode("utf-8")
+        index = end + 1
+        if current_name is None:
+            current_name = value
+            current_values = []
+            continue
+        if value == "":
+            entries.append(ConfigurationEntry(name=current_name, values=current_values))
+            current_name = None
+            continue
+        current_values.append(value)
+    return entries
+def configuration_entries_to_mapping(entries: list[ConfigurationEntry]) -> dict[str, list[str]]:
+    """Convert parsed configuration entries to a name-to-values mapping."""
+    mapping: dict[str, list[str]] = {}
+    for entry in entries:
+        mapping[entry.name] = entry.values
+    return mapping

plocate/constants.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""Shared constants."""
+# Database location used when no database path is given on the command line.
+DEFAULT_DATABASE_PATH = "/var/lib/plocate/plocate.db"

plocate/database.py ADDED Viewed

@@ -0,0 +1,284 @@
+"""Core plocate.db reader."""
+import collections.abc
+import os
+import typing
+import zstandard
+import plocate.binary_reader
+import plocate.config
+import plocate.directory_data
+import plocate.errors
+import plocate.filename_index
+import plocate.header
+import plocate.indexed_entry
+import plocate.stats
+import plocate.trigram_index
+class PlocateDatabase:
+    """Reader for a plocate.db index file."""
+    def __init__(self, file_object: typing.BinaryIO, *, path: str | None = None) -> None:
+        """Open a plocate database from a readable binary file object."""
+        self._reader = plocate.binary_reader.BinaryReader(file_object)
+        self._path = path
+        self._decompressor: zstandard.ZstdDecompressor | None = None
+        self._filename_offsets: tuple[int, ...] | None = None
+        self._directory_time_entries: tuple[plocate.directory_data.DirectoryTimeEntry, ...] | None = None
+        self._directory_time_entries_loaded = False
+        self._trigram_index: plocate.trigram_index.TrigramIndex | None = None
+        self._trigram_index_loaded = False
+        # Parse the fixed header and prepare decompression.
+        header_bytes = self._reader.read_bytes(0, plocate.header.HEADER_STRUCT.size)
+        try:
+            self.header = plocate.header.PlocateHeader.from_bytes(header_bytes)
+        except ValueError as error:
+            raise plocate.errors.PlocateFormatError(str(error)) from error
+        dictionary_bytes = self._load_dictionary_bytes()
+        self._decompressor = plocate.filename_index.build_zstd_decompressor(dictionary_bytes)
+    @classmethod
+    def open(cls, path: str) -> typing.Self:
+        """Open a plocate database file from its path."""
+        file_object = open(path, "rb")
+        database = cls(file_object, path=path)
+        return database
+    @property
+    def path(self) -> str | None:
+        """Return the filesystem path passed to open(), if any."""
+        return self._path
+    @property
+    def file_size(self) -> int:
+        """Return the on-disk size of the database file in bytes."""
+        return self._reader.file_size
+    def file_mtime(self) -> float:
+        """Return the filesystem modification time for this open database in seconds."""
+        if self._path is None:
+            message = "database file modification time requires a filesystem path"
+            raise plocate.errors.PlocateDatabaseError(message)
+        stat_result = os.stat(self._path)
+        return stat_result.st_mtime
+    def close(self) -> None:
+        """Close the underlying database file."""
+        self._reader.close()
+    def __enter__(self) -> typing.Self:
+        """Enter a context manager that closes on exit."""
+        return self
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Close the database when leaving a context manager."""
+        self.close()
+    def _load_dictionary_bytes(self) -> bytes | None:
+        """Return the embedded zstd dictionary bytes, if present."""
+        if self.header.version == 0:
+            return None
+        if self.header.zstd_dictionary_length_bytes == 0:
+            return None
+        dictionary_bytes = self._reader.read_bytes(
+            self.header.zstd_dictionary_offset_bytes,
+            self.header.zstd_dictionary_length_bytes,
+        )
+        return dictionary_bytes
+    def filename_block_offsets(self) -> tuple[int, ...]:
+        """Return the cached filename block offset table."""
+        if self._filename_offsets is not None:
+            return self._filename_offsets
+        # Read and parse the uint64 offset index.
+        index_length = (self.header.num_docids + 1) * 8
+        index_bytes = self._reader.read_bytes(
+            self.header.filename_index_offset_bytes,
+            index_length,
+        )
+        offsets = plocate.filename_index.read_filename_block_offsets(
+            index_bytes,
+            self.header.num_docids,
+        )
+        self._filename_offsets = offsets
+        return self._filename_offsets
+    def read_configuration_block(self) -> list[plocate.config.ConfigurationEntry]:
+        """Return updatedb configuration entries stored in the database."""
+        if self.header.max_version < 2 or self.header.conf_block_length_bytes == 0:
+            return []
+        block_bytes = self._reader.read_bytes(
+            self.header.conf_block_offset_bytes,
+            self.header.conf_block_length_bytes,
+        )
+        entries = plocate.config.parse_configuration_block(block_bytes)
+        return entries
+    def _load_directory_time_entries(
+        self,
+    ) -> tuple[plocate.directory_data.DirectoryTimeEntry, ...] | None:
+        """Return parsed directory timestamp entries, if present."""
+        if self._directory_time_entries_loaded:
+            return self._directory_time_entries
+        self._directory_time_entries_loaded = True
+        if self.header.max_version < 2:
+            return None
+        if self.header.directory_data_length_bytes == 0:
+            return None
+        # Read and decompress the parallel directory timestamp stream.
+        compressed = self._reader.read_bytes(
+            self.header.directory_data_offset_bytes,
+            self.header.directory_data_length_bytes,
+        )
+        decompressed = plocate.directory_data.decompress_directory_data_bytes(compressed)
+        entries = plocate.directory_data.parse_directory_time_entries(decompressed)
+        self._directory_time_entries = entries
+        return self._directory_time_entries
+    def _load_trigram_index(self) -> plocate.trigram_index.TrigramIndex | None:
+        """Return the parsed trigram index when present."""
+        if self._trigram_index_loaded:
+            return self._trigram_index
+        self._trigram_index_loaded = True
+        hash_table_offset = self.header.hash_table_offset_bytes
+        hash_table_size = self.header.hashtable_size
+        extra_hash_slots = self.header.extra_ht_slots
+        entry_count = hash_table_size + extra_hash_slots + 1
+        table_length = entry_count * plocate.trigram_index.TRIGRAM_STRUCT.size
+        if hash_table_offset + table_length > self.file_size:
+            return None
+        # Read and parse the trigram hash table when it is present on disk.
+        table_bytes = self._reader.read_bytes(hash_table_offset, table_length)
+        table_entries = plocate.trigram_index.parse_trigram_table(table_bytes)
+        self._trigram_index = plocate.trigram_index.TrigramIndex(
+            self._reader,
+            table_entries,
+            hash_table_size=hash_table_size,
+            extra_hash_slots=extra_hash_slots,
+        )
+        return self._trigram_index
+    def has_trigram_index(self) -> bool:
+        """Return whether this database contains a readable trigram index."""
+        trigram_index = self._load_trigram_index()
+        has_index = trigram_index is not None
+        return has_index
+    def trigram_index(self) -> plocate.trigram_index.TrigramIndex | None:
+        """Return the parsed trigram index when present."""
+        return self._load_trigram_index()
+    def read_filename_block(self, docid: int) -> list[str]:
+        """Return decompressed paths for one filename block docid."""
+        offsets = self.filename_block_offsets()
+        start = offsets[docid]
+        end = offsets[docid + 1]
+        compressed = self._reader.read_bytes(start, end - start)
+        assert self._decompressor is not None
+        block_paths = plocate.filename_index.decompress_filename_block(compressed, self._decompressor)
+        return block_paths
+    def iter_filename_blocks(self) -> collections.abc.Iterator[list[str]]:
+        """Yield decompressed path lists for each filename block."""
+        offsets = self.filename_block_offsets()
+        assert self._decompressor is not None
+        docid_indices = range(self.header.num_docids)
+        for docid in docid_indices:
+            block_paths = self.read_filename_block(docid)
+            yield block_paths
+    def iter_paths(self) -> collections.abc.Iterator[str]:
+        """Yield every indexed path in document order."""
+        blocks = self.iter_filename_blocks()
+        for block_paths in blocks:
+            for path in block_paths:
+                yield path
+    def iter_indexed_entries(self) -> collections.abc.Iterator[plocate.indexed_entry.IndexedEntry]:
+        """Yield indexed paths with docid, header, and directory metadata."""
+        directory_time_entries = self._load_directory_time_entries()
+        directory_time_index = 0
+        docid = 0
+        # Walk filename blocks and pair each path with metadata in order.
+        blocks = self.iter_filename_blocks()
+        for block_paths in blocks:
+            block_index = 0
+            for path in block_paths:
+                directory_time = None
+                if directory_time_entries is not None:
+                    if directory_time_index >= len(directory_time_entries):
+                        message = "directory timestamp stream ended before indexed paths"
+                        raise plocate.errors.PlocateFormatError(message)
+                    directory_time = directory_time_entries[directory_time_index]
+                    directory_time_index += 1
+                entry = plocate.indexed_entry.IndexedEntry(
+                    path=path,
+                    docid=docid,
+                    block_index=block_index,
+                    database_version=self.header.version,
+                    max_version=self.header.max_version,
+                    check_visibility=self.header.check_visibility,
+                    directory_time=directory_time,
+                )
+                yield entry
+                block_index += 1
+            docid += 1
+        if directory_time_entries is not None and directory_time_index != len(directory_time_entries):
+            message = "directory timestamp stream has {extra_count} extra entries".format(
+                extra_count=len(directory_time_entries) - directory_time_index,
+            )
+            raise plocate.errors.PlocateFormatError(message)
+    def statistics(self) -> plocate.stats.DatabaseStatistics:
+        """Collect summary statistics for this database."""
+        statistics = plocate.stats.collect_statistics(self)
+        return statistics

plocate/directory_data.py ADDED Viewed

@@ -0,0 +1,115 @@
+"""Directory timestamp stream parsing for plocate databases."""
+import collections.abc
+import dataclasses
+import io
+import struct
+import zstandard
+import plocate.errors
+# Marker byte for a non-directory entry; no body follows it.
+DIRECTORY_TIME_FILE_MARKER = 0
+# Marker byte for a directory entry; a packed timestamp body follows it.
+DIRECTORY_TIME_DIRECTORY_MARKER = 1
+# Directory entry body: little-endian signed 64-bit seconds, then signed 32-bit nanoseconds.
+DIRECTORY_TIME_DIRECTORY_BODY_STRUCT = struct.Struct("<qi")
+@dataclasses.dataclass(frozen=True, slots=True)
+class DirectoryTimeEntry:
+    """Directory timestamp metadata aligned with one indexed path."""
+    # True for entries written with the directory marker, False for file entries.
+    is_directory: bool
+    # Timestamp seconds; the parser sets it only for directory entries.
+    seconds: int | None = None
+    # Timestamp nanoseconds; the parser sets it only for directory entries.
+    nanoseconds: int | None = None
+def _encode_directory_time_entry(entry: DirectoryTimeEntry) -> bytes:
+    """Encode one directory timestamp entry for tests and fixtures."""
+    if not entry.is_directory:
+        encoded = bytes([DIRECTORY_TIME_FILE_MARKER])
+        return encoded
+    if entry.seconds is None or entry.nanoseconds is None:
+        message = "directory entries require seconds and nanoseconds"
+        raise ValueError(message)
+    encoded = bytes([DIRECTORY_TIME_DIRECTORY_MARKER])
+    encoded += DIRECTORY_TIME_DIRECTORY_BODY_STRUCT.pack(entry.seconds, entry.nanoseconds)
+    return encoded
+def _encode_directory_time_block(entries: collections.abc.Sequence[DirectoryTimeEntry]) -> bytes:
+    """Encode a directory timestamp block from ordered entries."""
+    block_parts: list[bytes] = []
+    for entry in entries:
+        encoded_entry = _encode_directory_time_entry(entry)
+        block_parts.append(encoded_entry)
+    block = b"".join(block_parts)
+    return block
+def _compress_directory_time_block(block_bytes: bytes) -> bytes:
+    """Compress a directory timestamp block using a zstd stream.
+
+    Uses a compressor with default settings and returns everything written to
+    an in-memory buffer after the frame has been flushed.
+    """
+    compressor = zstandard.ZstdCompressor()
+    buffer = io.BytesIO()
+    stream_writer = compressor.stream_writer(buffer)
+    stream_writer.write(block_bytes)
+    # FLUSH_FRAME ends the current frame so the buffer holds complete output.
+    stream_writer.flush(zstandard.FLUSH_FRAME)
+    # The buffer is read before close(). NOTE(review): close() may also close
+    # the underlying buffer depending on the zstandard version - confirm
+    # before reordering these two lines.
+    compressed = buffer.getvalue()
+    stream_writer.close()
+    return compressed
+def decompress_directory_data_bytes(compressed: bytes) -> bytes:
+    """Decompress a zstd directory timestamp stream."""
+    decompressor = zstandard.ZstdDecompressor()
+    buffer = io.BytesIO(compressed)
+    stream_reader = decompressor.stream_reader(buffer)
+    decompressed = stream_reader.read()
+    stream_reader.close()
+    return decompressed
+def parse_directory_time_entries(
+    block_bytes: bytes,
+) -> tuple[DirectoryTimeEntry, ...]:
+    """Parse decompressed directory timestamp bytes into ordered entries."""
+    entries: list[DirectoryTimeEntry] = []
+    index = 0
+    # Each entry begins with a marker byte for file versus directory.
+    while index < len(block_bytes):
+        marker = block_bytes[index]
+        index += 1
+        if marker == DIRECTORY_TIME_FILE_MARKER:
+            entry = DirectoryTimeEntry(is_directory=False)
+            entries.append(entry)
+            continue
+        if marker == DIRECTORY_TIME_DIRECTORY_MARKER:
+            if index + DIRECTORY_TIME_DIRECTORY_BODY_STRUCT.size > len(block_bytes):
+                message = "truncated directory timestamp entry at byte {index}".format(index=index - 1)
+                raise plocate.errors.PlocateFormatError(message)
+            seconds, nanoseconds = DIRECTORY_TIME_DIRECTORY_BODY_STRUCT.unpack_from(block_bytes, index)
+            index += DIRECTORY_TIME_DIRECTORY_BODY_STRUCT.size
+            entry = DirectoryTimeEntry(
+                is_directory=True,
+                seconds=seconds,
+                nanoseconds=nanoseconds,
+            )
+            entries.append(entry)
+            continue
+        message = "unsupported directory timestamp marker {marker}".format(marker=marker)
+        raise plocate.errors.PlocateFormatError(message)
+    return tuple(entries)

plocate/entrypoint/__init__.py ADDED Viewed

File without changes

plocate/entrypoint/export.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Export indexed paths from a plocate database as JSON Lines."""
+import argparse
+import sys
+import plocate.constants
+import plocate.database
+import plocate.errors
+import plocate.export
+def _build_parser() -> argparse.ArgumentParser:
+    """Build the pl_export argument parser."""
+    parser = argparse.ArgumentParser(description="Export indexed paths from a plocate database as JSON Lines.")
+    parser.add_argument(
+        "database",
+        nargs="?",
+        default=plocate.constants.DEFAULT_DATABASE_PATH,
+        help="path to plocate.db (default: {default_path})".format(
+            default_path=plocate.constants.DEFAULT_DATABASE_PATH,
+        ),
+    )
+    parser.add_argument(
+        "--include",
+        metavar="PATTERN",
+        help="export only paths matching this fnmatch pattern",
+    )
+    return parser
+def _build_export_options(arguments: argparse.Namespace) -> plocate.export.ExportOptions:
+    """Translate parsed CLI arguments into export options."""
+    options = plocate.export.ExportOptions(
+        include_pattern=arguments.include,
+    )
+    return options
+def main(argv: list[str] | None = None) -> None:
+    """Parse argv and print indexed paths as JSON Lines."""
+    parser = _build_parser()
+    arguments = parser.parse_args(argv)
+    options = _build_export_options(arguments)
+    # Open the database and stream matching export records.
+    try:
+        with plocate.database.PlocateDatabase.open(arguments.database) as database:
+            record_iterator = plocate.export.iter_export_records(database, options=options)
+            for record in record_iterator:
+                line = plocate.export.format_export_record_jsonl(record)
+                sys.stdout.write(line)
+    except (plocate.errors.PlocateDatabaseError, OSError) as error:
+        message = "pl_export: {error}".format(error=error)
+        print(message, file=sys.stderr)
+        sys.exit(1)
+    sys.exit(0)
+if __name__ == "__main__":
+    main()