plocate2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
plocate/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """Read and search plocate.db index files."""
2
+
3
+ import plocate.database
4
+ import plocate.export
5
+ import plocate.search
6
+
7
+
8
# Re-export the main classes at package level so callers can write
# ``plocate.PlocateDatabase`` instead of reaching into the submodules.
PlocateDatabase = plocate.database.PlocateDatabase
ExportOptions = plocate.export.ExportOptions
SearchOptions = plocate.search.SearchOptions

# Re-export the top-level helper functions the same way.
iter_export_records = plocate.export.iter_export_records
search_database = plocate.search.search_database

# Names exported by ``from plocate import *``.
__all__ = [
    "ExportOptions",
    "PlocateDatabase",
    "SearchOptions",
    "iter_export_records",
    "search_database",
]
@@ -0,0 +1,58 @@
1
+ """Binary file access for plocate databases."""
2
+
3
+ import mmap
4
+ import os
5
+ import typing
6
+
7
+ import plocate.errors
8
+
9
+
10
+
11
class BinaryReader:
    """Read byte ranges from an open database file, optionally via mmap.

    Reads are served from a read-only memory map when one can be created and
    from seek()/read() on the file object otherwise. Both paths verify that
    the full requested range was returned, so a read that runs past the end
    of the file fails the same way regardless of which path served it.
    """

    def __init__(self, file_object: typing.BinaryIO) -> None:
        """Attach to an open binary file and optionally map it read-only.

        Args:
            file_object: Seekable binary file. It is closed by close().
        """

        self._file_object = file_object

        # Measure file size and rewind to the start.
        file_object.seek(0, os.SEEK_END)
        self.file_size = file_object.tell()
        file_object.seek(0)
        self._mmap: mmap.mmap | None = None

        # Prefer mmap when the file object supports fileno(). Mapping is best
        # effort: when it fails (for example for in-memory streams or empty
        # files) reads fall back to the file object.
        if hasattr(file_object, "fileno"):
            try:
                self._mmap = mmap.mmap(file_object.fileno(), 0, access=mmap.ACCESS_READ)
            except (OSError, ValueError, BufferError):
                self._mmap = None

    def close(self) -> None:
        """Release mmap and close the underlying file object."""

        if self._mmap is not None:
            self._mmap.close()
            self._mmap = None
        self._file_object.close()

    def read_bytes(self, offset: int, length: int) -> bytes:
        """Read length bytes starting at offset, using mmap when available.

        Args:
            offset: Absolute byte offset of the first byte to read.
            length: Number of bytes to read; zero returns an empty result
                without touching the file.

        Returns:
            Exactly ``length`` bytes.

        Raises:
            plocate.errors.PlocateFormatError: If offset or length is
                negative, or fewer than ``length`` bytes are available.
        """

        if length == 0:
            return b""

        # Negative values would wrap around in a slice and return unrelated
        # bytes, so reject them before touching the data.
        if offset < 0 or length < 0:
            message = "invalid read of {length} bytes at offset {offset}".format(
                length=length,
                offset=offset,
            )
            raise plocate.errors.PlocateFormatError(message)

        if self._mmap is not None:
            # Slicing past the end of the map truncates silently; the length
            # check below turns that into an error.
            data = self._mmap[offset : offset + length]
        else:
            self._file_object.seek(offset)
            data = self._file_object.read(length)

        if len(data) != length:
            message = "unexpected end of file while reading {length} bytes at offset {offset}".format(
                length=length,
                offset=offset,
            )
            raise plocate.errors.PlocateFormatError(message)

        return data
plocate/config.py ADDED
@@ -0,0 +1,53 @@
1
+ """Configuration block parsing."""
2
+
3
+ import dataclasses
4
+
5
+
6
+
7
+ @dataclasses.dataclass(frozen=True, slots=True)
8
+ class ConfigurationEntry:
9
+ """One updatedb configuration variable and its ordered values."""
10
+
11
+ name: str
12
+ values: list[str]
13
+
14
+
15
def parse_configuration_block(block_bytes: bytes) -> list[ConfigurationEntry]:
    """Decode the NUL-delimited configuration block of a plocate database.

    The block is a sequence of NUL-terminated UTF-8 strings. The first string
    of an entry is the variable name, the following non-empty strings are its
    values, and an empty string closes the entry. Bytes after the last NUL
    are ignored, and an entry that is never closed is not returned.
    """

    # Splitting on NUL leaves the unterminated tail (possibly empty) as the
    # final piece; only the NUL-terminated strings take part in parsing.
    terminated_fields = block_bytes.split(b"\0")[:-1]

    parsed: list[ConfigurationEntry] = []
    pending_name: str | None = None
    pending_values: list[str] = []

    for raw_field in terminated_fields:
        text = raw_field.decode("utf-8")

        if pending_name is None:
            # First string of a new entry is its name.
            pending_name = text
            pending_values = []
        elif text == "":
            # Empty string terminates the current entry.
            parsed.append(ConfigurationEntry(name=pending_name, values=pending_values))
            pending_name = None
        else:
            pending_values.append(text)

    return parsed
44
+
45
+
46
def configuration_entries_to_mapping(entries: list[ConfigurationEntry]) -> dict[str, list[str]]:
    """Build a dictionary keyed by variable name from parsed entries.

    When a name occurs more than once, the values of the last entry win. The
    value lists are the same objects held by the entries, not copies.
    """

    return {item.name: item.values for item in entries}
plocate/constants.py ADDED
@@ -0,0 +1,3 @@
1
+ """Shared constants."""
2
+
3
# Database path used as the default for the command-line ``database`` argument.
DEFAULT_DATABASE_PATH = "/var/lib/plocate/plocate.db"
plocate/database.py ADDED
@@ -0,0 +1,284 @@
1
+ """Core plocate.db reader."""
2
+
3
+ import collections.abc
4
+ import os
5
+ import typing
6
+
7
+ import zstandard
8
+
9
+ import plocate.binary_reader
10
+ import plocate.config
11
+ import plocate.directory_data
12
+ import plocate.errors
13
+ import plocate.filename_index
14
+ import plocate.header
15
+ import plocate.indexed_entry
16
+ import plocate.stats
17
+ import plocate.trigram_index
18
+
19
+
20
+
21
class PlocateDatabase:
    """Reader for a plocate.db index file.

    The fixed header is parsed in the constructor. The filename block offset
    table, the directory timestamps, and the trigram index are loaded on
    first use and cached on the instance. Use the instance as a context
    manager, or call close(), to release the underlying file.
    """

    def __init__(self, file_object: typing.BinaryIO, *, path: str | None = None) -> None:
        """Open a plocate database from a readable binary file object.

        Args:
            file_object: Open binary file; it is closed by close().
            path: Filesystem path of the file, if known. Required only by
                file_mtime().

        Raises:
            plocate.errors.PlocateFormatError: If the file is too short to
                hold a header or the header cannot be parsed.
        """

        self._reader = plocate.binary_reader.BinaryReader(file_object)
        self._path = path
        self._decompressor: zstandard.ZstdDecompressor | None = None
        self._filename_offsets: tuple[int, ...] | None = None
        self._directory_time_entries: tuple[plocate.directory_data.DirectoryTimeEntry, ...] | None = None
        self._directory_time_entries_loaded = False
        self._trigram_index: plocate.trigram_index.TrigramIndex | None = None
        self._trigram_index_loaded = False

        # Parse the fixed header and prepare decompression.
        header_size = plocate.header.HEADER_STRUCT.size
        header_bytes = self._reader.read_bytes(0, header_size)
        # Guard against a short read before parsing so that a truncated file
        # is reported as a format error.
        if len(header_bytes) != header_size:
            message = "file is too small to contain a plocate header"
            raise plocate.errors.PlocateFormatError(message)
        try:
            self.header = plocate.header.PlocateHeader.from_bytes(header_bytes)
        except ValueError as error:
            raise plocate.errors.PlocateFormatError(str(error)) from error

        dictionary_bytes = self._load_dictionary_bytes()
        self._decompressor = plocate.filename_index.build_zstd_decompressor(dictionary_bytes)

    @classmethod
    def open(cls, path: str) -> typing.Self:
        """Open a plocate database file from its path.

        The file is closed again if the database cannot be constructed.
        """

        file_object = open(path, "rb")
        try:
            database = cls(file_object, path=path)
        except BaseException:
            # The constructor failed, so no database object exists that could
            # close the file later.
            file_object.close()
            raise

        return database

    @property
    def path(self) -> str | None:
        """Return the filesystem path passed to open(), if any."""

        return self._path

    @property
    def file_size(self) -> int:
        """Return the on-disk size of the database file in bytes."""

        return self._reader.file_size

    def file_mtime(self) -> float:
        """Return the filesystem modification time for this open database in seconds.

        Raises:
            plocate.errors.PlocateDatabaseError: If no path is known.
        """

        if self._path is None:
            message = "database file modification time requires a filesystem path"
            raise plocate.errors.PlocateDatabaseError(message)

        stat_result = os.stat(self._path)

        return stat_result.st_mtime

    def close(self) -> None:
        """Close the underlying database file."""

        self._reader.close()

    def __enter__(self) -> typing.Self:
        """Enter a context manager that closes on exit."""

        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the database when leaving a context manager."""

        self.close()

    def _load_dictionary_bytes(self) -> bytes | None:
        """Return the embedded zstd dictionary bytes, if present."""

        if self.header.version == 0:
            return None
        if self.header.zstd_dictionary_length_bytes == 0:
            return None

        dictionary_bytes = self._reader.read_bytes(
            self.header.zstd_dictionary_offset_bytes,
            self.header.zstd_dictionary_length_bytes,
        )

        return dictionary_bytes

    def filename_block_offsets(self) -> tuple[int, ...]:
        """Return the cached filename block offset table."""

        if self._filename_offsets is not None:
            return self._filename_offsets

        # Read and parse the uint64 offset index.
        index_length = (self.header.num_docids + 1) * 8
        index_bytes = self._reader.read_bytes(
            self.header.filename_index_offset_bytes,
            index_length,
        )
        offsets = plocate.filename_index.read_filename_block_offsets(
            index_bytes,
            self.header.num_docids,
        )
        self._filename_offsets = offsets

        return self._filename_offsets

    def read_configuration_block(self) -> list[plocate.config.ConfigurationEntry]:
        """Return updatedb configuration entries stored in the database."""

        if self.header.max_version < 2 or self.header.conf_block_length_bytes == 0:
            return []

        block_bytes = self._reader.read_bytes(
            self.header.conf_block_offset_bytes,
            self.header.conf_block_length_bytes,
        )
        entries = plocate.config.parse_configuration_block(block_bytes)

        return entries

    def _read_directory_time_entries(
        self,
    ) -> tuple[plocate.directory_data.DirectoryTimeEntry, ...] | None:
        """Read and parse the directory timestamp stream without caching."""

        if self.header.max_version < 2:
            return None
        if self.header.directory_data_length_bytes == 0:
            return None

        # Read and decompress the parallel directory timestamp stream.
        compressed = self._reader.read_bytes(
            self.header.directory_data_offset_bytes,
            self.header.directory_data_length_bytes,
        )
        decompressed = plocate.directory_data.decompress_directory_data_bytes(compressed)

        return plocate.directory_data.parse_directory_time_entries(decompressed)

    def _load_directory_time_entries(
        self,
    ) -> tuple[plocate.directory_data.DirectoryTimeEntry, ...] | None:
        """Return parsed directory timestamp entries, if present.

        The result, including "not present", is cached after the first
        successful load. A load that raises is not cached, so the error is
        reported again on the next call instead of turning into None.
        """

        if not self._directory_time_entries_loaded:
            self._directory_time_entries = self._read_directory_time_entries()
            # Mark as loaded only once reading and parsing have succeeded.
            self._directory_time_entries_loaded = True

        return self._directory_time_entries

    def _read_trigram_index(self) -> plocate.trigram_index.TrigramIndex | None:
        """Read and parse the trigram hash table without caching."""

        hash_table_offset = self.header.hash_table_offset_bytes
        hash_table_size = self.header.hashtable_size
        extra_hash_slots = self.header.extra_ht_slots
        entry_count = hash_table_size + extra_hash_slots + 1
        table_length = entry_count * plocate.trigram_index.TRIGRAM_STRUCT.size
        # A table that does not fit inside the file is treated as absent.
        if hash_table_offset + table_length > self.file_size:
            return None

        # Read and parse the trigram hash table when it is present on disk.
        table_bytes = self._reader.read_bytes(hash_table_offset, table_length)
        table_entries = plocate.trigram_index.parse_trigram_table(table_bytes)

        return plocate.trigram_index.TrigramIndex(
            self._reader,
            table_entries,
            hash_table_size=hash_table_size,
            extra_hash_slots=extra_hash_slots,
        )

    def _load_trigram_index(self) -> plocate.trigram_index.TrigramIndex | None:
        """Return the parsed trigram index when present.

        The result is cached after the first successful load; a load that
        raises is not cached.
        """

        if not self._trigram_index_loaded:
            self._trigram_index = self._read_trigram_index()
            # Mark as loaded only once reading and parsing have succeeded.
            self._trigram_index_loaded = True

        return self._trigram_index

    def has_trigram_index(self) -> bool:
        """Return whether this database contains a readable trigram index."""

        return self._load_trigram_index() is not None

    def trigram_index(self) -> plocate.trigram_index.TrigramIndex | None:
        """Return the parsed trigram index when present."""

        return self._load_trigram_index()

    def read_filename_block(self, docid: int) -> list[str]:
        """Return decompressed paths for one filename block docid.

        Raises:
            IndexError: If docid does not name a block in the offset table.
            plocate.errors.PlocateFormatError: If the offset table gives the
                block a negative length.
        """

        offsets = self.filename_block_offsets()
        block_count = len(offsets) - 1
        # Reject negative docids explicitly: tuple indexing would otherwise
        # wrap around and pair unrelated offsets.
        if not 0 <= docid < block_count:
            message = "docid {docid} is out of range for {block_count} filename blocks".format(
                docid=docid,
                block_count=block_count,
            )
            raise IndexError(message)

        start = offsets[docid]
        end = offsets[docid + 1]
        if end < start:
            message = "filename block {docid} has a negative length".format(docid=docid)
            raise plocate.errors.PlocateFormatError(message)

        compressed = self._reader.read_bytes(start, end - start)
        assert self._decompressor is not None
        block_paths = plocate.filename_index.decompress_filename_block(compressed, self._decompressor)

        return block_paths

    def iter_filename_blocks(self) -> collections.abc.Iterator[list[str]]:
        """Yield decompressed path lists for each filename block."""

        # Load the offset table up front so that a malformed index fails
        # before any block is yielded.
        self.filename_block_offsets()

        for docid in range(self.header.num_docids):
            yield self.read_filename_block(docid)

    def iter_paths(self) -> collections.abc.Iterator[str]:
        """Yield every indexed path in document order."""

        for block_paths in self.iter_filename_blocks():
            yield from block_paths

    def iter_indexed_entries(self) -> collections.abc.Iterator[plocate.indexed_entry.IndexedEntry]:
        """Yield indexed paths with docid, header, and directory metadata.

        Raises:
            plocate.errors.PlocateFormatError: If the directory timestamp
                stream has fewer or more entries than there are paths.
        """

        directory_time_entries = self._load_directory_time_entries()
        directory_time_index = 0

        # Walk filename blocks and pair each path with metadata in order.
        for docid, block_paths in enumerate(self.iter_filename_blocks()):
            for block_index, path in enumerate(block_paths):
                directory_time = None
                if directory_time_entries is not None:
                    if directory_time_index >= len(directory_time_entries):
                        message = "directory timestamp stream ended before indexed paths"
                        raise plocate.errors.PlocateFormatError(message)
                    directory_time = directory_time_entries[directory_time_index]
                    directory_time_index += 1

                yield plocate.indexed_entry.IndexedEntry(
                    path=path,
                    docid=docid,
                    block_index=block_index,
                    database_version=self.header.version,
                    max_version=self.header.max_version,
                    check_visibility=self.header.check_visibility,
                    directory_time=directory_time,
                )

        # Every timestamp entry must have been consumed by exactly one path.
        if directory_time_entries is not None and directory_time_index != len(directory_time_entries):
            message = "directory timestamp stream has {extra_count} extra entries".format(
                extra_count=len(directory_time_entries) - directory_time_index,
            )
            raise plocate.errors.PlocateFormatError(message)

    def statistics(self) -> plocate.stats.DatabaseStatistics:
        """Collect summary statistics for this database."""

        return plocate.stats.collect_statistics(self)
@@ -0,0 +1,115 @@
1
+ """Directory timestamp stream parsing for plocate databases."""
2
+
3
+ import collections.abc
4
+ import dataclasses
5
+ import io
6
+ import struct
7
+
8
+ import zstandard
9
+
10
+ import plocate.errors
11
+
12
+
13
# Marker byte that introduces an entry for a non-directory path.
DIRECTORY_TIME_FILE_MARKER = 0
# Marker byte that introduces a directory entry; a timestamp body follows it.
DIRECTORY_TIME_DIRECTORY_MARKER = 1
# Directory entry body: little-endian int64 seconds, then int32 nanoseconds.
DIRECTORY_TIME_DIRECTORY_BODY_STRUCT = struct.Struct("<qi")
16
+
17
+
18
+ @dataclasses.dataclass(frozen=True, slots=True)
19
+ class DirectoryTimeEntry:
20
+ """Directory timestamp metadata aligned with one indexed path."""
21
+
22
+ is_directory: bool
23
+ seconds: int | None = None
24
+ nanoseconds: int | None = None
25
+
26
+
27
def _encode_directory_time_entry(entry: DirectoryTimeEntry) -> bytes:
    """Serialize one timestamp entry to bytes; intended for tests and fixtures.

    A non-directory entry is the file marker byte alone. A directory entry is
    the directory marker byte followed by the packed seconds and nanoseconds.

    Raises:
        ValueError: If a directory entry lacks seconds or nanoseconds.
    """

    if entry.is_directory:
        if entry.seconds is None or entry.nanoseconds is None:
            message = "directory entries require seconds and nanoseconds"
            raise ValueError(message)

        marker = bytes([DIRECTORY_TIME_DIRECTORY_MARKER])
        body = DIRECTORY_TIME_DIRECTORY_BODY_STRUCT.pack(entry.seconds, entry.nanoseconds)

        return marker + body

    return bytes([DIRECTORY_TIME_FILE_MARKER])
42
+
43
+
44
+ def _encode_directory_time_block(entries: collections.abc.Sequence[DirectoryTimeEntry]) -> bytes:
45
+ """Encode a directory timestamp block from ordered entries."""
46
+
47
+ block_parts: list[bytes] = []
48
+ for entry in entries:
49
+ encoded_entry = _encode_directory_time_entry(entry)
50
+ block_parts.append(encoded_entry)
51
+ block = b"".join(block_parts)
52
+
53
+ return block
54
+
55
+
56
def _compress_directory_time_block(block_bytes: bytes) -> bytes:
    """Compress a directory timestamp block using a zstd stream.

    Args:
        block_bytes: Uncompressed, already encoded directory timestamp block.

    Returns:
        The compressed bytes collected from an in-memory buffer.
    """

    compressor = zstandard.ZstdCompressor()
    buffer = io.BytesIO()
    stream_writer = compressor.stream_writer(buffer)
    stream_writer.write(block_bytes)
    # Flush with FLUSH_FRAME before the output is collected.
    # NOTE(review): presumably this ends the zstd frame so the buffer holds a
    # complete frame — confirm against the zstandard documentation.
    stream_writer.flush(zstandard.FLUSH_FRAME)
    # The output is copied before close() is called.
    # NOTE(review): presumably close() also closes the destination buffer,
    # which would make getvalue() fail afterwards — confirm against the
    # installed zstandard version before reordering these two lines.
    compressed = buffer.getvalue()
    stream_writer.close()

    return compressed
68
+
69
+
70
def decompress_directory_data_bytes(compressed: bytes) -> bytes:
    """Inflate the zstd-compressed directory timestamp stream.

    The compressed bytes are wrapped in an in-memory file and read back
    through a zstd stream reader, which is closed before returning.
    """

    reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(compressed))
    payload = reader.read()
    reader.close()

    return payload
80
+
81
+
82
def parse_directory_time_entries(
    block_bytes: bytes,
) -> tuple[DirectoryTimeEntry, ...]:
    """Decode the decompressed timestamp stream into a tuple of entries.

    Every entry starts with one marker byte. The file marker stands alone;
    the directory marker is followed by a packed seconds/nanoseconds body.

    Raises:
        plocate.errors.PlocateFormatError: If a directory entry is cut off
            before its body ends, or a marker byte is not recognized.
    """

    total = len(block_bytes)
    parsed: list[DirectoryTimeEntry] = []
    position = 0

    while position < total:
        # Remember where the entry began so errors can point at its marker.
        marker_position = position
        marker = block_bytes[marker_position]
        position += 1

        if marker == DIRECTORY_TIME_FILE_MARKER:
            parsed.append(DirectoryTimeEntry(is_directory=False))
        elif marker == DIRECTORY_TIME_DIRECTORY_MARKER:
            body_size = DIRECTORY_TIME_DIRECTORY_BODY_STRUCT.size
            if position + body_size > total:
                message = "truncated directory timestamp entry at byte {index}".format(index=marker_position)
                raise plocate.errors.PlocateFormatError(message)

            seconds, nanoseconds = DIRECTORY_TIME_DIRECTORY_BODY_STRUCT.unpack_from(block_bytes, position)
            position += body_size
            parsed.append(
                DirectoryTimeEntry(
                    is_directory=True,
                    seconds=seconds,
                    nanoseconds=nanoseconds,
                )
            )
        else:
            message = "unsupported directory timestamp marker {marker}".format(marker=marker)
            raise plocate.errors.PlocateFormatError(message)

    return tuple(parsed)
File without changes
@@ -0,0 +1,68 @@
1
+ """Export indexed paths from a plocate database as JSON Lines."""
2
+
3
+ import argparse
4
+ import sys
5
+
6
+ import plocate.constants
7
+ import plocate.database
8
+ import plocate.errors
9
+ import plocate.export
10
+
11
+
12
+
13
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for pl_export.

    The parser accepts an optional positional database path, which falls back
    to the package default, and an optional --include pattern.
    """

    default_path = plocate.constants.DEFAULT_DATABASE_PATH
    database_help = "path to plocate.db (default: {default_path})".format(default_path=default_path)

    parser = argparse.ArgumentParser(
        description="Export indexed paths from a plocate database as JSON Lines.",
    )
    parser.add_argument("database", nargs="?", default=default_path, help=database_help)
    parser.add_argument(
        "--include",
        metavar="PATTERN",
        help="export only paths matching this fnmatch pattern",
    )

    return parser
32
+
33
+
34
def _build_export_options(arguments: argparse.Namespace) -> plocate.export.ExportOptions:
    """Map the parsed command-line namespace onto an ExportOptions value."""

    return plocate.export.ExportOptions(include_pattern=arguments.include)
42
+
43
+
44
def main(argv: list[str] | None = None) -> None:
    """Run the pl_export command and write one JSON Lines record per path.

    The process exits with status 0 on success. When the database raises
    PlocateDatabaseError or an OSError occurs, the error is printed to stderr
    with a ``pl_export:`` prefix and the process exits with status 1.
    """

    arguments = _build_parser().parse_args(argv)
    options = _build_export_options(arguments)

    # Stream records straight to stdout while the database is open.
    try:
        with plocate.database.PlocateDatabase.open(arguments.database) as database:
            for record in plocate.export.iter_export_records(database, options=options):
                sys.stdout.write(plocate.export.format_export_record_jsonl(record))
    except (plocate.errors.PlocateDatabaseError, OSError) as error:
        print("pl_export: {error}".format(error=error), file=sys.stderr)
        sys.exit(1)

    sys.exit(0)
65
+
66
+
67
+ if __name__ == "__main__":
68
+ main()