dfindexeddb-20240224-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Parser for LevelDB Table (.ldb) files."""
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+ import io
20
+ import os
21
+ from typing import BinaryIO, Iterable, Tuple
22
+
23
+ import snappy
24
+
25
+ from dfindexeddb import utils
26
+
27
+
28
@dataclass
class LdbKeyValueRecord:
  """A leveldb table key-value record.

  Attributes:
    offset: the offset of the record.
    key: the (user) key of the record.
    value: the value of the record.
    sequence_number: the sequence number of the record.
    type: the type of the record.
  """
  offset: int
  key: bytes
  value: bytes
  sequence_number: int
  type: int

  PACKED_SEQUENCE_AND_TYPE_LENGTH = 8
  SEQUENCE_LENGTH = 7
  TYPE_LENGTH = 1

  @classmethod
  def FromDecoder(
      cls, decoder: utils.LevelDBDecoder, block_offset: int, shared_key: bytes
  ) -> Tuple[LdbKeyValueRecord, bytes]:
    """Decodes a ldb key value record.

    Args:
      decoder: the leveldb decoder.
      block_offset: the block offset.
      shared_key: the shared key bytes.

    Returns:
      A tuple of the parsed LdbKeyValueRecord and the updated shared key bytes.
    """
    offset, shared_bytes = decoder.DecodeUint32Varint()
    _, unshared_bytes = decoder.DecodeUint32Varint()
    _, value_length = decoder.DecodeUint32Varint()
    _, key_delta = decoder.ReadBytes(unshared_bytes)
    _, value = decoder.ReadBytes(value_length)

    # Reconstruct the full internal key from the shared prefix of the
    # previous key plus the unshared delta bytes.
    shared_key = shared_key[:shared_bytes] + key_delta

    # The internal key ends with 8 packed bytes: 1 byte of type followed by
    # 7 bytes of sequence number (little-endian).  The user key is whatever
    # precedes that trailer.
    key = shared_key[:-cls.PACKED_SEQUENCE_AND_TYPE_LENGTH]
    # Bug fix: the sequence number must be read from the trailer of the
    # internal key (shared_key), not from the tail of the user key.
    sequence_number = int.from_bytes(
        shared_key[-cls.SEQUENCE_LENGTH:], byteorder='little', signed=False)
    key_type = shared_key[-cls.PACKED_SEQUENCE_AND_TYPE_LENGTH]

    return cls(offset + block_offset, key, value, sequence_number,
        key_type), shared_key
77
+
78
+
79
@dataclass
class LdbBlock:
  """A leveldb table block.

  Attributes:
    offset: the offset of the block handle that referenced this block.
    block_offset: the offset in the ldb file where the block data starts.
    length: the length of the block data in bytes.
    data: the raw, possibly compressed, block data.
    footer: the 5 byte block trailer (1 byte compression flag followed by a
        4 byte checksum).
  """
  offset: int
  block_offset: int
  length: int
  data: bytes = field(repr=False)
  footer: bytes  # 5 bytes = 1 byte compressed flag + 4 bytes checksum.

  COMPRESSED = 1
  RESTART_ENTRY_LENGTH = 4

  def IsCompressed(self) -> bool:
    """Returns true if the block is compressed."""
    return self.footer[0] == self.COMPRESSED

  def GetBuffer(self) -> bytes:
    """Returns the block buffer, decompressing if required."""
    if self.IsCompressed():
      return snappy.decompress(self.data)
    return self.data

  def GetRecords(self) -> Iterable[LdbKeyValueRecord]:
    """Returns an iterator over the key value records in the block.

    Yields:
      LdbKeyValueRecords
    """
    # get underlying block content, decompressing if required
    buffer = self.GetBuffer()
    decoder = utils.LevelDBDecoder(io.BytesIO(buffer))

    # trailer of a block has the form:
    #     restarts: uint32[num_restarts]
    #     num_restarts: uint32
    decoder.stream.seek(-self.RESTART_ENTRY_LENGTH, os.SEEK_END)
    _, num_restarts = decoder.DecodeUint32()
    restarts_offset = (
        decoder.stream.tell()) - (num_restarts + 1) * self.RESTART_ENTRY_LENGTH

    # The first restart entry holds the offset of the first key-value record;
    # parsing starts there with an empty shared key.
    decoder.stream.seek(restarts_offset)
    _, offset = decoder.DecodeUint32()
    decoder.stream.seek(offset)
    key = b''

    # Records are parsed sequentially up to the start of the restart array;
    # each record's key is a delta against the previous key.
    while decoder.stream.tell() < restarts_offset:
      key_value_record, key = LdbKeyValueRecord.FromDecoder(
          decoder, self.block_offset, key)
      yield key_value_record

    # TODO: parse trailer of the block for restart points (where the full
    # key is stored to allow for binary lookup). It's not needed at this time
    # since we are sequentially iterating over the records in the block/file.
137
+
138
+
139
@dataclass
class BlockHandle:
  """A handle to a block in the ldb file.

  Attributes:
    offset: the offset of the block handle.
    block_offset: the offset of the block.
    length: the length of the block.
  """
  offset: int
  block_offset: int
  length: int

  BLOCK_TRAILER_SIZE = 5

  def Load(self, stream: BinaryIO) -> LdbBlock:
    """Loads the block this handle refers to.

    Args:
      stream: the binary stream of the ldb file.

    Returns:
      a LdbBlock.

    Raises:
      ValueError: if it could not read all of the block or block footer.
    """
    stream.seek(self.block_offset, os.SEEK_SET)

    block_data = stream.read(self.length)
    if len(block_data) != self.length:
      raise ValueError('Could not read all of the block')

    # The block data is immediately followed by its 5 byte trailer.
    trailer = stream.read(self.BLOCK_TRAILER_SIZE)
    if len(trailer) != self.BLOCK_TRAILER_SIZE:
      raise ValueError('Could not read all of the block footer')

    return LdbBlock(
        self.offset, self.block_offset, self.length, block_data, trailer)

  @classmethod
  def FromStream(cls, stream: BinaryIO, base_offset: int = 0) -> BlockHandle:
    """Reads a block handle from a binary stream.

    Args:
      stream: the binary stream.
      base_offset: the base offset.

    Returns:
      A BlockHandle.
    """
    varint_decoder = utils.LevelDBDecoder(stream)
    handle_offset, block_offset = varint_decoder.DecodeUint64Varint()
    _, block_length = varint_decoder.DecodeUint64Varint()
    return cls(handle_offset + base_offset, block_offset, block_length)
192
+
193
+
194
class LdbFileReader:
  """A leveldb table (.ldb or .sst) file reader.

  A LdbFileReader provides read-only sequential iteration of serialized
  structures in a leveldb ldb file. These structures include:
  * blocks (LdbBlock)
  * records (LdbKeyValueRecord)

  Attributes:
    filename: the .ldb filename.
    index_block: the table's index block, parsed at construction time.
  """

  # Size in bytes of the fixed-length footer at the end of the file.
  FOOTER_SIZE = 48
  # Trailing magic number identifying a leveldb table file.
  MAGIC = b'\x57\xfb\x80\x8b\x24\x75\x47\xdb'

  def __init__(self, filename: str):
    """Initializes the LdbFileReader.

    Args:
      filename: the .ldb filename.

    Raises:
      ValueError: if the file has an invalid magic number at the end.
    """
    self.filename = filename
    with open(self.filename, 'rb') as fh:
      # Validate the trailing magic number before parsing anything else.
      fh.seek(-len(self.MAGIC), os.SEEK_END)
      if fh.read(len(self.MAGIC)) != self.MAGIC:
        raise ValueError(f'Invalid magic number in {self.filename}')

      fh.seek(-self.FOOTER_SIZE, os.SEEK_END)
      # The footer stores two varint-encoded block handles back to back, so
      # the meta handle must be parsed (and discarded) to reach the index
      # handle.
      # meta_handle, need to read first due to variable integers
      _ = BlockHandle.FromStream(fh)
      index_handle = BlockHandle.FromStream(fh)

      # self.meta_block = meta_handle.load(fh) # TODO: support meta blocks
      self.index_block = index_handle.Load(fh)

  def GetBlocks(self) -> Iterable[LdbBlock]:
    """Returns an iterator of LdbBlocks.

    Each record in the index block holds a serialized block handle in its
    value, which points at a data block in the file.

    Yields:
      LdbBlock.
    """
    with open(self.filename, 'rb') as fh:
      for key_value_record in self.index_block.GetRecords():
        block_handle = BlockHandle.FromStream(
            io.BytesIO(key_value_record.value),
            base_offset=key_value_record.offset)
        yield block_handle.Load(fh)

  def GetKeyValueRecords(self) -> Iterable[LdbKeyValueRecord]:
    """Returns an iterator of LdbKeyValueRecords.

    Yields:
      LdbKeyValueRecords.
    """
    for block in self.GetBlocks():
      for record in block.GetRecords():
        yield record

  def RangeIter(self) -> Iterable[Tuple[bytes, bytes]]: #pylint: disable=C0103
    """Returns an iterator of key-value pairs.

    Yields:
      A tuple of key and value as bytes.
    """
    for record in self.GetKeyValueRecords():
      yield (record.key, record.value)
@@ -0,0 +1,308 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Parser for LevelDB Log (.log) files."""
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+ from enum import IntEnum
20
+ import io
21
+ from typing import BinaryIO, Generator, Iterable, Optional
22
+
23
+ from dfindexeddb import errors
24
+ from dfindexeddb import utils
25
+
26
+
27
class LogFilePhysicalRecordType(IntEnum):
  """LevelDB log file physical record types."""
  # The record holds a complete write batch by itself.
  FULL = 1
  # The first fragment of a write batch spanning multiple physical records.
  FIRST = 2
  # A middle fragment of a write batch spanning multiple physical records.
  MIDDLE = 3
  # The final fragment of a write batch spanning multiple physical records.
  LAST = 4
33
+
34
+
35
@dataclass
class ParsedInternalKey:
  """An internal key record from a leveldb log file.

  Attributes:
    offset: the offset of the record.
    type: the record type (DELETED_TYPE or VALUE_TYPE).
    key: the record key.
    value: the record value (empty for a deletion record).
  """
  offset: int
  type: int
  key: bytes
  value: bytes

  # Record type constants; presumably leveldb's kTypeDeletion/kTypeValue —
  # named here to replace the previous magic numbers.
  DELETED_TYPE = 0
  VALUE_TYPE = 1

  @classmethod
  def FromDecoder(
      cls,
      decoder: utils.LevelDBDecoder,
      base_offset: int = 0
  ) -> ParsedInternalKey:
    """Decodes an internal key value record.

    Args:
      decoder: the leveldb decoder.
      base_offset: the base offset for the parsed key value record.

    Returns:
      a ParsedInternalKey

    Raises:
      ValueError: if there is an invalid record type encountered.
    """
    offset, record_type = decoder.DecodeUint8()
    _, key = decoder.DecodeBlobWithLength()
    if record_type == cls.VALUE_TYPE:
      _, value = decoder.DecodeBlobWithLength()
    elif record_type == cls.DELETED_TYPE:
      # Deletion records carry no value.
      value = b''
    else:
      raise ValueError(f'Invalid record type {record_type}')
    return cls(base_offset + offset, record_type, key, value)
77
+
78
+
79
@dataclass
class WriteBatch:
  """A write batch from a leveldb log file.

  Attributes:
    offset: the batch offset.
    sequence_number: the batch sequence number.
    count: the number of ParsedInternalKey in the batch.
    records: the ParsedInternalKey parsed from the batch.
  """
  offset: int
  sequence_number: int
  count: int
  records: Iterable[ParsedInternalKey] = field(repr=False)

  @classmethod
  def FromStream(
      cls, stream: BinaryIO, base_offset: int = 0
  ) -> WriteBatch:
    """Parses a WriteBatch from a binary stream.

    Args:
      stream: the binary stream to be parsed.
      base_offset: the base offset of the Block from which the data is
          read from.

    Returns:
      A WriteBatch.
    """
    decoder = utils.LevelDBDecoder(stream)
    _, sequence_number = decoder.DecodeUint64()
    _, record_count = decoder.DecodeUint32()

    # The batch header is followed by `record_count` serialized internal
    # key records.
    parsed_records = [
        ParsedInternalKey.FromDecoder(decoder, base_offset)
        for _ in range(record_count)]
    return cls(base_offset, sequence_number, record_count, parsed_records)

  @classmethod
  def FromBytes(cls, data: bytes, base_offset: int = 0) -> WriteBatch:
    """Parses a WriteBatch from bytes.

    Args:
      data: the bytes to be parsed.
      base_offset: the base offset of the Block from which the data is
          read from.

    Returns:
      A WriteBatch.
    """
    return cls.FromStream(io.BytesIO(data), base_offset)
131
+
132
+
133
@dataclass
class PhysicalRecord:
  """A physical record from a leveldb log file.

  Attributes:
    base_offset: the offset of the enclosing block in the log file.
    offset: the record offset, relative to the start of the block.
    checksum: the record checksum.
    length: the length of the record contents in bytes.
    record_type: the record type.
    contents: the record contents.
    contents_offset: the offset of where the record contents are stored,
        relative to the start of the block.
  """
  base_offset: int
  offset: int
  checksum: int
  length: int
  record_type: LogFilePhysicalRecordType
  contents: bytes = field(repr=False)
  contents_offset: int

  @classmethod
  def FromStream(
      cls, stream: BinaryIO, base_offset: int = 0) -> PhysicalRecord:
    """Parses a PhysicalRecord from a binary stream.

    Args:
      stream: the binary stream to be parsed.
      base_offset: the base offset of the WriteBatch from which the data is
        read from.

    Returns:
      A PhysicalRecord.
    """
    decoder = utils.StreamDecoder(stream)
    # A record header is: uint32 checksum, uint16 length, uint8 record type,
    # followed by `length` bytes of contents.
    offset, checksum = decoder.DecodeUint32()
    _, length = decoder.DecodeUint16()
    record_type = LogFilePhysicalRecordType(decoder.DecodeUint8()[1])
    contents_offset, contents = decoder.ReadBytes(length)
    return cls(
        base_offset=base_offset,
        offset=offset,
        checksum=checksum,
        length=length,
        record_type=record_type,
        contents=contents,
        contents_offset=contents_offset)
179
+
180
+
181
@dataclass
class Block:
  """A block from a leveldb log file.

  Attributes:
    offset: the block offset.
    data: the block data.
  """
  offset: int
  data: bytes = field(repr=False)

  BLOCK_SIZE = 32768

  def GetPhysicalRecords(self) -> Generator[PhysicalRecord, None, None]:
    """Returns a generator of LogFilePhysicalRecords from the block contents.

    Yields:
      LogFileRecord
    """
    contents = io.BytesIO(self.data)
    while True:
      try:
        record = PhysicalRecord.FromStream(contents, base_offset=self.offset)
      except errors.DecoderError:
        # No further complete physical record could be decoded.
        return
      yield record

  @classmethod
  def FromStream(cls, stream: BinaryIO) -> Optional[Block]:
    """Parses a Block from a binary stream.

    Args:
      stream: the binary stream to be parsed.

    Returns:
      the Block or None if there is no data to read from the stream.
    """
    block_offset = stream.tell()
    # A read can return fewer than BLOCK_SIZE bytes for the final, partial
    # block of the file.
    block_data = stream.read(cls.BLOCK_SIZE)
    return cls(block_offset, block_data) if block_data else None
221
+
222
+
223
class LogFileReader:
  """A leveldb log file reader.

  A LogFileReader provides read-only sequential iteration of serialized
  structures in a leveldb logfile. These structures include:
  * blocks (Block)
  * physical records (PhysicalRecord)
  * batches (WriteBatch) and
  * key/value records (ParsedInternalKey).

  Attributes:
    filename (str): the leveldb log filename.
  """

  def __init__(self, filename: str):
    """Initializes the LogFile.

    Args:
      filename: the leveldb log filename
    """
    self.filename = filename

  def GetBlocks(self) -> Generator[Block, None, None]:
    """Returns an iterator of Block instances.

    A logfile is composed of one or more blocks.

    Yields:
      a Block
    """
    with open(self.filename, 'rb') as fh:
      while True:
        block = Block.FromStream(fh)
        if not block:
          break
        yield block

  def GetPhysicalRecords(self) -> Generator[PhysicalRecord, None, None]:
    """Returns an iterator of PhysicalRecord instances.

    A block is composed of one or more physical records.

    Yields:
      PhysicalRecord
    """
    for block in self.GetBlocks():
      for physical_record in block.GetPhysicalRecords():
        yield physical_record

  def GetWriteBatches(self) -> Generator[WriteBatch, None, None]:
    """Returns an iterator of WriteBatch instances.

    Depending on the batch size, a log file batch can be spread across one or
    more physical records.

    Yields:
      WriteBatch
    """
    buffer = bytearray()
    for physical_record in self.GetPhysicalRecords():
      if physical_record.record_type == LogFilePhysicalRecordType.FULL:
        # A FULL record holds a complete batch on its own.
        buffer = physical_record.contents
        offset = physical_record.contents_offset + physical_record.base_offset
        yield WriteBatch.FromBytes(buffer, base_offset=offset)
        buffer = bytearray()
      elif physical_record.record_type == LogFilePhysicalRecordType.FIRST:
        # Start of a fragmented batch; remember where its contents begin.
        offset = physical_record.contents_offset + physical_record.base_offset
        buffer = bytearray(physical_record.contents)
      elif physical_record.record_type == LogFilePhysicalRecordType.MIDDLE:
        buffer.extend(bytearray(physical_record.contents))
      elif physical_record.record_type == LogFilePhysicalRecordType.LAST:
        # End of a fragmented batch; parse the reassembled contents.
        # NOTE(review): `offset` assumes a FIRST record was seen earlier; a
        # log that starts mid-batch would raise NameError here — confirm.
        buffer.extend(bytearray(physical_record.contents))
        yield WriteBatch.FromBytes(buffer, base_offset=offset)
        buffer = bytearray()

  def GetKeyValueRecords(self) -> Generator[ParsedInternalKey, None, None]:
    """Returns an iterator of KeyValueRecord instances.

    A batch can contain one or more key value records.

    Yields:
      KeyValueRecord
    """
    for batch in self.GetWriteBatches():
      for record in batch.records:
        yield record