dfindexeddb 20240301__py3-none-any.whl → 20240324__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,20 +15,13 @@
15
15
  """Parser for LevelDB Log (.log) files."""
16
16
  from __future__ import annotations
17
17
 
18
- from dataclasses import dataclass, field
19
- from enum import IntEnum
18
+ from dataclasses import dataclass
20
19
  import io
21
20
  from typing import BinaryIO, Generator, Iterable, Optional
22
21
 
23
- from dfindexeddb import utils
24
-
25
-
26
- class LogFilePhysicalRecordType(IntEnum):
27
- """LevelDB log file physical record types."""
28
- FULL = 1
29
- FIRST = 2
30
- MIDDLE = 3
31
- LAST = 4
22
+ from dfindexeddb import errors
23
+ from dfindexeddb.leveldb import definitions
24
+ from dfindexeddb.leveldb import utils
32
25
 
33
26
 
34
27
  @dataclass
@@ -37,12 +30,15 @@ class ParsedInternalKey:
37
30
 
38
31
  Attributes:
39
32
  offset: the offset of the record.
40
- type: the record type.
33
+ record_type: the record type.
34
+ sequence_number: the sequence number (inferred from the relative location
35
+ of the ParsedInternalKey in a WriteBatch).
41
36
  key: the record key.
42
37
  value: the record value.
43
38
  """
44
39
  offset: int
45
- type: int
40
+ record_type: definitions.InternalRecordType
41
+ sequence_number: int
46
42
  key: bytes
47
43
  value: bytes
48
44
 
@@ -50,37 +46,47 @@ class ParsedInternalKey:
50
46
  def FromDecoder(
51
47
  cls,
52
48
  decoder: utils.LevelDBDecoder,
53
- base_offset: int = 0
49
+ base_offset: int = 0,
50
+ sequence_number: int = 0,
54
51
  ) -> ParsedInternalKey:
55
52
  """Decodes an internal key value record.
56
53
 
57
54
  Args:
58
55
  decoder: the leveldb decoder.
59
- base_offset: the base offset for the parsed key value record.
56
+ base_offset: the base offset for the parsed internal key value record.
57
+ sequence_number: the sequence number for the parsed internal key value
58
+ record.
60
59
 
61
60
  Returns:
62
- a ParsedInternalKey
61
+ A ParsedInternalKey
63
62
 
64
63
  Raises:
65
64
  ValueError: if there is an invalid record type encountered.
66
65
  """
67
66
  offset, record_type = decoder.DecodeUint8()
68
67
  _, key = decoder.DecodeBlobWithLength()
69
- if record_type == 1:
68
+ record_type = definitions.InternalRecordType(record_type)
69
+
70
+ if record_type == definitions.InternalRecordType.VALUE:
70
71
  _, value = decoder.DecodeBlobWithLength()
71
- elif record_type == 0:
72
+ elif record_type == definitions.InternalRecordType.DELETED:
72
73
  value = b''
73
74
  else:
74
75
  raise ValueError(f'Invalid record type {record_type}')
75
- return cls(base_offset + offset, record_type, key, value)
76
+ return cls(
77
+ offset=base_offset + offset,
78
+ record_type=record_type,
79
+ key=key,
80
+ value=value,
81
+ sequence_number=sequence_number)
76
82
 
77
83
 
78
84
  @dataclass
79
- class WriteBatch:
85
+ class WriteBatch(utils.FromDecoderMixin):
80
86
  """A write batch from a leveldb log file.
81
87
 
82
88
  Attributes:
83
- offset: the batch offset.
89
+ offset: the write batch offset.
84
90
  sequence_number: the batch sequence number.
85
91
  count: the number of ParsedInternalKey in the batch.
86
92
  records: the ParsedInternalKey parsed from the batch.
@@ -88,49 +94,41 @@ class WriteBatch:
88
94
  offset: int
89
95
  sequence_number: int
90
96
  count: int
91
- records: Iterable[ParsedInternalKey] = field(repr=False)
97
+ records: Iterable[ParsedInternalKey]
92
98
 
93
99
  @classmethod
94
- def FromStream(
95
- cls, stream: BinaryIO, base_offset: int = 0
100
+ def FromDecoder(
101
+ cls, decoder: utils.LevelDBDecoder, base_offset: int = 0
96
102
  ) -> WriteBatch:
97
103
  """Parses a WriteBatch from a binary stream.
98
104
 
99
105
  Args:
100
- stream: the binary stream to be parsed.
106
+ decoder: the LevelDBDecoder
101
107
  base_offset: the base offset of the Block from which the data is
102
108
  read from.
103
109
 
104
110
  Returns:
105
111
  A WriteBatch.
106
112
  """
107
- decoder = utils.LevelDBDecoder(stream)
108
- _, sequence_number = decoder.DecodeUint64()
113
+ offset, sequence_number = decoder.DecodeUint64()
109
114
  _, count = decoder.DecodeUint32()
110
115
 
111
116
  records = []
112
- for _ in range(count):
113
- record = ParsedInternalKey.FromDecoder(decoder, base_offset)
117
+ for relative_sequence_number in range(count):
118
+ record = ParsedInternalKey.FromDecoder(
119
+ decoder, base_offset + offset,
120
+ relative_sequence_number + sequence_number
121
+ )
114
122
  records.append(record)
115
- return cls(base_offset, sequence_number, count, records)
116
-
117
- @classmethod
118
- def FromBytes(cls, data: bytes, base_offset: int = 0) -> WriteBatch:
119
- """Parses a WriteBatch from bytes.
120
-
121
- Args:
122
- data: the bytes to be parsed.
123
- base_offset: the base offset of the Block from which the data is
124
- read from.
125
-
126
- Returns:
127
- A WriteBatch.
128
- """
129
- return cls.FromStream(io.BytesIO(data), base_offset)
123
+ return cls(
124
+ offset=base_offset + offset,
125
+ sequence_number=sequence_number,
126
+ count=count,
127
+ records=records)
130
128
 
131
129
 
132
130
  @dataclass
133
- class PhysicalRecord:
131
+ class PhysicalRecord(utils.FromDecoderMixin):
134
132
  """A physical record from a leveldb log file.
135
133
 
136
134
  Attributes:
@@ -145,27 +143,35 @@ class PhysicalRecord:
145
143
  offset: int
146
144
  checksum: int
147
145
  length: int
148
- record_type: LogFilePhysicalRecordType
149
- contents: bytes = field(repr=False)
146
+ record_type: definitions.LogFilePhysicalRecordType
147
+ contents: bytes
150
148
  contents_offset: int
151
149
 
150
+ PHYSICAL_HEADER_LENGTH = 7
151
+
152
152
  @classmethod
153
- def FromStream(
154
- cls, stream: BinaryIO, base_offset: int = 0) -> PhysicalRecord:
155
- """Parses a PhysicalRecord from a binary stream.
153
+ def FromDecoder(
154
+ cls, decoder: utils.LevelDBDecoder, base_offset: int = 0
155
+ ) -> PhysicalRecord:
156
+ """Decodes a PhysicalRecord from the current position of a LevelDBDecoder.
156
157
 
157
158
  Args:
158
- stream: the binary stream to be parsed.
159
+ decoder: the LevelDBDecoder.
159
160
  base_offset: the base offset of the WriteBatch from which the data is
160
161
  read from.
161
162
 
162
163
  Returns:
163
164
  A PhysicalRecord.
164
165
  """
165
- decoder = utils.StreamDecoder(stream)
166
166
  offset, checksum = decoder.DecodeUint32()
167
167
  _, length = decoder.DecodeUint16()
168
- record_type = LogFilePhysicalRecordType(decoder.DecodeUint8()[1])
168
+ _, record_type_byte = decoder.DecodeUint8()
169
+ try:
170
+ record_type = definitions.LogFilePhysicalRecordType(record_type_byte)
171
+ except ValueError as error:
172
+ raise errors.ParserError(
173
+ f'Error parsing record type of Physical Record at offset '
174
+ f'{offset + base_offset}') from error
169
175
  contents_offset, contents = decoder.ReadBytes(length)
170
176
  return cls(
171
177
  base_offset=base_offset,
@@ -186,7 +192,7 @@ class Block:
186
192
  data: the block data.
187
193
  """
188
194
  offset: int
189
- data: bytes = field(repr=False)
195
+ data: bytes
190
196
 
191
197
  BLOCK_SIZE = 32768
192
198
 
@@ -199,7 +205,7 @@ class Block:
199
205
  buffer = io.BytesIO(self.data)
200
206
  buffer_length = len(self.data)
201
207
 
202
- while buffer.tell() < buffer_length:
208
+ while buffer.tell() + PhysicalRecord.PHYSICAL_HEADER_LENGTH < buffer_length:
203
209
  yield PhysicalRecord.FromStream(buffer, base_offset=self.offset)
204
210
 
205
211
  @classmethod
@@ -219,10 +225,10 @@ class Block:
219
225
  return cls(offset, data)
220
226
 
221
227
 
222
- class LogFileReader:
228
+ class FileReader:
223
229
  """A leveldb log file reader.
224
230
 
225
- A LogFileReader provides read-only sequential iteration of serialized
231
+ A Log FileReader provides read-only sequential iteration of serialized
226
232
  structures in a leveldb logfile. These structures include:
227
233
  * blocks (Block)
228
234
  * physical records (PhysicalRecord)
@@ -250,11 +256,10 @@ class LogFileReader:
250
256
  a Block
251
257
  """
252
258
  with open(self.filename, 'rb') as fh:
253
- while True:
254
- block = Block.FromStream(fh)
255
- if not block:
256
- break
259
+ block = Block.FromStream(fh)
260
+ while block:
257
261
  yield block
262
+ block = Block.FromStream(fh)
258
263
 
259
264
  def GetPhysicalRecords(self) -> Generator[PhysicalRecord, None, None]:
260
265
  """Returns an iterator of PhysicalRecord instances.
@@ -278,28 +283,32 @@ class LogFileReader:
278
283
  """
279
284
  buffer = bytearray()
280
285
  for physical_record in self.GetPhysicalRecords():
281
- if physical_record.record_type == LogFilePhysicalRecordType.FULL:
286
+ if (physical_record.record_type ==
287
+ definitions.LogFilePhysicalRecordType.FULL):
282
288
  buffer = physical_record.contents
283
289
  offset = physical_record.contents_offset + physical_record.base_offset
284
290
  yield WriteBatch.FromBytes(buffer, base_offset=offset)
285
291
  buffer = bytearray()
286
- elif physical_record.record_type == LogFilePhysicalRecordType.FIRST:
292
+ elif (physical_record.record_type
293
+ == definitions.LogFilePhysicalRecordType.FIRST):
287
294
  offset = physical_record.contents_offset + physical_record.base_offset
288
295
  buffer = bytearray(physical_record.contents)
289
- elif physical_record.record_type == LogFilePhysicalRecordType.MIDDLE:
296
+ elif (physical_record.record_type ==
297
+ definitions.LogFilePhysicalRecordType.MIDDLE):
290
298
  buffer.extend(bytearray(physical_record.contents))
291
- elif physical_record.record_type == LogFilePhysicalRecordType.LAST:
299
+ elif (physical_record.record_type ==
300
+ definitions.LogFilePhysicalRecordType.LAST):
292
301
  buffer.extend(bytearray(physical_record.contents))
293
302
  yield WriteBatch.FromBytes(buffer, base_offset=offset)
294
303
  buffer = bytearray()
295
304
 
296
- def GetKeyValueRecords(self) -> Generator[ParsedInternalKey, None, None]:
297
- """Returns an iterator of KeyValueRecord instances.
305
+ def GetParsedInternalKeys(self) -> Generator[ParsedInternalKey, None, None]:
306
+ """Returns an iterator of ParsedInternalKey instances.
298
307
 
299
- A batch can contain on or more key value records.
308
+ A batch can contain one or more key value records.
300
309
 
301
310
  Yields:
302
- KeyValueRecord
311
+ ParsedInternalKey
303
312
  """
304
313
  for batch in self.GetWriteBatches():
305
314
  yield from batch.records
@@ -0,0 +1,102 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """A module for records from LevelDB files."""
16
+ from __future__ import annotations
17
+ import dataclasses
18
+ import pathlib
19
+ import sys
20
+ from typing import Any, Generator, Union
21
+
22
+ from dfindexeddb.leveldb import descriptor
23
+ from dfindexeddb.leveldb import ldb
24
+ from dfindexeddb.leveldb import log
25
+
26
+
27
+ @dataclasses.dataclass
28
+ class LevelDBRecord:
29
+ """A leveldb record.
30
+
31
+ A record can come from a log file, a table (ldb) file or a descriptor
32
+ (MANIFEST) file.
33
+
34
+ Attributes:
35
+ path: the file path where the record was parsed from.
36
+ record: the leveldb record.
37
+ """
38
+ path: str
39
+ record: Union[
40
+ ldb.KeyValueRecord,
41
+ log.ParsedInternalKey,
42
+ descriptor.VersionEdit]
43
+
44
+ @classmethod
45
+ def FromFile(
46
+ cls,
47
+ file_path: pathlib.Path,
48
+ include_versionedit: bool = False
49
+ ) -> Generator[LevelDBRecord, Any, Any]:
50
+ """Yields leveldb records from the given path.
51
+
52
+ Yields:
53
+ LevelDBRecords
54
+
55
+ Args:
56
+ file_path: the file path.
57
+ include_versionedit: include VersionEdit records from descriptor files.
58
+ """
59
+ if file_path.name.endswith('.log'):
60
+ for record in log.FileReader(
61
+ file_path.as_posix()).GetParsedInternalKeys():
62
+ yield cls(path=file_path.as_posix(), record=record)
63
+ elif file_path.name.endswith('.ldb'):
64
+ for record in ldb.FileReader(file_path.as_posix()).GetKeyValueRecords():
65
+ yield cls(path=file_path.as_posix(), record=record)
66
+ elif file_path.name.startswith('MANIFEST'):
67
+ if not include_versionedit:
68
+ print(f'Ignoring {file_path.as_posix()}', file=sys.stderr)
69
+ return
70
+ for record in descriptor.FileReader(
71
+ file_path.as_posix()).GetVersionEdits():
72
+ yield cls(path=file_path.as_posix(), record=record)
73
+ elif file_path.name in ('LOCK', 'CURRENT', 'LOG', 'LOG.old'):
74
+ print(f'Ignoring {file_path.as_posix()}', file=sys.stderr)
75
+ else:
76
+ print(f'Unsupported file type {file_path.as_posix()}', file=sys.stderr)
77
+
78
+ @classmethod
79
+ def FromDir(
80
+ cls,
81
+ path: pathlib.Path,
82
+ include_versionedit: bool = False
83
+ ) -> Generator[LevelDBRecord, Any, Any]:
84
+ """Yields LevelDBRecords from the given directory.
85
+
86
+ Args:
87
+ path: the file path.
88
+ include_versionedit: include VersionEdit records from descriptor files.
89
+
90
+ Yields:
91
+ LevelDBRecords
92
+
93
+ Raises:
94
+ ValueError: if path is not a directory.
95
+ """
96
+ if path.is_dir():
97
+ for file_path in path.iterdir():
98
+ yield from cls.FromFile(
99
+ file_path=file_path,
100
+ include_versionedit=include_versionedit)
101
+ else:
102
+ raise ValueError(f'{path} is not a directory')
@@ -0,0 +1,116 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Helper/utility classes for LevelDB."""
16
+ from __future__ import annotations
17
+ import io
18
+ from typing import BinaryIO, Tuple, Type, TypeVar
19
+
20
+ from dfindexeddb import errors
21
+ from dfindexeddb import utils
22
+
23
+
24
+ class LevelDBDecoder(utils.StreamDecoder):
25
+ """A helper class to decode data types from LevelDB files."""
26
+
27
+ def DecodeBool(self) -> Tuple[int, bool]:
28
+ """Returns a Tuple of the offset of decoding and the bool value."""
29
+ offset, buffer = self.ReadBytes(1)
30
+ return offset, buffer[0] is not None
31
+
32
+ def DecodeString(self) -> Tuple[int, str]:
33
+ """Returns a tuple of the offset of decoding and the string value.
34
+
35
+ Raises:
36
+ errors.DecoderError: when the parsed string buffer is not even (i.e.
37
+ cannot be decoded as a UTF-16-BE string).
38
+ """
39
+ offset = self.stream.tell()
40
+ buffer = self.stream.read()
41
+ if len(buffer) % 2:
42
+ raise errors.DecoderError(
43
+ f'Odd number of bytes encountered at offset {offset}')
44
+ return offset, buffer.decode('utf-16-be')
45
+
46
+ def DecodeLengthPrefixedSlice(self) -> Tuple[int, bytes]:
47
+ """Returns a tuple of the offset of decoding and the byte 'slice'."""
48
+ offset, num_bytes = self.DecodeUint32Varint()
49
+ _, blob = self.ReadBytes(num_bytes)
50
+ return offset, blob
51
+
52
+ def DecodeBlobWithLength(self) -> Tuple[int, bytes]:
53
+ """Returns a tuple of the offset of decoding and the binary blob."""
54
+ offset, num_bytes = self.DecodeUint64Varint()
55
+ _, blob = self.ReadBytes(num_bytes)
56
+ return offset, blob
57
+
58
+ def DecodeStringWithLength(self, encoding='utf-16-be') -> Tuple[int, str]:
59
+ """Returns a tuple of the offset of decoding and the string value."""
60
+ offset, length = self.DecodeUint64Varint()
61
+ _, buffer = self.ReadBytes(length*2)
62
+ return offset, buffer.decode(encoding=encoding)
63
+
64
+
65
+ T = TypeVar('T')
66
+
67
+
68
+ class FromDecoderMixin:
69
+ """A mixin for parsing dataclass attributes using a LevelDBDecoder."""
70
+
71
+ @classmethod
72
+ def FromDecoder(
73
+ cls: Type[T], decoder: LevelDBDecoder, base_offset: int = 0) -> T:
74
+ """Decodes a class type from the current position of a LevelDBDecoder.
75
+
76
+ Args:
77
+ decoder: the LevelDBDecoder.
78
+ base_offset: the base offset.
79
+
80
+ Returns:
81
+ The class instance.
82
+
83
+ Raises:
84
+ NotImplementedError: if the child class does not implement this method.
85
+ """
86
+ raise NotImplementedError
87
+
88
+ @classmethod
89
+ def FromStream(
90
+ cls: Type[T], stream: BinaryIO, base_offset: int = 0) -> T:
91
+ """Decodes a class type from the current position of a binary stream.
92
+
93
+ Args:
94
+ stream: the binary stream.
95
+ base_offset: the base offset of the binary stream.
96
+
97
+ Returns:
98
+ The class instance.
99
+ """
100
+ decoder = LevelDBDecoder(stream)
101
+ return cls.FromDecoder(decoder=decoder, base_offset=base_offset)
102
+
103
+ @classmethod
104
+ def FromBytes(
105
+ cls: Type[T], raw_data: bytes, base_offset: int = 0) -> T:
106
+ """Parses a class type from raw bytes.
107
+
108
+ Args:
109
+ raw_data: the raw data.
110
+ base_offset: the base offset of the raw data.
111
+
112
+ Returns:
113
+ The class instance.
114
+ """
115
+ stream = io.BytesIO(raw_data)
116
+ return cls.FromStream(stream=stream, base_offset=base_offset)
dfindexeddb/utils.py CHANGED
@@ -13,6 +13,7 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
  """Utilities for dfindexeddb."""
16
+ from __future__ import annotations
16
17
  import io
17
18
  import os
18
19
  import struct
@@ -206,55 +207,19 @@ class StreamDecoder:
206
207
  return self.DecodeZigzagVarint(max_bytes=10)
207
208
 
208
209
 
209
-
210
- class LevelDBDecoder(StreamDecoder):
211
- """A helper class to decode data types from LevelDB files."""
212
-
213
- def DecodeBool(self) -> Tuple[int, bool]:
214
- """Returns a Tuple of the offset of decoding and the bool value."""
215
- offset, buffer = self.ReadBytes(1)
216
- return offset, buffer[0] is not None
217
-
218
- def DecodeString(self) -> Tuple[int, str]:
219
- """Returns a tuple of the offset of decoding and the string value.
220
-
221
- Raises:
222
- errors.DecoderError: when the parsed string buffer is not even (i.e.
223
- cannot be decoded as a UTF-16-BE string.
224
- """
225
- offset = self.stream.tell()
226
- buffer = self.stream.read()
227
- if len(buffer) % 2:
228
- raise errors.DecoderError(
229
- f'Odd number of bytes encountered at offset {offset}')
230
- return offset, buffer.decode('utf-16-be')
231
-
232
- def DecodeBlobWithLength(self) -> Tuple[int, bytes]:
233
- """Returns a tuple of a the offset of decoding and the binary blob."""
234
- offset, num_bytes = self.DecodeUint64Varint()
235
- _, blob = self.ReadBytes(num_bytes)
236
- return offset, blob
237
-
238
- def DecodeStringWithLength(self) -> Tuple[int, str]:
239
- """Returns a tuple of the offset of decoding and the string value."""
240
- offset, length = self.DecodeUint64Varint()
241
- _, buffer = self.ReadBytes(length*2)
242
- return offset, buffer.decode('utf-16-be')
243
-
244
-
245
210
  T = TypeVar('T')
246
211
 
247
212
 
248
- class FromStreamMixin: # TODO: refactor leveldb parsers
249
- """A mixin for dataclasses parsing their attributes from a binary stream."""
213
+ class FromDecoderMixin:
214
+ """A mixin for parsing dataclass attributes using a LevelDBDecoder."""
250
215
 
251
216
  @classmethod
252
217
  def FromDecoder(
253
- cls: Type[T], decoder: LevelDBDecoder, base_offset: int = 0) -> T:
254
- """Decodes a class type from the current position of a LevelDBDecoder.
218
+ cls: Type[T], decoder: StreamDecoder, base_offset: int = 0) -> T:
219
+ """Decodes a class type from the current position of a StreamDecoder.
255
220
 
256
221
  Args:
257
- decoder: the LevelDBDecoder.
222
+ decoder: the StreamDecoder.
258
223
  base_offset: the base offset.
259
224
 
260
225
  Returns:
@@ -277,8 +242,8 @@ class FromStreamMixin: # TODO: refactor leveldb parsers
277
242
  Returns:
278
243
  The class instance.
279
244
  """
280
- decoder = LevelDBDecoder(stream)
281
- return cls.FromDecoder(decoder, base_offset)
245
+ decoder = StreamDecoder(stream)
246
+ return cls.FromDecoder(decoder=decoder, base_offset=base_offset)
282
247
 
283
248
  @classmethod
284
249
  def FromBytes(
dfindexeddb/version.py CHANGED
@@ -14,7 +14,7 @@
14
14
  # limitations under the License.
15
15
  """Version information for dfIndexeddb."""
16
16
 
17
- __version__ = "20240301"
17
+ __version__ = "20240324"
18
18
 
19
19
 
20
20
  def GetVersion():