dfindexeddb-20240224-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Parser for LevelDB Table (.ldb) files."""
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+ import io
20
+ import os
21
+ from typing import BinaryIO, Iterable, Tuple
22
+
23
+ import snappy
24
+
25
+ from dfindexeddb import utils
26
+
27
+
28
@dataclass
class LdbKeyValueRecord:
  """A leveldb table key-value record.

  Attributes:
    offset: the offset of the record.
    key: the (user) key of the record.
    value: the value of the record.
    sequence_number: the sequence number of the record.
    type: the type of the record.
  """
  offset: int
  key: bytes
  value: bytes
  sequence_number: int
  type: int

  PACKED_SEQUENCE_AND_TYPE_LENGTH = 8
  SEQUENCE_LENGTH = 7
  TYPE_LENGTH = 1

  @classmethod
  def FromDecoder(
      cls, decoder: utils.LevelDBDecoder, block_offset: int, shared_key: bytes
  ) -> Tuple[LdbKeyValueRecord, bytes]:
    """Decodes a ldb key value record.

    Args:
      decoder: the leveldb decoder.
      block_offset: the block offset.
      shared_key: the shared key bytes.

    Returns:
      A tuple of the parsed LdbKeyValueRecord and the updated shared key bytes.
    """
    offset, shared_bytes = decoder.DecodeUint32Varint()
    _, unshared_bytes = decoder.DecodeUint32Varint()
    _, value_length = decoder.DecodeUint32Varint()
    _, key_delta = decoder.ReadBytes(unshared_bytes)
    _, value = decoder.ReadBytes(value_length)

    # Reconstruct the full internal key from the shared prefix of the
    # previous key plus the unshared delta bytes.
    shared_key = shared_key[:shared_bytes] + key_delta

    # The internal key ends with 8 packed bytes: 1 byte of type followed by
    # 7 bytes of sequence number (little-endian).  The user key is whatever
    # precedes that trailer.
    key = shared_key[:-cls.PACKED_SEQUENCE_AND_TYPE_LENGTH]
    # Bug fix: the sequence number must be read from the trailer of the
    # internal key (shared_key), not from the tail of the user key.
    sequence_number = int.from_bytes(
        shared_key[-cls.SEQUENCE_LENGTH:], byteorder='little', signed=False)
    key_type = shared_key[-cls.PACKED_SEQUENCE_AND_TYPE_LENGTH]

    return cls(offset + block_offset, key, value, sequence_number,
        key_type), shared_key
77
+
78
+
79
@dataclass
class LdbBlock:
  """A leveldb table block.

  Attributes:
    offset: the offset of the block handle that referenced this block.
    block_offset: the offset in the ldb file where the block data starts.
    length: the length of the block data in bytes.
    data: the raw, possibly compressed, block data.
    footer: the 5 byte block trailer (1 byte compression flag followed by a
        4 byte checksum).
  """
  offset: int
  block_offset: int
  length: int
  data: bytes = field(repr=False)
  footer: bytes  # 5 bytes = 1 byte compressed flag + 4 bytes checksum.

  COMPRESSED = 1
  RESTART_ENTRY_LENGTH = 4

  def IsCompressed(self) -> bool:
    """Returns true if the block is compressed."""
    return self.footer[0] == self.COMPRESSED

  def GetBuffer(self) -> bytes:
    """Returns the block buffer, decompressing if required."""
    if self.IsCompressed():
      return snappy.decompress(self.data)
    return self.data

  def GetRecords(self) -> Iterable[LdbKeyValueRecord]:
    """Returns an iterator over the key value records in the block.

    Yields:
      LdbKeyValueRecords
    """
    # get underlying block content, decompressing if required
    buffer = self.GetBuffer()
    decoder = utils.LevelDBDecoder(io.BytesIO(buffer))

    # trailer of a block has the form:
    #     restarts: uint32[num_restarts]
    #     num_restarts: uint32
    decoder.stream.seek(-self.RESTART_ENTRY_LENGTH, os.SEEK_END)
    _, num_restarts = decoder.DecodeUint32()
    restarts_offset = (
        decoder.stream.tell()) - (num_restarts + 1) * self.RESTART_ENTRY_LENGTH

    # The first restart entry holds the offset of the first key-value record;
    # parsing starts there with an empty shared key.
    decoder.stream.seek(restarts_offset)
    _, offset = decoder.DecodeUint32()
    decoder.stream.seek(offset)
    key = b''

    # Records are parsed sequentially up to the start of the restart array;
    # each record's key is a delta against the previous key.
    while decoder.stream.tell() < restarts_offset:
      key_value_record, key = LdbKeyValueRecord.FromDecoder(
          decoder, self.block_offset, key)
      yield key_value_record

    # TODO: parse trailer of the block for restart points (where the full
    # key is stored to allow for binary lookup). It's not needed at this time
    # since we are sequentially iterating over the records in the block/file.
137
+
138
+
139
@dataclass
class BlockHandle:
  """A handle to a block in the ldb file.

  Attributes:
    offset: the offset of the block handle.
    block_offset: the offset of the block.
    length: the length of the block.
  """
  offset: int
  block_offset: int
  length: int

  BLOCK_TRAILER_SIZE = 5

  def Load(self, stream: BinaryIO) -> LdbBlock:
    """Loads the block this handle refers to.

    Args:
      stream: the binary stream of the ldb file.

    Returns:
      a LdbBlock.

    Raises:
      ValueError: if it could not read all of the block or block footer.
    """
    stream.seek(self.block_offset, os.SEEK_SET)

    block_data = stream.read(self.length)
    if len(block_data) != self.length:
      raise ValueError('Could not read all of the block')

    # The block data is immediately followed by its 5 byte trailer.
    trailer = stream.read(self.BLOCK_TRAILER_SIZE)
    if len(trailer) != self.BLOCK_TRAILER_SIZE:
      raise ValueError('Could not read all of the block footer')

    return LdbBlock(
        self.offset, self.block_offset, self.length, block_data, trailer)

  @classmethod
  def FromStream(cls, stream: BinaryIO, base_offset: int = 0) -> BlockHandle:
    """Reads a block handle from a binary stream.

    Args:
      stream: the binary stream.
      base_offset: the base offset.

    Returns:
      A BlockHandle.
    """
    varint_decoder = utils.LevelDBDecoder(stream)
    handle_offset, block_offset = varint_decoder.DecodeUint64Varint()
    _, block_length = varint_decoder.DecodeUint64Varint()
    return cls(handle_offset + base_offset, block_offset, block_length)
192
+
193
+
194
class LdbFileReader:
  """A leveldb table (.ldb or .sst) file reader.

  A LdbFileReader provides read-only sequential iteration of serialized
  structures in a leveldb ldb file. These structures include:
  * blocks (LdbBlock)
  * records (LdbKeyValueRecord)

  Attributes:
    filename: the .ldb filename.
    index_block: the table's index block, parsed at construction time.
  """

  # Size in bytes of the fixed-length footer at the end of the file.
  FOOTER_SIZE = 48
  # Trailing magic number identifying a leveldb table file.
  MAGIC = b'\x57\xfb\x80\x8b\x24\x75\x47\xdb'

  def __init__(self, filename: str):
    """Initializes the LdbFileReader.

    Args:
      filename: the .ldb filename.

    Raises:
      ValueError: if the file has an invalid magic number at the end.
    """
    self.filename = filename
    with open(self.filename, 'rb') as fh:
      # Validate the trailing magic number before parsing anything else.
      fh.seek(-len(self.MAGIC), os.SEEK_END)
      if fh.read(len(self.MAGIC)) != self.MAGIC:
        raise ValueError(f'Invalid magic number in {self.filename}')

      fh.seek(-self.FOOTER_SIZE, os.SEEK_END)
      # The footer stores two varint-encoded block handles back to back, so
      # the meta handle must be parsed (and discarded) to reach the index
      # handle.
      # meta_handle, need to read first due to variable integers
      _ = BlockHandle.FromStream(fh)
      index_handle = BlockHandle.FromStream(fh)

      # self.meta_block = meta_handle.load(fh) # TODO: support meta blocks
      self.index_block = index_handle.Load(fh)

  def GetBlocks(self) -> Iterable[LdbBlock]:
    """Returns an iterator of LdbBlocks.

    Each record in the index block holds a serialized block handle in its
    value, which points at a data block in the file.

    Yields:
      LdbBlock.
    """
    with open(self.filename, 'rb') as fh:
      for key_value_record in self.index_block.GetRecords():
        block_handle = BlockHandle.FromStream(
            io.BytesIO(key_value_record.value),
            base_offset=key_value_record.offset)
        yield block_handle.Load(fh)

  def GetKeyValueRecords(self) -> Iterable[LdbKeyValueRecord]:
    """Returns an iterator of LdbKeyValueRecords.

    Yields:
      LdbKeyValueRecords.
    """
    for block in self.GetBlocks():
      for record in block.GetRecords():
        yield record

  def RangeIter(self) -> Iterable[Tuple[bytes, bytes]]: #pylint: disable=C0103
    """Returns an iterator of key-value pairs.

    Yields:
      A tuple of key and value as bytes.
    """
    for record in self.GetKeyValueRecords():
      yield (record.key, record.value)
@@ -0,0 +1,308 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright 2024 Google LLC
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # https://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Parser for LevelDB Log (.log) files."""
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+ from enum import IntEnum
20
+ import io
21
+ from typing import BinaryIO, Generator, Iterable, Optional
22
+
23
+ from dfindexeddb import errors
24
+ from dfindexeddb import utils
25
+
26
+
27
class LogFilePhysicalRecordType(IntEnum):
  """LevelDB log file physical record types."""
  # The record holds a complete write batch by itself.
  FULL = 1
  # The first fragment of a write batch spanning multiple physical records.
  FIRST = 2
  # A middle fragment of a write batch spanning multiple physical records.
  MIDDLE = 3
  # The final fragment of a write batch spanning multiple physical records.
  LAST = 4
33
+
34
+
35
@dataclass
class ParsedInternalKey:
  """An internal key record from a leveldb log file.

  Attributes:
    offset: the offset of the record.
    type: the record type (DELETED_TYPE or VALUE_TYPE).
    key: the record key.
    value: the record value (empty for a deletion record).
  """
  offset: int
  type: int
  key: bytes
  value: bytes

  # Record type constants; presumably leveldb's kTypeDeletion/kTypeValue —
  # named here to replace the previous magic numbers.
  DELETED_TYPE = 0
  VALUE_TYPE = 1

  @classmethod
  def FromDecoder(
      cls,
      decoder: utils.LevelDBDecoder,
      base_offset: int = 0
  ) -> ParsedInternalKey:
    """Decodes an internal key value record.

    Args:
      decoder: the leveldb decoder.
      base_offset: the base offset for the parsed key value record.

    Returns:
      a ParsedInternalKey

    Raises:
      ValueError: if there is an invalid record type encountered.
    """
    offset, record_type = decoder.DecodeUint8()
    _, key = decoder.DecodeBlobWithLength()
    if record_type == cls.VALUE_TYPE:
      _, value = decoder.DecodeBlobWithLength()
    elif record_type == cls.DELETED_TYPE:
      # Deletion records carry no value.
      value = b''
    else:
      raise ValueError(f'Invalid record type {record_type}')
    return cls(base_offset + offset, record_type, key, value)
77
+
78
+
79
@dataclass
class WriteBatch:
  """A write batch from a leveldb log file.

  Attributes:
    offset: the batch offset.
    sequence_number: the batch sequence number.
    count: the number of ParsedInternalKey in the batch.
    records: the ParsedInternalKey parsed from the batch.
  """
  offset: int
  sequence_number: int
  count: int
  records: Iterable[ParsedInternalKey] = field(repr=False)

  @classmethod
  def FromStream(
      cls, stream: BinaryIO, base_offset: int = 0
  ) -> WriteBatch:
    """Parses a WriteBatch from a binary stream.

    Args:
      stream: the binary stream to be parsed.
      base_offset: the base offset of the Block from which the data is
          read from.

    Returns:
      A WriteBatch.
    """
    decoder = utils.LevelDBDecoder(stream)
    _, sequence_number = decoder.DecodeUint64()
    _, record_count = decoder.DecodeUint32()

    # The batch header is followed by `record_count` serialized internal
    # key records.
    parsed_records = [
        ParsedInternalKey.FromDecoder(decoder, base_offset)
        for _ in range(record_count)]
    return cls(base_offset, sequence_number, record_count, parsed_records)

  @classmethod
  def FromBytes(cls, data: bytes, base_offset: int = 0) -> WriteBatch:
    """Parses a WriteBatch from bytes.

    Args:
      data: the bytes to be parsed.
      base_offset: the base offset of the Block from which the data is
          read from.

    Returns:
      A WriteBatch.
    """
    return cls.FromStream(io.BytesIO(data), base_offset)
131
+
132
+
133
@dataclass
class PhysicalRecord:
  """A physical record from a leveldb log file.

  Attributes:
    base_offset: the offset of the enclosing block in the log file.
    offset: the record offset, relative to the start of the block.
    checksum: the record checksum.
    length: the length of the record contents in bytes.
    record_type: the record type.
    contents: the record contents.
    contents_offset: the offset of where the record contents are stored,
        relative to the start of the block.
  """
  base_offset: int
  offset: int
  checksum: int
  length: int
  record_type: LogFilePhysicalRecordType
  contents: bytes = field(repr=False)
  contents_offset: int

  @classmethod
  def FromStream(
      cls, stream: BinaryIO, base_offset: int = 0) -> PhysicalRecord:
    """Parses a PhysicalRecord from a binary stream.

    Args:
      stream: the binary stream to be parsed.
      base_offset: the base offset of the WriteBatch from which the data is
        read from.

    Returns:
      A PhysicalRecord.
    """
    decoder = utils.StreamDecoder(stream)
    # A record header is: uint32 checksum, uint16 length, uint8 record type,
    # followed by `length` bytes of contents.
    offset, checksum = decoder.DecodeUint32()
    _, length = decoder.DecodeUint16()
    record_type = LogFilePhysicalRecordType(decoder.DecodeUint8()[1])
    contents_offset, contents = decoder.ReadBytes(length)
    return cls(
        base_offset=base_offset,
        offset=offset,
        checksum=checksum,
        length=length,
        record_type=record_type,
        contents=contents,
        contents_offset=contents_offset)
179
+
180
+
181
@dataclass
class Block:
  """A block from a leveldb log file.

  Attributes:
    offset: the block offset.
    data: the block data.
  """
  offset: int
  data: bytes = field(repr=False)

  BLOCK_SIZE = 32768

  def GetPhysicalRecords(self) -> Generator[PhysicalRecord, None, None]:
    """Returns a generator of LogFilePhysicalRecords from the block contents.

    Yields:
      LogFileRecord
    """
    contents = io.BytesIO(self.data)
    while True:
      try:
        record = PhysicalRecord.FromStream(contents, base_offset=self.offset)
      except errors.DecoderError:
        # No further complete physical record could be decoded.
        return
      yield record

  @classmethod
  def FromStream(cls, stream: BinaryIO) -> Optional[Block]:
    """Parses a Block from a binary stream.

    Args:
      stream: the binary stream to be parsed.

    Returns:
      the Block or None if there is no data to read from the stream.
    """
    block_offset = stream.tell()
    # A read can return fewer than BLOCK_SIZE bytes for the final, partial
    # block of the file.
    block_data = stream.read(cls.BLOCK_SIZE)
    return cls(block_offset, block_data) if block_data else None
221
+
222
+
223
class LogFileReader:
  """A leveldb log file reader.

  A LogFileReader provides read-only sequential iteration of serialized
  structures in a leveldb logfile. These structures include:
  * blocks (Block)
  * physical records (PhysicalRecord)
  * batches (WriteBatch) and
  * key/value records (ParsedInternalKey).

  Attributes:
    filename (str): the leveldb log filename.
  """

  def __init__(self, filename: str):
    """Initializes the LogFile.

    Args:
      filename: the leveldb log filename
    """
    self.filename = filename

  def GetBlocks(self) -> Generator[Block, None, None]:
    """Returns an iterator of Block instances.

    A logfile is composed of one or more blocks.

    Yields:
      a Block
    """
    with open(self.filename, 'rb') as fh:
      while True:
        block = Block.FromStream(fh)
        if not block:
          break
        yield block

  def GetPhysicalRecords(self) -> Generator[PhysicalRecord, None, None]:
    """Returns an iterator of PhysicalRecord instances.

    A block is composed of one or more physical records.

    Yields:
      PhysicalRecord
    """
    for block in self.GetBlocks():
      for physical_record in block.GetPhysicalRecords():
        yield physical_record

  def GetWriteBatches(self) -> Generator[WriteBatch, None, None]:
    """Returns an iterator of WriteBatch instances.

    Depending on the batch size, a log file batch can be spread across one or
    more physical records.

    Yields:
      WriteBatch
    """
    buffer = bytearray()
    for physical_record in self.GetPhysicalRecords():
      if physical_record.record_type == LogFilePhysicalRecordType.FULL:
        # A FULL record holds a complete batch on its own.
        buffer = physical_record.contents
        offset = physical_record.contents_offset + physical_record.base_offset
        yield WriteBatch.FromBytes(buffer, base_offset=offset)
        buffer = bytearray()
      elif physical_record.record_type == LogFilePhysicalRecordType.FIRST:
        # Start of a fragmented batch; remember where its contents begin.
        offset = physical_record.contents_offset + physical_record.base_offset
        buffer = bytearray(physical_record.contents)
      elif physical_record.record_type == LogFilePhysicalRecordType.MIDDLE:
        buffer.extend(bytearray(physical_record.contents))
      elif physical_record.record_type == LogFilePhysicalRecordType.LAST:
        # End of a fragmented batch; parse the reassembled contents.
        # NOTE(review): `offset` assumes a FIRST record was seen earlier; a
        # log that starts mid-batch would raise NameError here — confirm.
        buffer.extend(bytearray(physical_record.contents))
        yield WriteBatch.FromBytes(buffer, base_offset=offset)
        buffer = bytearray()

  def GetKeyValueRecords(self) -> Generator[ParsedInternalKey, None, None]:
    """Returns an iterator of KeyValueRecord instances.

    A batch can contain one or more key value records.

    Yields:
      KeyValueRecord
    """
    for batch in self.GetWriteBatches():
      for record in batch.records:
        yield record