asebytes-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
+ Metadata-Version: 2.3
+ Name: asebytes
+ Version: 0.1.0
+ Summary: Efficient serialization and storage for ASE Atoms objects using LMDB
+ Requires-Dist: ase>=3.26.0
+ Requires-Dist: lmdb>=1.7.5
+ Requires-Dist: msgpack>=1.1.2
+ Requires-Dist: msgpack-numpy>=0.4.8
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+
+ # asebytes
+
+ Efficient serialization and storage for ASE Atoms objects using LMDB.
+
+ ## API
+
+ - **`encode(atoms)`** - Encode an ASE Atoms object to a dict of bytes
+ - **`decode(data)`** - Decode bytes back into an ASE Atoms object
+ - **`BytesIO(file, prefix)`** - LMDB-backed list-like storage for bytes dictionaries
+ - **`ASEIO(file, prefix)`** - LMDB-backed list-like storage for ASE Atoms objects
+
+ ## Examples
+
+ ```python
+ from asebytes import ASEIO, BytesIO, encode, decode
+ import molify
+
+ # Generate conformers from SMILES
+ ethanol = molify.smiles2conformers("CCO", numConfs=100)
+
+ # Serialize/deserialize single molecule
+ data = encode(ethanol[0])
+ atoms_restored = decode(data)
+
+ # High-level: Store Atoms objects directly
+ db = ASEIO("conformers.lmdb")
+ db.extend(ethanol) # Add all conformers
+ mol = db[0] # Returns ase.Atoms
+
+ # Low-level: BytesIO stores serialized data
+ bytes_db = BytesIO("conformers.lmdb")
+ bytes_db.append(encode(ethanol[0])) # Manual serialization
+ data = bytes_db[0] # Returns dict[bytes, bytes]
+ mol = decode(data) # Manual deserialization
+
+ # ASEIO = BytesIO + automatic encode/decode
+ ```
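
The examples above lean on molify for conformer generation, but the encode/decode round trip itself only needs ase, which is already a dependency. A minimal sketch with a hand-built molecule (the geometry, info key and energy value below are illustrative, not part of the package):

```python
import numpy as np
from ase import Atoms
from ase.calculators.singlepoint import SinglePointCalculator

from asebytes import decode, encode

# Build a small water molecule by hand; no conformer generator needed.
water = Atoms("OH2", positions=[(0.0, 0.0, 0.0), (0.96, 0.0, 0.0), (-0.24, 0.93, 0.0)])
water.info["name"] = "water"
water.calc = SinglePointCalculator(water, energy=-14.2, forces=np.zeros((3, 3)))

data = encode(water)     # dict[bytes, bytes] with keys like b"arrays.positions"
restored = decode(data)  # round trip back to ase.Atoms

assert restored.info["name"] == "water"
assert np.allclose(restored.positions, water.positions)
assert restored.calc.results["energy"] == -14.2
```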
@@ -0,0 +1,37 @@
+ # asebytes
+
+ Efficient serialization and storage for ASE Atoms objects using LMDB.
+
+ ## API
+
+ - **`encode(atoms)`** - Encode an ASE Atoms object to a dict of bytes
+ - **`decode(data)`** - Decode bytes back into an ASE Atoms object
+ - **`BytesIO(file, prefix)`** - LMDB-backed list-like storage for bytes dictionaries
+ - **`ASEIO(file, prefix)`** - LMDB-backed list-like storage for ASE Atoms objects
+
+ ## Examples
+
+ ```python
+ from asebytes import ASEIO, BytesIO, encode, decode
+ import molify
+
+ # Generate conformers from SMILES
+ ethanol = molify.smiles2conformers("CCO", numConfs=100)
+
+ # Serialize/deserialize single molecule
+ data = encode(ethanol[0])
+ atoms_restored = decode(data)
+
+ # High-level: Store Atoms objects directly
+ db = ASEIO("conformers.lmdb")
+ db.extend(ethanol) # Add all conformers
+ mol = db[0] # Returns ase.Atoms
+
+ # Low-level: BytesIO stores serialized data
+ bytes_db = BytesIO("conformers.lmdb")
+ bytes_db.append(encode(ethanol[0])) # Manual serialization
+ data = bytes_db[0] # Returns dict[bytes, bytes]
+ mol = decode(data) # Manual deserialization
+
+ # ASEIO = BytesIO + automatic encode/decode
+ ```
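
The API list above omits the partial-loading helpers that ASEIO also exposes (get_available_keys and get, defined in io.py further down). A minimal sketch, assuming a throwaway database path and ase.build.molecule purely for test data:

```python
from ase.build import molecule

from asebytes import ASEIO

db = ASEIO("partial.lmdb")
db.append(molecule("H2O"))

# List the stored fields for entry 0 without decoding anything.
print(db.get_available_keys(0))  # e.g. [b"cell", b"pbc", b"arrays.numbers", b"arrays.positions"]

# Reconstruct an Atoms object from selected fields only.
atoms = db.get(0, keys=[b"arrays.numbers", b"arrays.positions"])
print(atoms.get_chemical_formula())  # H2O
```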
@@ -0,0 +1,38 @@
+ [project]
+ name = "asebytes"
+ version = "0.1.0"
+ description = "Efficient serialization and storage for ASE Atoms objects using LMDB"
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+     "ase>=3.26.0",
+     "lmdb>=1.7.5",
+     "msgpack>=1.1.2",
+     "msgpack-numpy>=0.4.8",
+ ]
+
+ [build-system]
+ requires = ["uv_build>=0.9.6,<0.10.0"]
+ build-backend = "uv_build"
+
+ [dependency-groups]
+ dev = [
+     "ase-db-backends>=0.10.0",
+     "ipykernel>=7.1.0",
+     "matplotlib>=3.10.7",
+     "molify>=0.0.1a0",
+     "pandas>=2.3.3",
+     "pytest>=8.4.2",
+     "pytest-benchmark>=5.2.1",
+ ]
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+ python_files = ["test_*.py"]
+ python_classes = ["Test*"]
+ python_functions = ["test_*"]
+ addopts = ["-v", "--strict-markers", "-m", "not benchmark"]
+
+ markers = [
+     "benchmark: marks tests as benchmark tests (deselect with '-m \"not benchmark\"')",
+ ]
@@ -0,0 +1,10 @@
+ import importlib.metadata
+
+ from .decode import decode
+ from .io import ASEIO, BytesIO
+ from .metadata import get_metadata
+ from .encode import encode
+
+ __all__ = ["encode", "decode", "BytesIO", "ASEIO", "get_metadata"]
+
+ __version__ = importlib.metadata.version("asebytes")
@@ -0,0 +1,102 @@
+ import ase
+ import msgpack
+ import msgpack_numpy as m
+ import numpy as np
+ from ase.calculators.singlepoint import SinglePointCalculator
+ from ase.cell import Cell
+
+
+ def decode(data: dict[bytes, bytes], fast: bool = True) -> ase.Atoms:
+     """
+     Deserialize bytes into an ASE Atoms object.
+
+     Parameters
+     ----------
+     data : dict[bytes, bytes]
+         Dictionary with byte keys and msgpack-serialized byte values.
+     fast : bool, default=True
+         If True, use optimized direct attribute assignment (6x faster).
+         If False, use standard Atoms constructor (safer but slower).
+
+     Returns
+     -------
+     ase.Atoms
+         Reconstructed Atoms object.
+
+     Raises
+     ------
+     ValueError
+         If unknown keys are present in data.
+
+     Missing entries such as 'arrays.numbers', 'cell' or 'pbc' fall back to defaults rather than raising.
+     """
+     if b"arrays.numbers" in data:
+         numbers_array = msgpack.unpackb(data[b"arrays.numbers"], object_hook=m.decode)
+     else:
+         numbers_array = np.array([], dtype=int)
+
+     # Extract optional parameters with defaults
+     if b"cell" in data:
+         cell_array = msgpack.unpackb(data[b"cell"], object_hook=m.decode)
+     else:
+         cell_array = None
+
+     if b"pbc" in data:
+         pbc_array = msgpack.unpackb(data[b"pbc"], object_hook=m.decode)
+     else:
+         pbc_array = np.array([False, False, False], dtype=bool)
+
+     if fast:
+         # Skip Atoms.__init__() and directly assign attributes for better performance
+         atoms = ase.Atoms.__new__(ase.Atoms)
+
+         # Set cell - use provided cell or default empty cell
+         if cell_array is not None:
+             atoms._cellobj = Cell(cell_array)
+         else:
+             atoms._cellobj = Cell(np.zeros((3, 3)))
+
+         atoms._pbc = pbc_array
+         atoms.arrays = {"numbers": numbers_array}
+
+         # Initialize positions if not provided
+         if b"arrays.positions" not in data:
+             # Create default positions (zeros) based on number of atoms
+             n_atoms = len(numbers_array)
+             atoms.arrays["positions"] = np.zeros((n_atoms, 3))
+
+         atoms.info = {}
+         atoms.constraints = []
+         atoms._celldisp = np.zeros(3)
+         atoms._calc = None
+     else:
+         # Use standard Atoms constructor
+         atoms = ase.Atoms(numbers=numbers_array, cell=cell_array, pbc=pbc_array)
+
+     for key in data:
+         if key in [b"cell", b"pbc", b"arrays.numbers"]:
+             continue
+         if key.startswith(b"arrays."):
+             array_data = msgpack.unpackb(data[key], object_hook=m.decode)
+             atoms.arrays[key.decode().split("arrays.")[1]] = array_data
+         elif key.startswith(b"info."):
+             info_key = key.decode().split("info.")[1]
+             info_array = msgpack.unpackb(data[key], object_hook=m.decode)
+             atoms.info[info_key] = info_array
+         elif key.startswith(b"calc."):
+             if not hasattr(atoms, "calc") or atoms.calc is None:
+                 atoms.calc = SinglePointCalculator(atoms)
+             calc_key = key.decode().split("calc.")[1]
+             calc_array = msgpack.unpackb(data[key], object_hook=m.decode)
+             atoms.calc.results[calc_key] = calc_array
+         elif key == b"constraints":
+             constraints_data = msgpack.unpackb(data[key], object_hook=m.decode)
+             constraints = []
+             for constraint_dict in constraints_data:
+                 constraint = ase.constraints.dict2constraint(constraint_dict)
+                 constraints.append(constraint)
+             atoms.set_constraint(constraints)
+         else:
+             raise ValueError(f"Unknown key in data: {key}")
+
+     return atoms
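
decode() defaults to the fast path that bypasses Atoms.__init__ and assigns attributes directly; fast=False goes through the regular constructor. A small sketch suggesting both paths reconstruct the same structure (the CO geometry and 5 Å cell are arbitrary):

```python
import numpy as np
from ase import Atoms

from asebytes import decode, encode

atoms = Atoms("CO", positions=[(0, 0, 0), (0, 0, 1.1)], cell=np.eye(3) * 5, pbc=True)
data = encode(atoms)

fast = decode(data)              # default: direct attribute assignment
safe = decode(data, fast=False)  # regular Atoms constructor

assert fast == safe  # both paths yield the same positions, numbers, cell and pbc
```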
@@ -0,0 +1,68 @@
+ import ase
+ import msgpack
+ import msgpack_numpy as m
+ import numpy as np
+
+
+ def encode(atoms: ase.Atoms) -> dict[bytes, bytes]:
+     """
+     Serialize an ASE Atoms object into a dictionary of bytes.
+
+     Parameters
+     ----------
+     atoms : ase.Atoms
+         Atoms object to serialize.
+
+     Returns
+     -------
+     dict[bytes, bytes]
+         Dictionary with byte keys and msgpack-serialized byte values.
+
+     Raises
+     ------
+     TypeError
+         If input is not an ase.Atoms object.
+     ValueError
+         If any key in atoms.arrays, atoms.info, or atoms.calc.results contains a dot.
+     """
+     if not isinstance(atoms, ase.Atoms):
+         raise TypeError("Input must be an ase.Atoms object.")
+     data: dict[bytes, bytes] = {}
+     cell: np.ndarray = atoms.get_cell().array
+     data[b"cell"] = msgpack.packb(cell, default=m.encode)
+     data[b"pbc"] = msgpack.packb(atoms.get_pbc(), default=m.encode)
+
+     for key in atoms.arrays:
+         if "." in key:
+             raise ValueError(
+                 f"Key '{key}' in atoms.arrays contains a dot (.), which is not allowed as it is used as a path separator."
+             )
+         data[f"arrays.{key}".encode()] = msgpack.packb(
+             atoms.arrays[key], default=m.encode
+         )
+     for key in atoms.info:
+         if "." in key:
+             raise ValueError(
+                 f"Key '{key}' in atoms.info contains a dot (.), which is not allowed as it is used as a path separator."
+             )
+         value = atoms.info[key]
+         data[f"info.{key}".encode()] = msgpack.packb(value, default=m.encode)
+     if atoms.calc is not None:
+         for key in atoms.calc.results:
+             if "." in key:
+                 raise ValueError(
+                     f"Key '{key}' in atoms.calc.results contains a dot (.), which is not allowed as it is used as a path separator."
+                 )
+             value = atoms.calc.results[key]
+             data[f"calc.{key}".encode()] = msgpack.packb(value, default=m.encode)
+
+     # Serialize constraints
+     if atoms.constraints:
+         constraints_data = []
+         for constraint in atoms.constraints:
+             if isinstance(constraint, ase.constraints.FixConstraint):
+                 constraints_data.append(constraint.todict())
+         if constraints_data:
+             data[b"constraints"] = msgpack.packb(constraints_data, default=m.encode)
+
+     return data
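
encode() flattens an Atoms object into namespaced byte keys (cell, pbc, arrays.*, info.*, calc.*, plus constraints when present) and rejects user keys containing a dot. A sketch of the resulting key layout (molecule, SMILES string and energy are illustrative):

```python
from ase import Atoms
from ase.calculators.singlepoint import SinglePointCalculator

from asebytes import encode

atoms = Atoms("H2", positions=[(0, 0, 0), (0, 0, 0.74)])
atoms.info["smiles"] = "[H][H]"
atoms.calc = SinglePointCalculator(atoms, energy=-1.17)

print(sorted(encode(atoms)))
# [b'arrays.numbers', b'arrays.positions', b'calc.energy', b'cell', b'info.smiles', b'pbc']

atoms.info["bad.key"] = 1
# encode(atoms) would now raise ValueError: dots are reserved as path separators
```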
@@ -0,0 +1,558 @@
+ from collections.abc import MutableSequence
+ from typing import Iterator
+
+ import ase
+ import lmdb
+
+ from asebytes.decode import decode
+ from asebytes.encode import encode
+
+
+ class ASEIO(MutableSequence):
+     """
+     LMDB-backed mutable sequence for ASE Atoms objects.
+
+     Parameters
+     ----------
+     file : str
+         Path to LMDB database file.
+     prefix : bytes, default=b""
+         Key prefix for namespacing entries.
+     """
+
+     def __init__(self, file: str, prefix: bytes = b""):
+         self.io = BytesIO(file, prefix)
+
+     def __getitem__(self, index: int) -> ase.Atoms:
+         data = self.io[index]
+         return decode(data)
+
+     def __setitem__(self, index: int, value: ase.Atoms) -> None:
+         data = encode(value)
+         self.io[index] = data
+
+     def __delitem__(self, index: int) -> None:
+         del self.io[index]
+
+     def insert(self, index: int, value: ase.Atoms) -> None:
+         data = encode(value)
+         self.io.insert(index, data)
+
+     def extend(self, values: list[ase.Atoms]) -> None:
+         """
+         Efficiently extend with multiple Atoms objects using bulk operations.
+
+         Serializes all Atoms objects first, then performs a single bulk transaction.
+         Much faster than calling append() in a loop.
+
+         Parameters
+         ----------
+         values : list[ase.Atoms]
+             Atoms objects to append.
+         """
+         # Serialize all atoms objects first
+         serialized_data = [encode(atoms) for atoms in values]
+         # Use BytesIO's bulk extend (single transaction)
+         self.io.extend(serialized_data)
+
+     def __len__(self) -> int:
+         return len(self.io)
+
+     def __iter__(self) -> Iterator:
+         for i in range(len(self)):
+             yield self[i]
+
+     def get_available_keys(self, index: int) -> list[bytes]:
+         """
+         Get all available keys for a given index.
+
+         Parameters
+         ----------
+         index : int
+             Logical index to query.
+
+         Returns
+         -------
+         list[bytes]
+             Available keys at the index.
+
+         Raises
+         ------
+         KeyError
+             If the index does not exist.
+         """
+         return self.io.get_available_keys(index)
+
+     def get(self, index: int, keys: list[bytes] | None = None) -> ase.Atoms:
+         """
+         Get Atoms object at index, optionally filtering to specific keys.
+
+         Parameters
+         ----------
+         index : int
+             Logical index to retrieve.
+         keys : list[bytes], optional
+             Keys to retrieve (e.g., b"arrays.positions", b"info.smiles", b"calc.energy").
+             If None, returns all data.
+
+         Returns
+         -------
+         ase.Atoms
+             Atoms object reconstructed from the requested keys.
+
+         Raises
+         ------
+         KeyError
+             If the index does not exist.
+         """
+         data = self.io.get(index, keys=keys)
+         return decode(data)
+
+
+ class BytesIO(MutableSequence):
+     """
+     LMDB-backed mutable sequence for byte dictionaries.
+
+     Parameters
+     ----------
+     file : str
+         Path to LMDB database file.
+     prefix : bytes, default=b""
+         Key prefix for namespacing entries.
+     """
+
+     def __init__(self, file: str, prefix: bytes = b""):
+         self.file = file
+         self.prefix = prefix
+         self.env = lmdb.open(
+             file,
+             # map_size=1099511627776,
+             # subdir=False,
+             # readonly=False,
+             # lock=True,
+             # readahead=True,
+             # meminit=False,
+         )
+
+     # Metadata helpers
+     def _get_count(self, txn) -> int:
+         """Get the current count from metadata (returns 0 if not set)."""
+         count_key = self.prefix + b"__meta__count"
+         count_bytes = txn.get(count_key)
+         if count_bytes is None:
+             return 0
+         return int(count_bytes.decode())
+
+     def _set_count(self, txn, count: int) -> None:
+         """Set the count in metadata."""
+         count_key = self.prefix + b"__meta__count"
+         txn.put(count_key, str(count).encode())
+
+     def _get_next_sort_key(self, txn) -> int:
+         """Get the next available sort key counter (returns 0 if not set)."""
+         key = self.prefix + b"__meta__next_sort_key"
+         value = txn.get(key)
+         if value is None:
+             return 0
+         return int(value.decode())
+
+     def _set_next_sort_key(self, txn, value: int) -> None:
+         """Set the next available sort key counter."""
+         key = self.prefix + b"__meta__next_sort_key"
+         txn.put(key, str(value).encode())
+
+     # Mapping helpers (logical_index → sort_key)
+     def _get_mapping(self, txn, logical_index: int) -> int | None:
+         """Get sort_key for a logical index (returns None if not found)."""
+         mapping_key = self.prefix + b"__idx__" + str(logical_index).encode()
+         sort_key_bytes = txn.get(mapping_key)
+         if sort_key_bytes is None:
+             return None
+         return int(sort_key_bytes.decode())
+
+     def _set_mapping(self, txn, logical_index: int, sort_key: int) -> None:
+         """Set the mapping from logical_index to sort_key."""
+         mapping_key = self.prefix + b"__idx__" + str(logical_index).encode()
+         txn.put(mapping_key, str(sort_key).encode())
+
+     def _delete_mapping(self, txn, logical_index: int) -> None:
+         """Delete the mapping for a logical index."""
+         mapping_key = self.prefix + b"__idx__" + str(logical_index).encode()
+         txn.delete(mapping_key)
+
+     def _allocate_sort_key(self, txn) -> int:
+         """Allocate a new unique sort key by incrementing the counter."""
+         next_key = self._get_next_sort_key(txn)
+         self._set_next_sort_key(txn, next_key + 1)
+         return next_key
+
+     # Metadata helpers for field keys
+     def _get_field_keys_metadata(self, txn, sort_key: int) -> list[bytes] | None:
+         """
+         Get field keys for a sort key from metadata.
+
+         Parameters
+         ----------
+         txn : lmdb.Transaction
+             LMDB transaction.
+         sort_key : int
+             Sort key to query.
+
+         Returns
+         -------
+         list[bytes] or None
+             Field keys (without prefix) or None if not found.
+         """
+         metadata_key = self.prefix + b"__keys__" + str(sort_key).encode()
+         metadata_bytes = txn.get(metadata_key)
+         if metadata_bytes is None:
+             return None
+         # Deserialize the list of keys (stored as newline-separated bytes)
+         return metadata_bytes.split(b"\n") if metadata_bytes else []
+
+     def _set_field_keys_metadata(
+         self, txn, sort_key: int, field_keys: list[bytes]
+     ) -> None:
+         """
+         Store field keys for a sort key in metadata.
+
+         Parameters
+         ----------
+         txn : lmdb.Transaction
+             LMDB transaction.
+         sort_key : int
+             Sort key.
+         field_keys : list[bytes]
+             Field keys (without prefix).
+         """
+         metadata_key = self.prefix + b"__keys__" + str(sort_key).encode()
+         # Serialize as newline-separated bytes
+         metadata_bytes = b"\n".join(field_keys)
+         txn.put(metadata_key, metadata_bytes)
+
+     def _delete_field_keys_metadata(self, txn, sort_key: int) -> None:
+         """
+         Delete field keys metadata for a sort key.
+
+         Parameters
+         ----------
+         txn : lmdb.Transaction
+             LMDB transaction.
+         sort_key : int
+             Sort key.
+         """
+         metadata_key = self.prefix + b"__keys__" + str(sort_key).encode()
+         txn.delete(metadata_key)
+
+     def __setitem__(self, index: int, data: dict[bytes, bytes]) -> None:
+         with self.env.begin(write=True) as txn:
+             current_count = self._get_count(txn)
+
+             # Get or allocate sort key for this index
+             sort_key = self._get_mapping(txn, index)
+             is_new_index = sort_key is None
+
+             if is_new_index:
+                 # Allocate new unique sort key
+                 sort_key = self._allocate_sort_key(txn)
+                 self._set_mapping(txn, index, sort_key)
+             else:
+                 # Delete existing data keys if overwriting
+                 try:
+                     _, _, keys_to_delete = self._get_full_keys(txn, index)
+                     for key in keys_to_delete:
+                         txn.delete(key)
+                 except KeyError:
+                     # No existing data, continue
+                     pass
+
+             # Write new data with sort key prefix using putmulti
+             sort_key_str = str(sort_key).encode()
+             items_to_insert = [
+                 (self.prefix + sort_key_str + b"-" + key, value)
+                 for key, value in data.items()
+             ]
+             if items_to_insert:
+                 cursor = txn.cursor()
+                 cursor.putmulti(items_to_insert, dupdata=False)
+
+             # Store metadata for field keys
+             field_keys = list(data.keys())
+             self._set_field_keys_metadata(txn, sort_key, field_keys)
+
+             # Update count if needed (when index == current_count, we're appending)
+             if is_new_index and index >= current_count:
+                 self._set_count(txn, index + 1)
+
+     def _get_full_keys(self, txn, index: int) -> tuple[int, bytes, list[bytes]]:
+         """
+         Get sort key, prefix, and all full keys for an index.
+
+         Parameters
+         ----------
+         txn : lmdb.Transaction
+             LMDB transaction.
+         index : int
+             Logical index to query.
+
+         Returns
+         -------
+         tuple[int, bytes, list[bytes]]
+             Tuple of (sort_key, prefix, full keys including prefix).
+
+         Raises
+         ------
+         KeyError
+             If the index does not exist.
+         """
+         # Look up the sort key for this logical index
+         sort_key = self._get_mapping(txn, index)
+
+         if sort_key is None:
+             raise KeyError(f"Index {index} not found")
+
+         # Build prefix
+         sort_key_str = str(sort_key).encode()
+         prefix = self.prefix + sort_key_str + b"-"
+
+         # Get field keys from metadata
+         field_keys = self._get_field_keys_metadata(txn, sort_key)
+         if field_keys is None:
+             raise KeyError(
+                 f"Metadata not found for index {index} (sort_key {sort_key})"
+             )
+
+         # Build full keys with prefix
+         keys_to_fetch = [prefix + field_key for field_key in field_keys]
+
+         return sort_key, prefix, keys_to_fetch
+
+     def __getitem__(self, index: int) -> dict[bytes, bytes]:
+         with self.env.begin() as txn:
+             _, prefix, keys_to_fetch = self._get_full_keys(txn, index)
+
+             # Use getmulti for efficient batch retrieval
+             result = {}
+             if keys_to_fetch:
+                 cursor = txn.cursor()
+                 for key, value in cursor.getmulti(keys_to_fetch):
+                     # Extract the field name after the sort_key prefix
+                     field_name = key[len(prefix) :]
+                     result[field_name] = value
+
+             return result
+
+     def get_available_keys(self, index: int) -> list[bytes]:
+         """
+         Get all available keys for a given index.
+
+         Parameters
+         ----------
+         index : int
+             Logical index to query.
+
+         Returns
+         -------
+         list[bytes]
+             Available keys at the index.
+
+         Raises
+         ------
+         KeyError
+             If the index does not exist.
+         """
+         with self.env.begin() as txn:
+             _, prefix, keys_to_fetch = self._get_full_keys(txn, index)
+
+             # Extract field names from full keys
+             return [key[len(prefix) :] for key in keys_to_fetch]
+
+     def get(self, index: int, keys: list[bytes] | None = None) -> dict[bytes, bytes]:
+         """
+         Get data at index, optionally filtering to specific keys.
+
+         Parameters
+         ----------
+         index : int
+             Logical index to retrieve.
+         keys : list[bytes], optional
+             Keys to retrieve. If None, returns all keys.
+
+         Returns
+         -------
+         dict[bytes, bytes]
+             Key-value pairs. If keys provided, only existing keys are returned.
+
+         Raises
+         ------
+         KeyError
+             If the index does not exist.
+         """
+         with self.env.begin() as txn:
+             _, prefix, keys_to_fetch = self._get_full_keys(txn, index)
+
+             # Filter keys if requested
+             if keys is not None:
+                 keys_set = set(keys)
+                 # Filter to only the requested keys
+                 keys_to_fetch = [
+                     k for k in keys_to_fetch if k[len(prefix) :] in keys_set
+                 ]
+
+             # Use getmulti for efficient batch retrieval
+             result = {}
+             if keys_to_fetch:
+                 cursor = txn.cursor()
+                 for key, value in cursor.getmulti(keys_to_fetch):
+                     # Extract the field name after the sort_key prefix
+                     field_name = key[len(prefix) :]
+                     result[field_name] = value
+
+             return result
+
+     def __delitem__(self, key: int) -> None:
+         with self.env.begin(write=True) as txn:
+             current_count = self._get_count(txn)
+
+             if key < 0 or key >= current_count:
+                 raise IndexError(f"Index {key} out of range [0, {current_count})")
+
+             # Get the sort key for this index and data keys before deleting mapping
+             sort_key = self._get_mapping(txn, key)
+             if sort_key is None:
+                 raise KeyError(f"Index {key} not found")
+
+             # Get the data keys to delete before modifying mappings
+             _, _, keys_to_delete = self._get_full_keys(txn, key)
+
+             # Collect all mappings that need to be shifted
+             # We need to shift indices [key+1, key+2, ..., count-1] down by 1
+             mappings_to_shift = []
+             for i in range(key + 1, current_count):
+                 sk = self._get_mapping(txn, i)
+                 if sk is not None:
+                     mappings_to_shift.append((i, sk))
+
+             # Delete the mapping for the deleted index
+             self._delete_mapping(txn, key)
+
+             # Shift all subsequent mappings down by 1
+             # Delete old mappings first, then write new ones
+             for old_index, sk in mappings_to_shift:
+                 self._delete_mapping(txn, old_index)
+
+             for old_index, sk in mappings_to_shift:
+                 new_index = old_index - 1
+                 self._set_mapping(txn, new_index, sk)
+
+             # Delete the data keys
+             for k in keys_to_delete:
+                 txn.delete(k)
+
+             # Delete metadata for field keys
+             self._delete_field_keys_metadata(txn, sort_key)
+
+             # Update count
+             self._set_count(txn, current_count - 1)
+
+     def insert(self, index: int, input: dict[bytes, bytes]) -> None:
+         with self.env.begin(write=True) as txn:
+             current_count = self._get_count(txn)
+
+             # Clamp index to valid range [0, count]
+             if index < 0:
+                 index = 0
+             if index > current_count:
+                 index = current_count
+
+             # Collect all mappings that need to be shifted right
+             # We need to shift indices [index, index+1, ..., count-1] up by 1
+             mappings_to_shift = []
+             for i in range(index, current_count):
+                 sk = self._get_mapping(txn, i)
+                 if sk is not None:
+                     mappings_to_shift.append((i, sk))
+
+             # Shift all mappings up by 1
+             # Do this in reverse order to avoid conflicts
+             # Delete old mappings first, then write new ones
+             for old_index, sk in mappings_to_shift:
+                 self._delete_mapping(txn, old_index)
+
+             for old_index, sk in reversed(mappings_to_shift):
+                 new_index = old_index + 1
+                 self._set_mapping(txn, new_index, sk)
+
+             # Allocate a new sort key for the new item
+             sort_key = self._allocate_sort_key(txn)
+             self._set_mapping(txn, index, sort_key)
+
+             # Write the new data with sort key prefix using putmulti
+             sort_key_str = str(sort_key).encode()
+             items_to_insert = [
+                 (self.prefix + sort_key_str + b"-" + key, value)
+                 for key, value in input.items()
+             ]
+             if items_to_insert:
+                 cursor = txn.cursor()
+                 cursor.putmulti(items_to_insert, dupdata=False)
+
+             # Store metadata for field keys
+             field_keys = list(input.keys())
+             self._set_field_keys_metadata(txn, sort_key, field_keys)
+
+             # Update count
+             self._set_count(txn, current_count + 1)
+
+     def extend(self, items: list[dict[bytes, bytes]]) -> None:
+         """
+         Efficiently extend the sequence with multiple items using bulk operations.
+
+         Parameters
+         ----------
+         items : list[dict[bytes, bytes]]
+             Dictionaries to append.
+         """
+         if not items:
+             return
+
+         with self.env.begin(write=True) as txn:
+             current_count = self._get_count(txn)
+
+             # Prepare all items with their mappings, data keys, and metadata
+             items_to_insert = []
+
+             for idx, item in enumerate(items):
+                 logical_index = current_count + idx
+                 sort_key = self._allocate_sort_key(txn)
+                 sort_key_str = str(sort_key).encode()
+
+                 # Add mapping entry
+                 mapping_key = self.prefix + b"__idx__" + str(logical_index).encode()
+                 items_to_insert.append((mapping_key, sort_key_str))
+
+                 # Collect field keys and add data entries
+                 field_keys = list(item.keys())
+                 for field_key, field_value in item.items():
+                     data_key = self.prefix + sort_key_str + b"-" + field_key
+                     items_to_insert.append((data_key, field_value))
+
+                 # Add metadata entry (inline with other inserts for single putmulti)
+                 metadata_key = self.prefix + b"__keys__" + sort_key_str
+                 metadata_value = b"\n".join(field_keys)
+                 items_to_insert.append((metadata_key, metadata_value))
+
+             # Bulk insert all items (mappings + data + metadata) in one call
+             cursor = txn.cursor()
+             cursor.putmulti(items_to_insert, dupdata=False)
+
+             # Update count
+             self._set_count(txn, current_count + len(items))
+
+     def __iter__(self):
+         for i in range(len(self)):
+             yield self[i]
+
+     def __len__(self) -> int:
+         with self.env.begin() as txn:
+             return self._get_count(txn)
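
BytesIO maps each logical index to a stable sort key, so insert and delete only rewrite the small __idx__ mapping entries while each payload stays under its original sort key. A sketch of the resulting list-like semantics, driven through ASEIO (database path and molecules are illustrative):

```python
from ase.build import molecule

from asebytes import ASEIO

db = ASEIO("mutable.lmdb")
db.extend([molecule("H2O"), molecule("NH3"), molecule("CH4")])

db[1] = molecule("C2H6")       # overwrite in place; old data keys are deleted first
db.insert(0, molecule("CO2"))  # later logical indices are remapped, payloads stay put
del db[2]                      # remaining indices shift down by one

print(len(db))  # 3
print([atoms.get_chemical_formula() for atoms in db])
```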
@@ -0,0 +1,88 @@
+ import msgpack
+ import msgpack_numpy as m
+ import numpy as np
+
+
+ def get_metadata(data: dict[bytes, bytes]) -> dict[str, dict]:
+     """
+     Extract type, shape, and dtype information from serialized data.
+
+     Parameters
+     ----------
+     data : dict[bytes, bytes]
+         Dictionary with byte keys and msgpack-serialized byte values.
+
+     Returns
+     -------
+     dict[str, dict]
+         Mapping of decoded string keys to metadata dictionaries.
+         Each metadata dict contains:
+
+         - For ndarrays: {"type": "ndarray", "dtype": str, "shape": tuple}
+         - For numpy scalars: {"type": "numpy_scalar", "dtype": str}
+         - For Python types: {"type": typename} where typename is one of
+           "str", "int", "float", "bool", "NoneType", "list", "dict"
+     """
+     metadata = {}
+
+     for key_bytes, value_bytes in data.items():
+         # Decode the key from bytes to string
+         key = key_bytes.decode("utf-8")
+
+         # Deserialize the value
+         value = msgpack.unpackb(value_bytes, object_hook=m.decode)
+
+         # Determine type and extract metadata
+         metadata[key] = _get_value_metadata(value)
+
+     return metadata
+
+
+ def _get_value_metadata(value) -> dict:
+     """
+     Extract metadata for a single value.
+
+     Parameters
+     ----------
+     value : Any
+         Deserialized value.
+
+     Returns
+     -------
+     dict
+         Type information and additional metadata.
+     """
+     # Check for NumPy array
+     if isinstance(value, np.ndarray):
+         return {
+             "type": "ndarray",
+             "dtype": str(value.dtype),
+             "shape": value.shape,
+         }
+
+     # Check for NumPy scalar types
+     if isinstance(value, np.generic):
+         return {
+             "type": "numpy_scalar",
+             "dtype": value.dtype.name,
+         }
+
+     if isinstance(value, bytes):
+         return {"type": "bytes"}
+     elif value is None:
+         return {"type": "NoneType"}
+     elif isinstance(value, bool):
+         return {"type": "bool"}
+     elif isinstance(value, int):
+         return {"type": "int"}
+     elif isinstance(value, float):
+         return {"type": "float"}
+     elif isinstance(value, str):
+         return {"type": "str"}
+     elif isinstance(value, list):
+         return {"type": "list"}
+     elif isinstance(value, dict):
+         return {"type": "dict"}
+     else:
+         # Fallback for unknown types
+         return {"type": type(value).__name__}
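
get_metadata() reports type, dtype and shape information per stored key without reconstructing the Atoms object. A short sketch (ase.build.bulk is used only to produce test data; the exact dtypes depend on the platform):

```python
from ase.build import bulk

from asebytes import encode, get_metadata

copper = bulk("Cu", "fcc", a=3.6)
meta = get_metadata(encode(copper))

print(meta["arrays.positions"])  # e.g. {'type': 'ndarray', 'dtype': 'float64', 'shape': (1, 3)}
print(meta["pbc"])               # e.g. {'type': 'ndarray', 'dtype': 'bool', 'shape': (3,)}
```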