PyPI - flatdata-py - Versions diffs - 0.4.10__tar.gz → 0.4.12__tar.gz - Mend

flatdata-py 0.4.10__tar.gz → 0.4.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

{flatdata_py-0.4.10 → flatdata_py-0.4.12}/.gitignore RENAMED Viewed

@@ -4,10 +4,6 @@ build
 venv*
 **/.vscode/**
 **/.idea/**
-**/*_generated.go
-**/coverage.out
-**/flatdata-fuzz.zip
-**/corpus/**
 **/dist/**
 *.egg-info
 .DS_Store

{flatdata_py-0.4.10 → flatdata_py-0.4.12}/PKG-INFO RENAMED Viewed

@@ -1,13 +1,14 @@
 Metadata-Version: 2.4
 Name: flatdata-py
-Version: 0.4.10
+Version: 0.4.12
 Summary: Python 3 implementation of Flatdata
 Project-URL: Homepage, https://github.com/heremaps/flatdata
 Author: Flatdata Developers
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Dist: flatdata-generator==0.4.10
+Requires-Python: >=3.8
+Requires-Dist: flatdata-generator==0.4.12
 Requires-Dist: numpy
 Requires-Dist: pandas
 Provides-Extra: inspector
@@ -34,6 +35,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema),
 flatdata-generator --gen py --schema locations.flatdata --output-file locations.py
 ```
+## Performance tips
+`flatdata-py` supports two data access patterns with very different performance characteristics on large archives.
+Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations:
+```python
+count = sum(1 for x in archive.links if x.speed_limit > 100)
+```
+For bulk operations, use the vectorized access methods that read fields directly into NumPy arrays:
+```python
+# single column access, returns a pandas DataFrame
+df = archive.links.speed_limit
+count = len(df[df['speed_limit'] > 100])
+# full NumPy structured array with all fields
+arr = archive.links.to_numpy()
+count = int(np.sum(arr['speed_limit'] > 100))
+# slices work too
+arr = archive.links[1000:2000].to_numpy()
+df = archive.links[::10].to_data_frame()
+```
+* Use `vector.field_name` (column access) when you only need one or a few fields.
+* Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once.
+* Use `vector[i].field` for random access to individual elements.
+* The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM.
 ## Using the inspector
 `flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive:

{flatdata_py-0.4.10 → flatdata_py-0.4.12}/README.md RENAMED Viewed

@@ -18,6 +18,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema),
 flatdata-generator --gen py --schema locations.flatdata --output-file locations.py
 ```
+## Performance tips
+`flatdata-py` supports two data access patterns with very different performance characteristics on large archives.
+Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations:
+```python
+count = sum(1 for x in archive.links if x.speed_limit > 100)
+```
+For bulk operations, use the vectorized access methods that read fields directly into NumPy arrays:
+```python
+# single column access, returns a pandas DataFrame
+df = archive.links.speed_limit
+count = len(df[df['speed_limit'] > 100])
+# full NumPy structured array with all fields
+arr = archive.links.to_numpy()
+count = int(np.sum(arr['speed_limit'] > 100))
+# slices work too
+arr = archive.links[1000:2000].to_numpy()
+df = archive.links[::10].to_data_frame()
+```
+* Use `vector.field_name` (column access) when you only need one or a few fields.
+* Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once.
+* Use `vector[i].field` for random access to individual elements.
+* The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM.
 ## Using the inspector
 `flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive:

{flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/archive.py RENAMED Viewed

@@ -3,17 +3,27 @@
  See the LICENSE file in the root of this project for license details.
 '''
-from collections import namedtuple
+from __future__ import annotations
+from typing import Any, NamedTuple, TYPE_CHECKING
 import pandas as pd
 from .errors import MissingResourceError, SchemaMismatchError
-ResourceSignature = namedtuple("ResourceSignature",
-                               ["container", "initializer", "schema", "is_optional", "doc"])
+if TYPE_CHECKING:
+    from .resources import ReadStorage, ResourceBase
+class ResourceSignature(NamedTuple):
+    container: type[ResourceBase] | type[Archive]
+    initializer: Any
+    schema: str
+    is_optional: bool
+    doc: str
-def _is_archive_signature(resource_signature):
-    return resource_signature.container == Archive
+def _is_archive_signature(resource_signature: ResourceSignature) -> bool:
+    return bool(resource_signature.container == Archive)
 _SCHEMA_EXT = ".schema"
@@ -23,35 +33,38 @@ class Archive:
     Archive class. Entry point to Flatdata.
     Provides access to flatdata resources and verifies archive/resource schemas on opening.
     """
+    _NAME: str
+    _SCHEMA: str
+    _RESOURCES: dict[str, ResourceSignature]
-    def __init__(self, resource_storage):
+    def __init__(self, resource_storage: ReadStorage) -> None:
         """
         Opens archive from a given resource storage.
         :raises flatdata.errors.CorruptArchiveError
         :raises flatdata.errors.SchemaMismatchError
         :param resource_storage: Resource storage to use.
         """
-        self._resource_storage = resource_storage
-        self._loaded_resources = {}
+        self._resource_storage: ReadStorage = resource_storage
+        self._loaded_resources: dict[str, Any] = {}
         # Preload resources and check their schemas
         for name, _ in sorted(list(self._RESOURCES.items())):
             self.__getattr__(name)
-    def __getattr__(self, name):
-        if name not in list(self._RESOURCES.keys()):
+    def __getattr__(self, name: str) -> Any:
+        if name not in self._RESOURCES:
             raise AttributeError("Resource %s not defined in archive." % name)
-        if name not in list(self._loaded_resources.keys()):
+        if name not in self._loaded_resources:
             self._loaded_resources[name] = self._open_resource(name)
         return self._loaded_resources[name]
-    def __dir__(self):
+    def __dir__(self) -> list[str]:
         return list(self._RESOURCES.keys()) + ['schema']
-    def __repr__(self):
-        return self.to_data_frame().__repr__()
+    def __repr__(self) -> str:
+        return repr(self.to_data_frame())
-    def to_data_frame(self):
+    def to_data_frame(self) -> pd.DataFrame:
         result = []
         for name, signature in self._RESOURCES.items():
             resource = self.__getattr__(name)
@@ -62,34 +75,34 @@ class Archive:
                             columns=["Name", "Type", "Optional", "SizeInBytes", "Size"])
     @classmethod
-    def name(cls):
+    def name(cls) -> str:
         return cls._NAME
     @classmethod
-    def schema(cls):
+    def schema(cls) -> str:
         return cls._SCHEMA
     @classmethod
-    def resource_schema(cls, resource):
-        return cls._RESOURCES[resource].schema
+    def resource_schema(cls, resource: str) -> str:
+        return str(cls._RESOURCES[resource].schema)
     @classmethod
-    def open(cls, storage, name, initializer, is_optional=False):
+    def open(cls, storage: ReadStorage, name: str, initializer: type[Archive], is_optional: bool = False) -> Archive | None:
         nested_storage = storage.get(name, is_optional)
         assert nested_storage is not None or is_optional
         if nested_storage is None:
             return None
         return initializer(nested_storage)
-    def size_in_bytes(self):
+    def size_in_bytes(self) -> int:
         return sum(resource_value.size_in_bytes() for resource_value in
                    (self.__getattr__(resource) for resource in self._RESOURCES.keys())
                    if resource_value)
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self._RESOURCES)
-    def _schema_validated_resource_signature(self, name):
+    def _schema_validated_resource_signature(self, name: str) -> ResourceSignature | None:
         resource_signature = self._RESOURCES[name]
         # We check only schema for non-subarchives, since the subarchives schema is checked,
         # when it is initialized.
@@ -103,7 +116,7 @@ class Archive:
                 return None
         return resource_signature
-    def _open_resource(self, name):
+    def _open_resource(self, name: str) -> Any:
         resource_signature = self._schema_validated_resource_signature(name)
         if resource_signature:
             resource = resource_signature.container.open(storage=self._resource_storage,
@@ -116,7 +129,7 @@ class Archive:
         return None
     @staticmethod
-    def _check_non_subarchive_schema(name, resource_signature, storage):
+    def _check_non_subarchive_schema(name: str, resource_signature: ResourceSignature, storage: Any) -> None:
         actual_schema = bytes(storage).decode()
         if actual_schema != resource_signature.schema:
             raise SchemaMismatchError(

{flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/archive_builder.py RENAMED Viewed

@@ -3,8 +3,10 @@
  See the LICENSE file in the root of this project for license details.
 '''
-from collections import namedtuple
+from __future__ import annotations
 import os
+from typing import Any, NamedTuple, Protocol, TYPE_CHECKING
 from .errors import IndexWriterError, MissingFieldError, UnknownFieldError, \
     UnknownStructureError, UnknownResourceError, ResourceAlreadySetError
@@ -12,10 +14,24 @@ from .errors import IndexWriterError, MissingFieldError, UnknownFieldError, \
 from .resources import Instance, Vector, Multivector, RawData
 from .data_access import write_value
+if TYPE_CHECKING:
+    from .resource_storage import _Resource
+    from .structure import Structure
 _SCHEMA_EXT = ".schema"
-ResourceSignature = namedtuple("ResourceSignature",
-                               ["container", "initializer", "schema", "is_optional", "doc"])
+class ResourceSignature(NamedTuple):
+    container: type
+    initializer: Any
+    schema: str
+    is_optional: bool
+    doc: str
+class WriteStorage(Protocol):
+    def get(self, resource_name: str, is_subarchive: bool = False) -> Any: ...
+    def close(self) -> None: ...
 class IndexWriter:
@@ -23,7 +39,7 @@ class IndexWriter:
     IndexWriter class. Only applicable when multivector is present in archive schema.
     """
-    def __init__(self, name, size, resource_storage):
+    def __init__(self, name: str, size: int, resource_storage: WriteStorage) -> None:
         """
         Create IndexWriter class.
@@ -36,9 +52,9 @@ class IndexWriter:
         self._name = name
         self._index_size = size
-        self._fout = resource_storage.get(f'{self._name}_index', False)
+        self._fout: _Resource = resource_storage.get(f'{self._name}_index', False)
-    def add(self, index):
+    def add(self, index: int) -> None:
         """
         Convert index(number) to bytearray and add to in memory store
         """
@@ -46,7 +62,7 @@ class IndexWriter:
                                           byteorder="little", signed=False)
         self._fout.write(index_bytes)
-    def finish(self):
+    def finish(self) -> None:
         """
         Complete index resource by adding size and padding followed by writing to file
         """
@@ -60,30 +76,33 @@ class ArchiveBuilder:
     ArchiveBuilder class. Entry point to writing Flatdata.
     Provides methods to create flatdata archives.
     """
+    _NAME: str
+    _SCHEMA: str
+    _RESOURCES: dict[str, ResourceSignature]
-    def __init__(self, resource_storage, path=""):
+    def __init__(self, resource_storage: WriteStorage, path: str = "") -> None:
         """
         Opens archive from a given resource writer.
         :param resource_storage: storage manager to store and write to disc
         :param path: file path where archive is created
         """
         self._path = os.path.join(path, self._NAME)
-        self._resource_storage = resource_storage
+        self._resource_storage: WriteStorage = resource_storage
         self._write_archive_signature()
         self._write_archive_schema()
         self._resources_written = [f"{self._NAME}.archive"]
     @classmethod
-    def name(cls):
+    def name(cls) -> str:
         '''Returns archive name'''
         return cls._NAME
     @classmethod
-    def schema(cls):
+    def schema(cls) -> str:
         '''Returns archive schema'''
         return cls._SCHEMA
-    def _write_raw_data(self, name, data):
+    def _write_raw_data(self, name: str, data: bytes | bytearray) -> None:
         '''
         Helper function to write data
@@ -94,7 +113,7 @@ class ArchiveBuilder:
         storage.write(data)
         storage.close()
-    def _write_schema(self, name):
+    def _write_schema(self, name: str) -> None:
         '''
         Writes resource schema
@@ -103,29 +122,29 @@ class ArchiveBuilder:
         self._write_raw_data(f"{name}.schema", bytes(
             self._RESOURCES[name].schema, 'utf-8'))
-    def _write_archive_signature(self):
+    def _write_archive_signature(self) -> None:
         '''Writes archive's signature'''
         self._write_raw_data(f"{self._NAME}.archive", b'\x00' * 16)
-    def _write_archive_schema(self):
+    def _write_archive_schema(self) -> None:
         '''Writes archive schema'''
         self._write_raw_data(
             f"{self._NAME}.archive.schema", bytes(self._SCHEMA, 'utf-8'))
-    def _write_index_schema(self, resource_name, schema):
+    def _write_index_schema(self, resource_name: str, schema: str) -> None:
         self._write_raw_data(
             f"{resource_name}_index.schema", bytes(schema, 'utf-8'))
-    def subarchive(self, name):
+    def subarchive(self, name: str) -> 'ArchiveBuilder':
         """
         Returns an archive builder for the sub-archive `name`.
         :raises $name_not_subarchive_error
         :param name: name of the sub-archive
         """
-        NotImplemented
+        raise NotImplementedError(f"subarchive '{name}' is not implemented")
     @classmethod
-    def __validate_structure_fields(cls, name, struct, initializer):
+    def __validate_structure_fields(cls, name: str, struct: dict[str, Any], initializer: type[Structure]) -> None:
         '''
         Validates whether passed object has all required fields
@@ -142,7 +161,7 @@ class ArchiveBuilder:
             if key not in initializer._FIELD_KEYS:
                 raise UnknownFieldError(key, name)
-    def __set_instance(self, storage, name, value):
+    def __set_instance(self, storage: _Resource, name: str, value: dict[str, Any]) -> None:
         '''
         Creates and writes instance type resource
@@ -160,7 +179,7 @@ class ArchiveBuilder:
         storage.write(bout)
-    def __set_vector(self, storage, name, vector):
+    def __set_vector(self, storage: _Resource, name: str, vector: list[dict[str, Any]]) -> None:
         '''
         Creates and writes vector resource
@@ -179,7 +198,7 @@ class ArchiveBuilder:
                             field.is_signed, value[key])
             storage.write(bout)
-    def __set_multivector(self, storage, name, value):
+    def __set_multivector(self, storage: _Resource, name: str, value: list[list[dict[str, Any]]]) -> None:
         '''
         Creates and writes multivector resource
@@ -193,10 +212,10 @@ class ArchiveBuilder:
         for index, obj_type in enumerate(initializer_list[1:]):
             initializers[obj_type._NAME] = (index, obj_type)
-        def valid_structure_name(_obj):
+        def valid_structure_name(_obj: dict[str, Any]) -> bool:
             return _obj['name'] in [_initializer._NAME for _initializer in initializer_list[1:]]
-        def validate_fields(_obj):
+        def validate_fields(_obj: dict[str, Any]) -> None:
             matched_obj_list = [
                 _initializer for _initializer in initializer_list[1:] \
                     if _initializer._NAME == _obj['name']]
@@ -248,7 +267,7 @@ class ArchiveBuilder:
         self._resources_written.append(name)
         self._resources_written.append(f'{name}_index')
-    def set(self, name, value):
+    def set(self, name: str, value: Any) -> None:
         """
         Write a resource for this archive at once.
         Can only be done once. `set` and `start` can't be used for the same resource.
@@ -284,7 +303,7 @@ class ArchiveBuilder:
         self._resources_written.append(name)
-    def finish(self):
+    def finish(self) -> None:
         """
         Closes the storage manager
         """

flatdata_py-0.4.12/flatdata/lib/data_access.py ADDED Viewed

@@ -0,0 +1,177 @@
+'''
+ Copyright (c) 2017 HERE Europe B.V.
+ See the LICENSE file in the root of this project for license details.
+'''
+from __future__ import annotations
+import mmap
+from collections.abc import Callable
+from typing import Union
+import numpy as np
+from numpy.typing import NDArray
+ReadableBuffer = Union[bytes, bytearray, memoryview, mmap.mmap]
+# Sign bits cache for the value reading.
+_SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]
+def make_field_reader(offset_bits: int, num_bits: int, is_signed: bool) -> Callable[[ReadableBuffer, int], int]:
+    """Build a specialized closure for reading a single field from a structure.
+    Returns a function reader(data, pos_bytes) that reads the field value
+    from ``data`` at byte position ``pos_bytes``.  All constants (byte offset,
+    bit shift, mask, sign handling) are pre-computed and captured by the
+    closure so the hot path does minimal work.
+    """
+    offset_bytes, offset_extra = divmod(offset_bits, 8)
+    total_bytes = (num_bits + 7) // 8
+    end_byte = offset_bytes + total_bytes
+    mask = (1 << num_bits) - 1
+    needs_extra = (total_bytes * 8 - offset_extra) < num_bits
+    extra_shift = total_bytes * 8 - offset_extra
+    if num_bits == 1:
+        bit_mask = 1 << offset_extra
+        def reader(data: ReadableBuffer, pos: int) -> int:
+            return int((data[pos + offset_bytes] & bit_mask) != 0)
+        return reader
+    if is_signed:
+        sign_bit = _SIGN_BITS[num_bits]
+        sign_mask = sign_bit - 1
+        if needs_extra:
+            def reader(data: ReadableBuffer, pos: int) -> int:
+                result = int.from_bytes(
+                    data[pos + offset_bytes: pos + end_byte], byteorder="little")
+                result >>= offset_extra
+                result |= data[pos + end_byte] << extra_shift
+                result &= mask
+                return int((result & sign_mask) - (result & sign_bit))
+        elif offset_extra:
+            def reader(data: ReadableBuffer, pos: int) -> int:
+                result = (int.from_bytes(
+                    data[pos + offset_bytes: pos + end_byte],
+                    byteorder="little") >> offset_extra) & mask
+                return (result & sign_mask) - (result & sign_bit)
+        else:
+            def reader(data: ReadableBuffer, pos: int) -> int:
+                result = int.from_bytes(
+                    data[pos + offset_bytes: pos + end_byte],
+                    byteorder="little") & mask
+                return (result & sign_mask) - (result & sign_bit)
+        return reader
+    # Unsigned paths
+    if needs_extra:
+        def reader(data: ReadableBuffer, pos: int) -> int:
+            result = int.from_bytes(
+                data[pos + offset_bytes: pos + end_byte], byteorder="little")
+            result >>= offset_extra
+            result |= data[pos + end_byte] << extra_shift
+            return int(result & mask)
+    elif offset_extra:
+        def reader(data: ReadableBuffer, pos: int) -> int:
+            return (int.from_bytes(
+                data[pos + offset_bytes: pos + end_byte],
+                byteorder="little") >> offset_extra) & mask
+    else:
+        def reader(data: ReadableBuffer, pos: int) -> int:
+            return int.from_bytes(
+                data[pos + offset_bytes: pos + end_byte],
+                byteorder="little") & mask
+    return reader
+def read_field_vectorized(raw_bytes_2d: NDArray[np.uint8], field_offset_bits: int, field_width_bits: int, is_signed: bool) -> NDArray[np.uint64] | NDArray[np.int64]:
+    """Read a bit-packed field from all elements at once, returning a numpy array.
+    :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
+    :param field_offset_bits: bit offset of the field within each element
+    :param field_width_bits: width of the field in bits (max 64)
+    :param is_signed: whether to sign-extend the result
+    :return: numpy array of field values
+    """
+    if field_width_bits == 1:
+        byte_idx = field_offset_bits // 8
+        bit_idx = field_offset_bits % 8
+        return ((raw_bytes_2d[:, byte_idx].astype(np.uint64) >> np.uint64(bit_idx)) &
+                np.uint64(1))
+    byte_start = field_offset_bits // 8
+    bit_shift = field_offset_bits % 8
+    bytes_needed = (bit_shift + field_width_bits + 7) // 8
+    # Use Python int arithmetic for the shift to avoid numpy overflow,
+    # then broadcast back to the array.
+    result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
+    for b in range(min(bytes_needed, 8)):
+        result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
+    result >>= np.uint64(bit_shift)
+    # If the field spans more than 8 bytes (unaligned 64-bit field), merge the extra byte.
+    bits_so_far = 8 * min(bytes_needed, 8) - bit_shift
+    if bits_so_far < field_width_bits and bytes_needed > 8:
+        extra = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
+        result |= extra << np.uint64(bits_so_far)
+    if field_width_bits < 64:
+        result &= np.uint64((1 << field_width_bits) - 1)
+    if is_signed:
+        if field_width_bits == 64:
+            return result.view(np.int64)
+        sign_bit = np.uint64(1 << (field_width_bits - 1))
+        offset = -(1 << field_width_bits)
+        signed = result.astype(np.int64) + np.int64(offset)
+        result = np.where(result & sign_bit, signed, result.astype(np.int64))  # type: ignore[assignment, unused-ignore]
+    return result
+def read_value(data: ReadableBuffer, offset_bits: int, num_bits: int, is_signed: bool) -> int:
+    """Read a bit-packed value from data at the given bit offset.
+    This is a convenience wrapper around :func:`make_field_reader` for one-off
+    reads.  For repeated reads of the same field, prefer building a reader once
+    with ``make_field_reader`` and reusing it.
+    """
+    reader = make_field_reader(offset_bits, num_bits, is_signed)
+    return reader(data, 0)
+def write_value(data: bytearray, offset_bits: int, num_bits: int, is_signed: bool, value: int) -> None:
+    assert num_bits <= 64, f'Number of bits to write is greater than 64'
+    offset_bytes, offset_extra_bits = divmod(offset_bits, 8)
+    total_bytes = (num_bits + 7) // 8
+    if num_bits == 1:
+        if value == 1:
+            data[offset_bytes] |= 1 << offset_extra_bits
+        else:
+            data[offset_bytes] &= ~(1 << offset_extra_bits)
+        return
+    mask = (1 << num_bits) - 1
+    value <<= offset_extra_bits
+    value &= mask << offset_extra_bits
+    value_in_little_endian = value.to_bytes(total_bytes + 1, byteorder="little", signed=is_signed)
+    surrounding_bits = data[offset_bytes] & ((1 << offset_bits) - 1)
+    byte_idx = 0
+    data[offset_bytes] = value_in_little_endian[byte_idx]
+    data[offset_bytes] |= surrounding_bits
+    byte_idx += 1
+    while byte_idx < total_bytes:
+        data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx]
+        byte_idx += 1
+    bits_written = total_bytes * 8 - offset_extra_bits
+    if bits_written < num_bits:
+        surrounding_bits = data[offset_bytes + byte_idx] & ~((1 << offset_bits) - 1)
+        data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx] & ((1 << (8 - (bits_written % 8))) - 1)
+        data[offset_bytes + byte_idx] |= surrounding_bits

flatdata-py 0.4.10__tar.gz → 0.4.12__tar.gz

flatdata-py 0.4.10__tar.gz → 0.4.12__tar.gz