flatdata-py 0.4.10__tar.gz → 0.4.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/.gitignore +0 -4
  2. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/PKG-INFO +34 -2
  3. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/README.md +31 -0
  4. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/archive.py +38 -25
  5. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/archive_builder.py +45 -26
  6. flatdata_py-0.4.12/flatdata/lib/data_access.py +177 -0
  7. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/errors.py +15 -13
  8. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/file_resource_storage.py +7 -5
  9. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/file_resource_writer.py +12 -6
  10. flatdata_py-0.4.12/flatdata/lib/flatdata_writer.py +106 -0
  11. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/inspector.py +14 -7
  12. flatdata_py-0.4.12/flatdata/lib/py.typed +0 -0
  13. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/resource_storage.py +42 -20
  14. flatdata_py-0.4.12/flatdata/lib/resources.py +305 -0
  15. flatdata_py-0.4.12/flatdata/lib/structure.py +99 -0
  16. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/tar_archive_resource_storage.py +7 -5
  17. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/writer.py +2 -5
  18. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/pyproject.toml +21 -2
  19. flatdata_py-0.4.10/flatdata/lib/data_access.py +0 -64
  20. flatdata_py-0.4.10/flatdata/lib/flatdata_writer.py +0 -72
  21. flatdata_py-0.4.10/flatdata/lib/resources.py +0 -239
  22. flatdata_py-0.4.10/flatdata/lib/structure.py +0 -78
  23. {flatdata_py-0.4.10 → flatdata_py-0.4.12}/flatdata/lib/__init__.py +0 -0
@@ -4,10 +4,6 @@ build
4
4
  venv*
5
5
  **/.vscode/**
6
6
  **/.idea/**
7
- **/*_generated.go
8
- **/coverage.out
9
- **/flatdata-fuzz.zip
10
- **/corpus/**
11
7
  **/dist/**
12
8
  *.egg-info
13
9
  .DS_Store
@@ -1,13 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: flatdata-py
3
- Version: 0.4.10
3
+ Version: 0.4.12
4
4
  Summary: Python 3 implementation of Flatdata
5
5
  Project-URL: Homepage, https://github.com/heremaps/flatdata
6
6
  Author: Flatdata Developers
7
7
  Classifier: License :: OSI Approved :: Apache Software License
8
8
  Classifier: Operating System :: OS Independent
9
9
  Classifier: Programming Language :: Python :: 3
10
- Requires-Dist: flatdata-generator==0.4.10
10
+ Requires-Python: >=3.8
11
+ Requires-Dist: flatdata-generator==0.4.12
11
12
  Requires-Dist: numpy
12
13
  Requires-Dist: pandas
13
14
  Provides-Extra: inspector
@@ -34,6 +35,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema),
34
35
  flatdata-generator --gen py --schema locations.flatdata --output-file locations.py
35
36
  ```
36
37
 
38
+ ## Performance tips
39
+
40
+ `flatdata-py` supports two data access patterns with very different performance characteristics on large archives.
41
+
42
+ Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations:
43
+
44
+ ```python
45
+ count = sum(1 for x in archive.links if x.speed_limit > 100)
46
+ ```
47
+
48
+ For bulk operations, use the vectorized access methods that read fields directly into NumPy arrays:
49
+
50
+ ```python
51
+ # single column access, returns a pandas DataFrame
52
+ df = archive.links.speed_limit
53
+ count = len(df[df['speed_limit'] > 100])
54
+
55
+ # full NumPy structured array with all fields
56
+ arr = archive.links.to_numpy()
57
+ count = int(np.sum(arr['speed_limit'] > 100))
58
+
59
+ # slices work too
60
+ arr = archive.links[1000:2000].to_numpy()
61
+ df = archive.links[::10].to_data_frame()
62
+ ```
63
+
64
+ * Use `vector.field_name` (column access) when you only need one or a few fields.
65
+ * Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once.
66
+ * Use `vector[i].field` for random access to individual elements.
67
+ * The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM.
68
+
37
69
  ## Using the inspector
38
70
 
39
71
  `flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive:
@@ -18,6 +18,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema),
18
18
  flatdata-generator --gen py --schema locations.flatdata --output-file locations.py
19
19
  ```
20
20
 
21
+ ## Performance tips
22
+
23
+ `flatdata-py` supports two data access patterns with very different performance characteristics on large archives.
24
+
25
+ Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations:
26
+
27
+ ```python
28
+ count = sum(1 for x in archive.links if x.speed_limit > 100)
29
+ ```
30
+
31
+ For bulk operations, use the vectorized access methods that read fields directly into NumPy arrays:
32
+
33
+ ```python
34
+ # single column access, returns a pandas DataFrame
35
+ df = archive.links.speed_limit
36
+ count = len(df[df['speed_limit'] > 100])
37
+
38
+ # full NumPy structured array with all fields
39
+ arr = archive.links.to_numpy()
40
+ count = int(np.sum(arr['speed_limit'] > 100))
41
+
42
+ # slices work too
43
+ arr = archive.links[1000:2000].to_numpy()
44
+ df = archive.links[::10].to_data_frame()
45
+ ```
46
+
47
+ * Use `vector.field_name` (column access) when you only need one or a few fields.
48
+ * Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once.
49
+ * Use `vector[i].field` for random access to individual elements.
50
+ * The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM.
51
+
21
52
  ## Using the inspector
22
53
 
23
54
  `flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive:
@@ -3,17 +3,27 @@
3
3
  See the LICENSE file in the root of this project for license details.
4
4
  '''
5
5
 
6
- from collections import namedtuple
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, NamedTuple, TYPE_CHECKING
7
9
 
8
10
  import pandas as pd
9
11
 
10
12
  from .errors import MissingResourceError, SchemaMismatchError
11
13
 
12
- ResourceSignature = namedtuple("ResourceSignature",
13
- ["container", "initializer", "schema", "is_optional", "doc"])
14
+ if TYPE_CHECKING:
15
+ from .resources import ReadStorage, ResourceBase
16
+
17
+
18
+ class ResourceSignature(NamedTuple):
19
+ container: type[ResourceBase] | type[Archive]
20
+ initializer: Any
21
+ schema: str
22
+ is_optional: bool
23
+ doc: str
14
24
 
15
- def _is_archive_signature(resource_signature):
16
- return resource_signature.container == Archive
25
+ def _is_archive_signature(resource_signature: ResourceSignature) -> bool:
26
+ return bool(resource_signature.container == Archive)
17
27
 
18
28
  _SCHEMA_EXT = ".schema"
19
29
 
@@ -23,35 +33,38 @@ class Archive:
23
33
  Archive class. Entry point to Flatdata.
24
34
  Provides access to flatdata resources and verifies archive/resource schemas on opening.
25
35
  """
36
+ _NAME: str
37
+ _SCHEMA: str
38
+ _RESOURCES: dict[str, ResourceSignature]
26
39
 
27
- def __init__(self, resource_storage):
40
+ def __init__(self, resource_storage: ReadStorage) -> None:
28
41
  """
29
42
  Opens archive from a given resource storage.
30
43
  :raises flatdata.errors.CorruptArchiveError
31
44
  :raises flatdata.errors.SchemaMismatchError
32
45
  :param resource_storage: Resource storage to use.
33
46
  """
34
- self._resource_storage = resource_storage
35
- self._loaded_resources = {}
47
+ self._resource_storage: ReadStorage = resource_storage
48
+ self._loaded_resources: dict[str, Any] = {}
36
49
 
37
50
  # Preload resources and check their schemas
38
51
  for name, _ in sorted(list(self._RESOURCES.items())):
39
52
  self.__getattr__(name)
40
53
 
41
- def __getattr__(self, name):
42
- if name not in list(self._RESOURCES.keys()):
54
+ def __getattr__(self, name: str) -> Any:
55
+ if name not in self._RESOURCES:
43
56
  raise AttributeError("Resource %s not defined in archive." % name)
44
- if name not in list(self._loaded_resources.keys()):
57
+ if name not in self._loaded_resources:
45
58
  self._loaded_resources[name] = self._open_resource(name)
46
59
  return self._loaded_resources[name]
47
60
 
48
- def __dir__(self):
61
+ def __dir__(self) -> list[str]:
49
62
  return list(self._RESOURCES.keys()) + ['schema']
50
63
 
51
- def __repr__(self):
52
- return self.to_data_frame().__repr__()
64
+ def __repr__(self) -> str:
65
+ return repr(self.to_data_frame())
53
66
 
54
- def to_data_frame(self):
67
+ def to_data_frame(self) -> pd.DataFrame:
55
68
  result = []
56
69
  for name, signature in self._RESOURCES.items():
57
70
  resource = self.__getattr__(name)
@@ -62,34 +75,34 @@ class Archive:
62
75
  columns=["Name", "Type", "Optional", "SizeInBytes", "Size"])
63
76
 
64
77
  @classmethod
65
- def name(cls):
78
+ def name(cls) -> str:
66
79
  return cls._NAME
67
80
 
68
81
  @classmethod
69
- def schema(cls):
82
+ def schema(cls) -> str:
70
83
  return cls._SCHEMA
71
84
 
72
85
  @classmethod
73
- def resource_schema(cls, resource):
74
- return cls._RESOURCES[resource].schema
86
+ def resource_schema(cls, resource: str) -> str:
87
+ return str(cls._RESOURCES[resource].schema)
75
88
 
76
89
  @classmethod
77
- def open(cls, storage, name, initializer, is_optional=False):
90
+ def open(cls, storage: ReadStorage, name: str, initializer: type[Archive], is_optional: bool = False) -> Archive | None:
78
91
  nested_storage = storage.get(name, is_optional)
79
92
  assert nested_storage is not None or is_optional
80
93
  if nested_storage is None:
81
94
  return None
82
95
  return initializer(nested_storage)
83
96
 
84
- def size_in_bytes(self):
97
+ def size_in_bytes(self) -> int:
85
98
  return sum(resource_value.size_in_bytes() for resource_value in
86
99
  (self.__getattr__(resource) for resource in self._RESOURCES.keys())
87
100
  if resource_value)
88
101
 
89
- def __len__(self):
102
+ def __len__(self) -> int:
90
103
  return len(self._RESOURCES)
91
104
 
92
- def _schema_validated_resource_signature(self, name):
105
+ def _schema_validated_resource_signature(self, name: str) -> ResourceSignature | None:
93
106
  resource_signature = self._RESOURCES[name]
94
107
  # We check only schema for non-subarchives, since the subarchives schema is checked,
95
108
  # when it is initialized.
@@ -103,7 +116,7 @@ class Archive:
103
116
  return None
104
117
  return resource_signature
105
118
 
106
- def _open_resource(self, name):
119
+ def _open_resource(self, name: str) -> Any:
107
120
  resource_signature = self._schema_validated_resource_signature(name)
108
121
  if resource_signature:
109
122
  resource = resource_signature.container.open(storage=self._resource_storage,
@@ -116,7 +129,7 @@ class Archive:
116
129
  return None
117
130
 
118
131
  @staticmethod
119
- def _check_non_subarchive_schema(name, resource_signature, storage):
132
+ def _check_non_subarchive_schema(name: str, resource_signature: ResourceSignature, storage: Any) -> None:
120
133
  actual_schema = bytes(storage).decode()
121
134
  if actual_schema != resource_signature.schema:
122
135
  raise SchemaMismatchError(
@@ -3,8 +3,10 @@
3
3
  See the LICENSE file in the root of this project for license details.
4
4
  '''
5
5
 
6
- from collections import namedtuple
6
+ from __future__ import annotations
7
+
7
8
  import os
9
+ from typing import Any, NamedTuple, Protocol, TYPE_CHECKING
8
10
 
9
11
  from .errors import IndexWriterError, MissingFieldError, UnknownFieldError, \
10
12
  UnknownStructureError, UnknownResourceError, ResourceAlreadySetError
@@ -12,10 +14,24 @@ from .errors import IndexWriterError, MissingFieldError, UnknownFieldError, \
12
14
  from .resources import Instance, Vector, Multivector, RawData
13
15
  from .data_access import write_value
14
16
 
17
+ if TYPE_CHECKING:
18
+ from .resource_storage import _Resource
19
+ from .structure import Structure
20
+
15
21
  _SCHEMA_EXT = ".schema"
16
22
 
17
- ResourceSignature = namedtuple("ResourceSignature",
18
- ["container", "initializer", "schema", "is_optional", "doc"])
23
+
24
+ class ResourceSignature(NamedTuple):
25
+ container: type
26
+ initializer: Any
27
+ schema: str
28
+ is_optional: bool
29
+ doc: str
30
+
31
+
32
+ class WriteStorage(Protocol):
33
+ def get(self, resource_name: str, is_subarchive: bool = False) -> Any: ...
34
+ def close(self) -> None: ...
19
35
 
20
36
 
21
37
  class IndexWriter:
@@ -23,7 +39,7 @@ class IndexWriter:
23
39
  IndexWriter class. Only applicable when multivector is present in archive schema.
24
40
  """
25
41
 
26
- def __init__(self, name, size, resource_storage):
42
+ def __init__(self, name: str, size: int, resource_storage: WriteStorage) -> None:
27
43
  """
28
44
  Create IndexWriter class.
29
45
 
@@ -36,9 +52,9 @@ class IndexWriter:
36
52
 
37
53
  self._name = name
38
54
  self._index_size = size
39
- self._fout = resource_storage.get(f'{self._name}_index', False)
55
+ self._fout: _Resource = resource_storage.get(f'{self._name}_index', False)
40
56
 
41
- def add(self, index):
57
+ def add(self, index: int) -> None:
42
58
  """
43
59
  Convert index(number) to bytearray and add to in memory store
44
60
  """
@@ -46,7 +62,7 @@ class IndexWriter:
46
62
  byteorder="little", signed=False)
47
63
  self._fout.write(index_bytes)
48
64
 
49
- def finish(self):
65
+ def finish(self) -> None:
50
66
  """
51
67
  Complete index resource by adding size and padding followed by writing to file
52
68
  """
@@ -60,30 +76,33 @@ class ArchiveBuilder:
60
76
  ArchiveBuilder class. Entry point to writing Flatdata.
61
77
  Provides methods to create flatdata archives.
62
78
  """
79
+ _NAME: str
80
+ _SCHEMA: str
81
+ _RESOURCES: dict[str, ResourceSignature]
63
82
 
64
- def __init__(self, resource_storage, path=""):
83
+ def __init__(self, resource_storage: WriteStorage, path: str = "") -> None:
65
84
  """
66
85
  Opens archive from a given resource writer.
67
86
  :param resource_storage: storage manager to store and write to disc
68
87
  :param path: file path where archive is created
69
88
  """
70
89
  self._path = os.path.join(path, self._NAME)
71
- self._resource_storage = resource_storage
90
+ self._resource_storage: WriteStorage = resource_storage
72
91
  self._write_archive_signature()
73
92
  self._write_archive_schema()
74
93
  self._resources_written = [f"{self._NAME}.archive"]
75
94
 
76
95
  @classmethod
77
- def name(cls):
96
+ def name(cls) -> str:
78
97
  '''Returns archive name'''
79
98
  return cls._NAME
80
99
 
81
100
  @classmethod
82
- def schema(cls):
101
+ def schema(cls) -> str:
83
102
  '''Returns archive schema'''
84
103
  return cls._SCHEMA
85
104
 
86
- def _write_raw_data(self, name, data):
105
+ def _write_raw_data(self, name: str, data: bytes | bytearray) -> None:
87
106
  '''
88
107
  Helper function to write data
89
108
 
@@ -94,7 +113,7 @@ class ArchiveBuilder:
94
113
  storage.write(data)
95
114
  storage.close()
96
115
 
97
- def _write_schema(self, name):
116
+ def _write_schema(self, name: str) -> None:
98
117
  '''
99
118
  Writes resource schema
100
119
 
@@ -103,29 +122,29 @@ class ArchiveBuilder:
103
122
  self._write_raw_data(f"{name}.schema", bytes(
104
123
  self._RESOURCES[name].schema, 'utf-8'))
105
124
 
106
- def _write_archive_signature(self):
125
+ def _write_archive_signature(self) -> None:
107
126
  '''Writes archive's signature'''
108
127
  self._write_raw_data(f"{self._NAME}.archive", b'\x00' * 16)
109
128
 
110
- def _write_archive_schema(self):
129
+ def _write_archive_schema(self) -> None:
111
130
  '''Writes archive schema'''
112
131
  self._write_raw_data(
113
132
  f"{self._NAME}.archive.schema", bytes(self._SCHEMA, 'utf-8'))
114
133
 
115
- def _write_index_schema(self, resource_name, schema):
134
+ def _write_index_schema(self, resource_name: str, schema: str) -> None:
116
135
  self._write_raw_data(
117
136
  f"{resource_name}_index.schema", bytes(schema, 'utf-8'))
118
137
 
119
- def subarchive(self, name):
138
+ def subarchive(self, name: str) -> 'ArchiveBuilder':
120
139
  """
121
140
  Returns an archive builder for the sub-archive `name`.
122
141
  :raises $name_not_subarchive_error
123
142
  :param name: name of the sub-archive
124
143
  """
125
- NotImplemented
144
+ raise NotImplementedError(f"subarchive '{name}' is not implemented")
126
145
 
127
146
  @classmethod
128
- def __validate_structure_fields(cls, name, struct, initializer):
147
+ def __validate_structure_fields(cls, name: str, struct: dict[str, Any], initializer: type[Structure]) -> None:
129
148
  '''
130
149
  Validates whether passed object has all required fields
131
150
 
@@ -142,7 +161,7 @@ class ArchiveBuilder:
142
161
  if key not in initializer._FIELD_KEYS:
143
162
  raise UnknownFieldError(key, name)
144
163
 
145
- def __set_instance(self, storage, name, value):
164
+ def __set_instance(self, storage: _Resource, name: str, value: dict[str, Any]) -> None:
146
165
  '''
147
166
  Creates and writes instance type resource
148
167
 
@@ -160,7 +179,7 @@ class ArchiveBuilder:
160
179
 
161
180
  storage.write(bout)
162
181
 
163
- def __set_vector(self, storage, name, vector):
182
+ def __set_vector(self, storage: _Resource, name: str, vector: list[dict[str, Any]]) -> None:
164
183
  '''
165
184
  Creates and writes vector resource
166
185
 
@@ -179,7 +198,7 @@ class ArchiveBuilder:
179
198
  field.is_signed, value[key])
180
199
  storage.write(bout)
181
200
 
182
- def __set_multivector(self, storage, name, value):
201
+ def __set_multivector(self, storage: _Resource, name: str, value: list[list[dict[str, Any]]]) -> None:
183
202
  '''
184
203
  Creates and writes multivector resource
185
204
 
@@ -193,10 +212,10 @@ class ArchiveBuilder:
193
212
  for index, obj_type in enumerate(initializer_list[1:]):
194
213
  initializers[obj_type._NAME] = (index, obj_type)
195
214
 
196
- def valid_structure_name(_obj):
215
+ def valid_structure_name(_obj: dict[str, Any]) -> bool:
197
216
  return _obj['name'] in [_initializer._NAME for _initializer in initializer_list[1:]]
198
217
 
199
- def validate_fields(_obj):
218
+ def validate_fields(_obj: dict[str, Any]) -> None:
200
219
  matched_obj_list = [
201
220
  _initializer for _initializer in initializer_list[1:] \
202
221
  if _initializer._NAME == _obj['name']]
@@ -248,7 +267,7 @@ class ArchiveBuilder:
248
267
  self._resources_written.append(name)
249
268
  self._resources_written.append(f'{name}_index')
250
269
 
251
- def set(self, name, value):
270
+ def set(self, name: str, value: Any) -> None:
252
271
  """
253
272
  Write a resource for this archive at once.
254
273
  Can only be done once. `set` and `start` can't be used for the same resource.
@@ -284,7 +303,7 @@ class ArchiveBuilder:
284
303
 
285
304
  self._resources_written.append(name)
286
305
 
287
- def finish(self):
306
+ def finish(self) -> None:
288
307
  """
289
308
  Closes the storage manager
290
309
  """
@@ -0,0 +1,177 @@
1
+ '''
2
+ Copyright (c) 2017 HERE Europe B.V.
3
+ See the LICENSE file in the root of this project for license details.
4
+ '''
5
+
6
+ from __future__ import annotations
7
+
8
+ import mmap
9
+ from collections.abc import Callable
10
+ from typing import Union
11
+
12
+ import numpy as np
13
+ from numpy.typing import NDArray
14
+
15
+ ReadableBuffer = Union[bytes, bytearray, memoryview, mmap.mmap]
16
+
17
# Lookup table of sign-bit masks used when reading values: entry ``n`` is the
# top bit of an ``n``-bit field (entry 0 is 0, entries 1..64 are 1 << (n - 1)).
_SIGN_BITS = [0] + [1 << (width - 1) for width in range(1, 65)]


def make_field_reader(offset_bits: int, num_bits: int, is_signed: bool) -> Callable[[ReadableBuffer, int], int]:
    """Create a reader closure specialized for one bit-packed structure field.

    The returned callable has the shape ``reader(data, pos)`` and decodes the
    field from ``data``, where ``pos`` is the byte position the field offset
    is measured from. Byte offset, shift, masks and sign constants are
    computed here once, so each call only slices, shifts and masks.

    :param offset_bits: bit offset of the field relative to ``pos``
    :param num_bits: width of the field in bits
    :param is_signed: whether the field is decoded as two's complement
        (not consulted for 1-bit fields, which always yield 0 or 1)
    :return: function mapping ``(data, pos)`` to the field value as ``int``
    """
    first_byte, bit_shift = divmod(offset_bits, 8)
    window_bytes = (num_bits + 7) // 8
    stop_byte = first_byte + window_bytes
    value_mask = (1 << num_bits) - 1
    # Bits left from the ``window_bytes`` window after the leading
    # ``bit_shift`` bits are dropped. When that is short of ``num_bits`` the
    # field spills into one more byte, merged in at exactly this bit position.
    window_bits = window_bytes * 8 - bit_shift
    spills_over = window_bits < num_bits

    # Single-bit fields: test one bit of one byte, no slicing needed.
    if num_bits == 1:
        flag = 1 << bit_shift

        def read_flag(data: ReadableBuffer, pos: int) -> int:
            return int((data[pos + first_byte] & flag) != 0)

        return read_flag

    if not is_signed:
        if spills_over:
            def read_unsigned(data: ReadableBuffer, pos: int) -> int:
                raw = int.from_bytes(
                    data[pos + first_byte: pos + stop_byte], byteorder="little")
                raw = (raw >> bit_shift) | (data[pos + stop_byte] << window_bits)
                return int(raw & value_mask)
        elif bit_shift:
            def read_unsigned(data: ReadableBuffer, pos: int) -> int:
                raw = int.from_bytes(
                    data[pos + first_byte: pos + stop_byte], byteorder="little")
                return (raw >> bit_shift) & value_mask
        else:
            def read_unsigned(data: ReadableBuffer, pos: int) -> int:
                raw = int.from_bytes(
                    data[pos + first_byte: pos + stop_byte], byteorder="little")
                return raw & value_mask
        return read_unsigned

    # Signed fields: extract the unsigned bit pattern, then subtract the
    # weight of the top bit to obtain the two's-complement value.
    top_bit = _SIGN_BITS[num_bits]
    low_bits = top_bit - 1
    if spills_over:
        def read_signed(data: ReadableBuffer, pos: int) -> int:
            raw = int.from_bytes(
                data[pos + first_byte: pos + stop_byte], byteorder="little")
            raw = (raw >> bit_shift) | (data[pos + stop_byte] << window_bits)
            raw &= value_mask
            return int((raw & low_bits) - (raw & top_bit))
    elif bit_shift:
        def read_signed(data: ReadableBuffer, pos: int) -> int:
            raw = int.from_bytes(
                data[pos + first_byte: pos + stop_byte], byteorder="little")
            raw = (raw >> bit_shift) & value_mask
            return (raw & low_bits) - (raw & top_bit)
    else:
        def read_signed(data: ReadableBuffer, pos: int) -> int:
            raw = int.from_bytes(
                data[pos + first_byte: pos + stop_byte], byteorder="little")
            raw &= value_mask
            return (raw & low_bits) - (raw & top_bit)
    return read_signed
86
+
87
+
88
def read_field_vectorized(raw_bytes_2d: NDArray[np.uint8], field_offset_bits: int, field_width_bits: int, is_signed: bool) -> NDArray[np.uint64] | NDArray[np.int64]:
    """Decode one bit-packed field for every element in a single pass.

    :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
    :param field_offset_bits: bit offset of the field within each element
    :param field_width_bits: width of the field in bits (max 64)
    :param is_signed: whether to sign-extend the result
    :return: numpy array with one field value per element; ``uint64`` for
        unsigned and 1-bit fields, ``int64`` for signed fields
    """
    first_byte, bit_shift = divmod(field_offset_bits, 8)

    # 1-bit fields only need a single byte column; signedness is not applied.
    if field_width_bits == 1:
        column = raw_bytes_2d[:, first_byte].astype(np.uint64)
        return (column >> np.uint64(bit_shift)) & np.uint64(1)

    span_bytes = (bit_shift + field_width_bits + 7) // 8
    # A uint64 accumulator holds at most eight byte columns.
    direct_bytes = min(span_bytes, 8)

    # Assemble the little-endian value column by column in uint64 arithmetic.
    values = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
    for index in range(direct_bytes):
        column = raw_bytes_2d[:, first_byte + index].astype(np.uint64)
        values |= column << np.uint64(index * 8)
    values >>= np.uint64(bit_shift)

    # An unaligned field can touch a ninth byte; its low bits supply the top
    # of the value and are merged in after the shift.
    collected_bits = 8 * direct_bytes - bit_shift
    if span_bytes > 8 and collected_bits < field_width_bits:
        ninth = raw_bytes_2d[:, first_byte + 8].astype(np.uint64)
        values |= ninth << np.uint64(collected_bits)

    if field_width_bits < 64:
        values &= np.uint64((1 << field_width_bits) - 1)

    if not is_signed:
        return values
    if field_width_bits == 64:
        # Full-width two's complement: reinterpret the same bits as int64.
        return values.view(np.int64)

    # Narrower signed fields: where the top bit is set, subtract 2**width.
    sign_bit = np.uint64(1 << (field_width_bits - 1))
    as_signed = values.astype(np.int64)
    wrapped = as_signed + np.int64(-(1 << field_width_bits))
    return np.where(values & sign_bit, wrapped, as_signed)
132
+
133
+
134
def read_value(data: ReadableBuffer, offset_bits: int, num_bits: int, is_signed: bool) -> int:
    """Decode one bit-packed value located ``offset_bits`` bits into ``data``.

    Builds a throwaway reader with :func:`make_field_reader` and applies it at
    byte position 0, which is convenient for a single read. When the same
    field is read many times, build the reader once with
    ``make_field_reader`` and reuse it instead.
    """
    return make_field_reader(offset_bits, num_bits, is_signed)(data, 0)
143
+
144
+
145
def write_value(data: bytearray, offset_bits: int, num_bits: int, is_signed: bool, value: int) -> None:
    """Write ``value`` into ``data`` as a bit-packed little-endian field.

    Only the ``num_bits`` bits starting at bit ``offset_bits`` are modified.
    Every other bit of the touched bytes keeps its previous content, so
    fields sharing a byte can be written in any order and an existing field
    can be overwritten.

    :param data: mutable buffer that is modified in place
    :param offset_bits: bit offset of the field from the start of ``data``
    :param num_bits: width of the field in bits (at most 64)
    :param is_signed: kept for interface compatibility; the stored bit
        pattern is ``value`` truncated to ``num_bits`` bits either way, which
        is the two's-complement encoding for negative values
    :param value: integer to store
    :raises IndexError: if a multi-bit field does not lie entirely inside
        ``data`` (nothing is written in that case)
    """
    assert num_bits <= 64, 'Number of bits to write is greater than 64'

    offset_bytes, offset_extra_bits = divmod(offset_bits, 8)

    if num_bits == 1:
        # Only the exact value 1 sets the bit; anything else clears it.
        if value == 1:
            data[offset_bytes] |= 1 << offset_extra_bits
        else:
            data[offset_bytes] &= ~(1 << offset_extra_bits)
        return

    # Exact number of bytes the field touches, including a spill byte for
    # unaligned fields.
    span_bytes = (offset_extra_bits + num_bits + 7) // 8
    end_byte = offset_bytes + span_bytes
    # Checked explicitly: slicing would silently truncate, and slice
    # assignment could otherwise resize a bytearray.
    if offset_bytes < 0 or end_byte > len(data):
        raise IndexError("bit field lies outside of the data buffer")

    # Read-modify-write on the whole span: clear the field bits, then merge
    # in the new value aligned to its in-byte offset.
    field_mask = ((1 << num_bits) - 1) << offset_extra_bits
    current = int.from_bytes(data[offset_bytes:end_byte], byteorder="little")
    updated = (current & ~field_mask) | ((value << offset_extra_bits) & field_mask)
    data[offset_bytes:end_byte] = updated.to_bytes(span_bytes, byteorder="little")