flatdata-py 0.4.10__tar.gz → 0.4.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/PKG-INFO +33 -2
  2. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/README.md +31 -0
  3. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/archive.py +2 -2
  4. flatdata_py-0.4.11/flatdata/lib/data_access.py +168 -0
  5. flatdata_py-0.4.11/flatdata/lib/data_access.py.orig +204 -0
  6. flatdata_py-0.4.11/flatdata/lib/data_access_BACKUP_91129.py +219 -0
  7. flatdata_py-0.4.11/flatdata/lib/data_access_LOCAL_91129.py +112 -0
  8. flatdata_py-0.4.11/flatdata/lib/data_access_REMOTE_91129.py +168 -0
  9. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/resources.py +60 -14
  10. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/structure.py +21 -10
  11. flatdata_py-0.4.11/flatdata/lib/structure.py.orig +92 -0
  12. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/pyproject.toml +2 -2
  13. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/.gitignore +0 -0
  14. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/__init__.py +0 -0
  15. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/archive_builder.py +0 -0
  16. /flatdata_py-0.4.10/flatdata/lib/data_access.py → /flatdata_py-0.4.11/flatdata/lib/data_access_BASE_91129.py +0 -0
  17. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/errors.py +0 -0
  18. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/file_resource_storage.py +0 -0
  19. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/file_resource_writer.py +0 -0
  20. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/flatdata_writer.py +0 -0
  21. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/inspector.py +0 -0
  22. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/resource_storage.py +0 -0
  23. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/tar_archive_resource_storage.py +0 -0
  24. {flatdata_py-0.4.10 → flatdata_py-0.4.11}/flatdata/lib/writer.py +0 -0
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: flatdata-py
3
- Version: 0.4.10
3
+ Version: 0.4.11
4
4
  Summary: Python 3 implementation of Flatdata
5
5
  Project-URL: Homepage, https://github.com/heremaps/flatdata
6
6
  Author: Flatdata Developers
7
7
  Classifier: License :: OSI Approved :: Apache Software License
8
8
  Classifier: Operating System :: OS Independent
9
9
  Classifier: Programming Language :: Python :: 3
10
- Requires-Dist: flatdata-generator==0.4.10
10
+ Requires-Dist: flatdata-generator==0.4.11
11
11
  Requires-Dist: numpy
12
12
  Requires-Dist: pandas
13
13
  Provides-Extra: inspector
@@ -34,6 +34,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema),
34
34
  flatdata-generator --gen py --schema locations.flatdata --output-file locations.py
35
35
  ```
36
36
 
37
+ ## Performance tips
38
+
39
+ `flatdata-py` supports two data access patterns with very different performance characteristics on large archives.
40
+
41
+ Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations:
42
+
43
+ ```python
44
+ count = sum(1 for x in archive.links if x.speed_limit > 100)
45
+ ```
46
+
47
+ For bulk operations, use the vectorized access methods that read fields directly into NumPy arrays:
48
+
49
+ ```python
50
+ # single column access, returns a pandas DataFrame
51
+ df = archive.links.speed_limit
52
+ count = len(df[df['speed_limit'] > 100])
53
+
54
+ # full NumPy structured array with all fields (assumes `import numpy as np`)
55
+ arr = archive.links.to_numpy()
56
+ count = int(np.sum(arr['speed_limit'] > 100))
57
+
58
+ # slices work too
59
+ arr = archive.links[1000:2000].to_numpy()
60
+ df = archive.links[::10].to_data_frame()
61
+ ```
62
+
63
+ * Use `vector.field_name` (column access) when you only need one or a few fields.
64
+ * Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once.
65
+ * Use `vector[i].field` for random access to individual elements.
66
+ * The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM.
67
+
37
68
  ## Using the inspector
38
69
 
39
70
  `flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive:
@@ -18,6 +18,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema),
18
18
  flatdata-generator --gen py --schema locations.flatdata --output-file locations.py
19
19
  ```
20
20
 
21
+ ## Performance tips
22
+
23
+ `flatdata-py` supports two data access patterns with very different performance characteristics on large archives.
24
+
25
+ Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations:
26
+
27
+ ```python
28
+ count = sum(1 for x in archive.links if x.speed_limit > 100)
29
+ ```
30
+
31
+ For bulk operations, use the vectorized access methods that read fields directly into NumPy arrays:
32
+
33
+ ```python
34
+ # single column access, returns a pandas DataFrame
35
+ df = archive.links.speed_limit
36
+ count = len(df[df['speed_limit'] > 100])
37
+
38
+ # full NumPy structured array with all fields (assumes `import numpy as np`)
39
+ arr = archive.links.to_numpy()
40
+ count = int(np.sum(arr['speed_limit'] > 100))
41
+
42
+ # slices work too
43
+ arr = archive.links[1000:2000].to_numpy()
44
+ df = archive.links[::10].to_data_frame()
45
+ ```
46
+
47
+ * Use `vector.field_name` (column access) when you only need one or a few fields.
48
+ * Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once.
49
+ * Use `vector[i].field` for random access to individual elements.
50
+ * The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM.
51
+
21
52
  ## Using the inspector
22
53
 
23
54
  `flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive:
@@ -39,9 +39,9 @@ class Archive:
39
39
  self.__getattr__(name)
40
40
 
41
41
  def __getattr__(self, name):
42
- if name not in list(self._RESOURCES.keys()):
42
+ if name not in self._RESOURCES:
43
43
  raise AttributeError("Resource %s not defined in archive." % name)
44
- if name not in list(self._loaded_resources.keys()):
44
+ if name not in self._loaded_resources:
45
45
  self._loaded_resources[name] = self._open_resource(name)
46
46
  return self._loaded_resources[name]
47
47
 
@@ -0,0 +1,168 @@
1
+ '''
2
+ Copyright (c) 2017 HERE Europe B.V.
3
+ See the LICENSE file in the root of this project for license details.
4
+ '''
5
+
6
+ import numpy as np
7
+
8
+ # Sign bits cache for the value reading.
9
+ _SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]
10
+
11
+
12
+ def make_field_reader(offset_bits, num_bits, is_signed):
13
+ """Build a specialized closure for reading a single field from a structure.
14
+
15
+ Returns a function reader(data, pos_bytes) that reads the field value
16
+ from ``data`` at byte position ``pos_bytes``. All constants (byte offset,
17
+ bit shift, mask, sign handling) are pre-computed and captured by the
18
+ closure so the hot path does minimal work.
19
+ """
20
+ offset_bytes, offset_extra = divmod(offset_bits, 8)
21
+ total_bytes = (num_bits + 7) // 8
22
+ end_byte = offset_bytes + total_bytes
23
+ mask = (1 << num_bits) - 1
24
+ needs_extra = (total_bytes * 8 - offset_extra) < num_bits
25
+ extra_shift = total_bytes * 8 - offset_extra
26
+
27
+ if num_bits == 1:
28
+ bit_mask = 1 << offset_extra
29
+ def reader(data, pos):
30
+ return int((data[pos + offset_bytes] & bit_mask) != 0)
31
+ return reader
32
+
33
+ if is_signed:
34
+ sign_bit = _SIGN_BITS[num_bits]
35
+ sign_mask = sign_bit - 1
36
+ if needs_extra:
37
+ def reader(data, pos):
38
+ result = int.from_bytes(
39
+ data[pos + offset_bytes: pos + end_byte], byteorder="little")
40
+ result >>= offset_extra
41
+ result |= data[pos + end_byte] << extra_shift
42
+ result &= mask
43
+ return (result & sign_mask) - (result & sign_bit)
44
+ elif offset_extra:
45
+ def reader(data, pos):
46
+ result = (int.from_bytes(
47
+ data[pos + offset_bytes: pos + end_byte],
48
+ byteorder="little") >> offset_extra) & mask
49
+ return (result & sign_mask) - (result & sign_bit)
50
+ else:
51
+ def reader(data, pos):
52
+ result = int.from_bytes(
53
+ data[pos + offset_bytes: pos + end_byte],
54
+ byteorder="little") & mask
55
+ return (result & sign_mask) - (result & sign_bit)
56
+ return reader
57
+
58
+ # Unsigned paths
59
+ if needs_extra:
60
+ def reader(data, pos):
61
+ result = int.from_bytes(
62
+ data[pos + offset_bytes: pos + end_byte], byteorder="little")
63
+ result >>= offset_extra
64
+ result |= data[pos + end_byte] << extra_shift
65
+ return result & mask
66
+ elif offset_extra:
67
+ def reader(data, pos):
68
+ return (int.from_bytes(
69
+ data[pos + offset_bytes: pos + end_byte],
70
+ byteorder="little") >> offset_extra) & mask
71
+ else:
72
+ def reader(data, pos):
73
+ return int.from_bytes(
74
+ data[pos + offset_bytes: pos + end_byte],
75
+ byteorder="little") & mask
76
+ return reader
77
+
78
+
79
+ def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
80
+ """Read a bit-packed field from all elements at once, returning a numpy array.
81
+
82
+ :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
83
+ :param field_offset_bits: bit offset of the field within each element
84
+ :param field_width_bits: width of the field in bits (max 64)
85
+ :param is_signed: whether to sign-extend the result
86
+ :return: numpy array of field values
87
+ """
88
+ if field_width_bits == 1:
89
+ byte_idx = field_offset_bits // 8
90
+ bit_idx = field_offset_bits % 8
91
+ return ((raw_bytes_2d[:, byte_idx].astype(np.uint64) >> np.uint64(bit_idx)) &
92
+ np.uint64(1))
93
+
94
+ byte_start = field_offset_bits // 8
95
+ bit_shift = field_offset_bits % 8
96
+ bytes_needed = (bit_shift + field_width_bits + 7) // 8
97
+
98
+ # Accumulate up to 8 bytes (little-endian) into a uint64 per element,
99
+ # then shift out the low bits that precede the field.
100
+ result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
101
+ for b in range(min(bytes_needed, 8)):
102
+ result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
103
+ result >>= np.uint64(bit_shift)
104
+
105
+ # If the field spans more than 8 bytes (unaligned 64-bit field), merge the extra byte.
106
+ bits_so_far = 8 * min(bytes_needed, 8) - bit_shift
107
+ if bits_so_far < field_width_bits and bytes_needed > 8:
108
+ extra = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
109
+ result |= extra << np.uint64(bits_so_far)
110
+
111
+ if field_width_bits < 64:
112
+ result &= np.uint64((1 << field_width_bits) - 1)
113
+
114
+ if is_signed:
115
+ if field_width_bits == 64:
116
+ return result.view(np.int64)
117
+ sign_bit = np.uint64(1 << (field_width_bits - 1))
118
+ offset = -(1 << field_width_bits)
119
+ signed = result.astype(np.int64) + np.int64(offset)
120
+ result = np.where(result & sign_bit, signed, result.astype(np.int64))
121
+
122
+ return result
123
+
124
+
125
+ def read_value(data, offset_bits, num_bits, is_signed):
126
+ """Read a bit-packed value from data at the given bit offset.
127
+
128
+ This is a convenience wrapper around :func:`make_field_reader` for one-off
129
+ reads. For repeated reads of the same field, prefer building a reader once
130
+ with ``make_field_reader`` and reusing it.
131
+ """
132
+ reader = make_field_reader(offset_bits, num_bits, is_signed)
133
+ return reader(data, 0)
134
+
135
+
136
+ def write_value(data, offset_bits, num_bits, is_signed, value):
137
+ assert num_bits <= 64, f'Number of bits to write is greater than 64'
138
+
139
+ offset_bytes, offset_extra_bits = divmod(offset_bits, 8)
140
+ total_bytes = (num_bits + 7) // 8
141
+
142
+ if num_bits == 1:
143
+ if value == 1:
144
+ data[offset_bytes] |= 1 << offset_extra_bits
145
+ else:
146
+ data[offset_bytes] &= ~(1 << offset_extra_bits)
147
+ return
148
+
149
+ mask = (1 << num_bits) - 1
150
+ value <<= offset_extra_bits
151
+ value &= mask << offset_extra_bits
152
+ value_in_little_endian = value.to_bytes(total_bytes + 1, byteorder="little", signed=is_signed)
153
+ surrounding_bits = data[offset_bytes] & ((1 << offset_bits) - 1)
154
+
155
+ byte_idx = 0
156
+ data[offset_bytes] = value_in_little_endian[byte_idx]
157
+ data[offset_bytes] |= surrounding_bits
158
+
159
+ byte_idx += 1
160
+ while byte_idx < total_bytes:
161
+ data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx]
162
+ byte_idx += 1
163
+
164
+ bits_written = total_bytes * 8 - offset_extra_bits
165
+ if bits_written < num_bits:
166
+ surrounding_bits = data[offset_bytes + byte_idx] & ~((1 << offset_bits) - 1)
167
+ data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx] & ((1 << (8 - (bits_written % 8))) - 1)
168
+ data[offset_bytes + byte_idx] |= surrounding_bits
@@ -0,0 +1,204 @@
1
+ '''
2
+ Copyright (c) 2017 HERE Europe B.V.
3
+ See the LICENSE file in the root of this project for license details.
4
+ '''
5
+
6
+ # Sign bits cache for the value reading.
7
+ _SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]
8
+
9
+
10
+ def make_field_reader(offset_bits, num_bits, is_signed):
11
+ """Build a specialized closure for reading a single field from a structure.
12
+
13
+ Returns a function reader(data, pos_bytes) that reads the field value
14
+ from ``data`` at byte position ``pos_bytes``. All constants (byte offset,
15
+ bit shift, mask, sign handling) are pre-computed and captured by the
16
+ closure so the hot path does minimal work.
17
+ """
18
+ offset_bytes, offset_extra = divmod(offset_bits, 8)
19
+ total_bytes = (num_bits + 7) // 8
20
+ <<<<<<< HEAD
21
+
22
+ if num_bits == 1:
23
+ return int((data[offset_bytes] & (1 << offset_extra_bits)) != 0)
24
+
25
+ result = int.from_bytes(data[offset_bytes: offset_bytes + total_bytes], byteorder="little")
26
+ result >>= offset_extra_bits
27
+ if (total_bytes * 8 - offset_extra_bits) < num_bits:
28
+ remainder = data[offset_bytes + total_bytes]
29
+ result |= remainder << (total_bytes * 8 - offset_extra_bits)
30
+
31
+ if num_bits < 64:
32
+ result = result & ((1 << num_bits) - 1)
33
+
34
+ if not is_signed:
35
+ return result
36
+
37
+ return (result & (_SIGN_BITS[num_bits] - 1)) - (result & _SIGN_BITS[num_bits])
38
+
39
+
40
+ def write_value(data, offset_bits, num_bits, is_signed, value):
41
+ assert num_bits <= 64, f'Number of bits to write is greater than 64'
42
+
43
+ offset_bytes, offset_extra_bits = divmod(offset_bits, 8)
44
+ total_bytes = (num_bits + 7) // 8
45
+
46
+ if num_bits == 1:
47
+ if value == 1:
48
+ data[offset_bytes] |= 1 << offset_extra_bits
49
+ else:
50
+ data[offset_bytes] &= ~(1 << offset_extra_bits)
51
+ return
52
+
53
+ =======
54
+ end_byte = offset_bytes + total_bytes
55
+ >>>>>>> e486615 (Also improve scalar readers by caching)
56
+ mask = (1 << num_bits) - 1
57
+ needs_extra = (total_bytes * 8 - offset_extra) < num_bits
58
+ extra_shift = total_bytes * 8 - offset_extra
59
+
60
+ if num_bits == 1:
61
+ bit_mask = 1 << offset_extra
62
+ def reader(data, pos):
63
+ return int((data[pos + offset_bytes] & bit_mask) != 0)
64
+ return reader
65
+
66
+ if is_signed:
67
+ sign_bit = _SIGN_BITS[num_bits]
68
+ sign_mask = sign_bit - 1
69
+ if needs_extra:
70
+ def reader(data, pos):
71
+ result = int.from_bytes(
72
+ data[pos + offset_bytes: pos + end_byte], byteorder="little")
73
+ result >>= offset_extra
74
+ result |= data[pos + end_byte] << extra_shift
75
+ result &= mask
76
+ return (result & sign_mask) - (result & sign_bit)
77
+ elif offset_extra:
78
+ def reader(data, pos):
79
+ result = (int.from_bytes(
80
+ data[pos + offset_bytes: pos + end_byte],
81
+ byteorder="little") >> offset_extra) & mask
82
+ return (result & sign_mask) - (result & sign_bit)
83
+ else:
84
+ def reader(data, pos):
85
+ result = int.from_bytes(
86
+ data[pos + offset_bytes: pos + end_byte],
87
+ byteorder="little") & mask
88
+ return (result & sign_mask) - (result & sign_bit)
89
+ return reader
90
+
91
+ <<<<<<< HEAD
92
+ =======
93
+ # Unsigned paths
94
+ if needs_extra:
95
+ def reader(data, pos):
96
+ result = int.from_bytes(
97
+ data[pos + offset_bytes: pos + end_byte], byteorder="little")
98
+ result >>= offset_extra
99
+ result |= data[pos + end_byte] << extra_shift
100
+ return result & mask
101
+ elif offset_extra:
102
+ def reader(data, pos):
103
+ return (int.from_bytes(
104
+ data[pos + offset_bytes: pos + end_byte],
105
+ byteorder="little") >> offset_extra) & mask
106
+ else:
107
+ def reader(data, pos):
108
+ return int.from_bytes(
109
+ data[pos + offset_bytes: pos + end_byte],
110
+ byteorder="little") & mask
111
+ return reader
112
+
113
+
114
+ def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
115
+ """Read a bit-packed field from all elements at once, returning a numpy array.
116
+
117
+ :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
118
+ :param field_offset_bits: bit offset of the field within each element
119
+ :param field_width_bits: width of the field in bits (max 64)
120
+ :param is_signed: whether to sign-extend the result
121
+ :return: numpy array of field values
122
+ """
123
+ if field_width_bits == 1:
124
+ byte_idx = field_offset_bits // 8
125
+ bit_idx = field_offset_bits % 8
126
+ return ((raw_bytes_2d[:, byte_idx].astype(np.uint64) >> np.uint64(bit_idx)) &
127
+ np.uint64(1))
128
+
129
+ byte_start = field_offset_bits // 8
130
+ bit_shift = field_offset_bits % 8
131
+ bytes_needed = (bit_shift + field_width_bits + 7) // 8
132
+
133
+ # Accumulate up to 8 bytes (little-endian) into a uint64 per element,
134
+ # then shift out the low bits that precede the field.
135
+ result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
136
+ for b in range(min(bytes_needed, 8)):
137
+ result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
138
+ result >>= np.uint64(bit_shift)
139
+
140
+ # If the field spans more than 8 bytes (unaligned 64-bit field), merge the extra byte.
141
+ bits_so_far = 8 * min(bytes_needed, 8) - bit_shift
142
+ if bits_so_far < field_width_bits and bytes_needed > 8:
143
+ extra = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
144
+ result |= extra << np.uint64(bits_so_far)
145
+
146
+ if field_width_bits < 64:
147
+ result &= np.uint64((1 << field_width_bits) - 1)
148
+
149
+ if is_signed:
150
+ if field_width_bits == 64:
151
+ return result.view(np.int64)
152
+ sign_bit = np.uint64(1 << (field_width_bits - 1))
153
+ offset = -(1 << field_width_bits)
154
+ signed = result.astype(np.int64) + np.int64(offset)
155
+ result = np.where(result & sign_bit, signed, result.astype(np.int64))
156
+
157
+ return result
158
+
159
+
160
+ def read_value(data, offset_bits, num_bits, is_signed):
161
+ """Read a bit-packed value from data at the given bit offset.
162
+
163
+ This is a convenience wrapper around :func:`make_field_reader` for one-off
164
+ reads. For repeated reads of the same field, prefer building a reader once
165
+ with ``make_field_reader`` and reusing it.
166
+ """
167
+ reader = make_field_reader(offset_bits, num_bits, is_signed)
168
+ return reader(data, 0)
169
+
170
+
171
+ def write_value(data, offset_bits, num_bits, is_signed, value):
172
+ assert num_bits <= 64, f'Number of bits to write is greater than 64'
173
+
174
+ offset_bytes, offset_extra_bits = divmod(offset_bits, 8)
175
+ total_bytes = (num_bits + 7) // 8
176
+
177
+ if num_bits == 1:
178
+ if value == 1:
179
+ data[offset_bytes] |= 1 << offset_extra_bits
180
+ else:
181
+ data[offset_bytes] &= ~(1 << offset_extra_bits)
182
+ return
183
+
184
+ mask = (1 << num_bits) - 1
185
+ value <<= offset_extra_bits
186
+ value &= mask << offset_extra_bits
187
+ value_in_little_endian = value.to_bytes(total_bytes + 1, byteorder="little", signed=is_signed)
188
+ surrounding_bits = data[offset_bytes] & ((1 << offset_bits) - 1)
189
+
190
+ byte_idx = 0
191
+ data[offset_bytes] = value_in_little_endian[byte_idx]
192
+ data[offset_bytes] |= surrounding_bits
193
+
194
+ byte_idx += 1
195
+ while byte_idx < total_bytes:
196
+ data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx]
197
+ byte_idx += 1
198
+
199
+ >>>>>>> e486615 (Also improve scalar readers by caching)
200
+ bits_written = total_bytes * 8 - offset_extra_bits
201
+ if bits_written < num_bits:
202
+ surrounding_bits = data[offset_bytes + byte_idx] & ~((1 << offset_bits) - 1)
203
+ data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx] & ((1 << (8 - (bits_written % 8))) - 1)
204
+ data[offset_bytes + byte_idx] |= surrounding_bits
@@ -0,0 +1,219 @@
1
+ '''
2
+ Copyright (c) 2017 HERE Europe B.V.
3
+ See the LICENSE file in the root of this project for license details.
4
+ '''
5
+
6
+ import numpy as np
7
+
8
+ # Sign bits cache for the value reading.
9
+ _SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]
10
+
11
+
12
+ def make_field_reader(offset_bits, num_bits, is_signed):
13
+ """Build a specialized closure for reading a single field from a structure.
14
+
15
+ Returns a function reader(data, pos_bytes) that reads the field value
16
+ from ``data`` at byte position ``pos_bytes``. All constants (byte offset,
17
+ bit shift, mask, sign handling) are pre-computed and captured by the
18
+ closure so the hot path does minimal work.
19
+ """
20
+ offset_bytes, offset_extra = divmod(offset_bits, 8)
21
+ total_bytes = (num_bits + 7) // 8
22
+ end_byte = offset_bytes + total_bytes
23
+ mask = (1 << num_bits) - 1
24
+ needs_extra = (total_bytes * 8 - offset_extra) < num_bits
25
+ extra_shift = total_bytes * 8 - offset_extra
26
+
27
+ if num_bits == 1:
28
+ bit_mask = 1 << offset_extra
29
+ def reader(data, pos):
30
+ return int((data[pos + offset_bytes] & bit_mask) != 0)
31
+ return reader
32
+
33
+ if is_signed:
34
+ sign_bit = _SIGN_BITS[num_bits]
35
+ sign_mask = sign_bit - 1
36
+ if needs_extra:
37
+ def reader(data, pos):
38
+ result = int.from_bytes(
39
+ data[pos + offset_bytes: pos + end_byte], byteorder="little")
40
+ result >>= offset_extra
41
+ result |= data[pos + end_byte] << extra_shift
42
+ result &= mask
43
+ return (result & sign_mask) - (result & sign_bit)
44
+ elif offset_extra:
45
+ def reader(data, pos):
46
+ result = (int.from_bytes(
47
+ data[pos + offset_bytes: pos + end_byte],
48
+ byteorder="little") >> offset_extra) & mask
49
+ return (result & sign_mask) - (result & sign_bit)
50
+ else:
51
+ def reader(data, pos):
52
+ result = int.from_bytes(
53
+ data[pos + offset_bytes: pos + end_byte],
54
+ byteorder="little") & mask
55
+ return (result & sign_mask) - (result & sign_bit)
56
+ return reader
57
+
58
+ <<<<<<< HEAD
59
+ if num_bits < 64 or offset_extra_bits > 0:
60
+ result = result & ((1 << num_bits) - 1)
61
+ =======
62
+ # Unsigned paths
63
+ if needs_extra:
64
+ def reader(data, pos):
65
+ result = int.from_bytes(
66
+ data[pos + offset_bytes: pos + end_byte], byteorder="little")
67
+ result >>= offset_extra
68
+ result |= data[pos + end_byte] << extra_shift
69
+ return result & mask
70
+ elif offset_extra:
71
+ def reader(data, pos):
72
+ return (int.from_bytes(
73
+ data[pos + offset_bytes: pos + end_byte],
74
+ byteorder="little") >> offset_extra) & mask
75
+ else:
76
+ def reader(data, pos):
77
+ return int.from_bytes(
78
+ data[pos + offset_bytes: pos + end_byte],
79
+ byteorder="little") & mask
80
+ return reader
81
+ >>>>>>> c635308 (Also improve scalar readers by caching)
82
+
83
+
84
+ def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
85
+ """Read a bit-packed field from all elements at once, returning a numpy array.
86
+
87
+ :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
88
+ :param field_offset_bits: bit offset of the field within each element
89
+ :param field_width_bits: width of the field in bits (max 64)
90
+ :param is_signed: whether to sign-extend the result
91
+ :return: numpy array of field values
92
+ """
93
+ if field_width_bits == 1:
94
+ byte_idx = field_offset_bits // 8
95
+ bit_idx = field_offset_bits % 8
96
+ return ((raw_bytes_2d[:, byte_idx].astype(np.uint64) >> np.uint64(bit_idx)) &
97
+ np.uint64(1))
98
+
99
+ byte_start = field_offset_bits // 8
100
+ bit_shift = field_offset_bits % 8
101
+ bytes_needed = (bit_shift + field_width_bits + 7) // 8
102
+
103
+ # Accumulate up to 8 bytes (little-endian) into a uint64 per element,
104
+ # then shift out the low bits that precede the field.
105
+ result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
106
+ for b in range(min(bytes_needed, 8)):
107
+ result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
108
+ result >>= np.uint64(bit_shift)
109
+
110
+ # If the field spans more than 8 bytes (unaligned 64-bit field), merge the extra byte.
111
+ bits_so_far = 8 * min(bytes_needed, 8) - bit_shift
112
+ if bits_so_far < field_width_bits and bytes_needed > 8:
113
+ extra = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
114
+ result |= extra << np.uint64(bits_so_far)
115
+
116
+ if field_width_bits < 64:
117
+ result &= np.uint64((1 << field_width_bits) - 1)
118
+
119
+ if is_signed:
120
+ if field_width_bits == 64:
121
+ return result.view(np.int64)
122
+ sign_bit = np.uint64(1 << (field_width_bits - 1))
123
+ offset = -(1 << field_width_bits)
124
+ signed = result.astype(np.int64) + np.int64(offset)
125
+ result = np.where(result & sign_bit, signed, result.astype(np.int64))
126
+
127
+ return result
128
+
129
+
130
+ def read_value(data, offset_bits, num_bits, is_signed):
131
+ """Read a bit-packed value from data at the given bit offset.
132
+
133
+ This is a convenience wrapper around :func:`make_field_reader` for one-off
134
+ reads. For repeated reads of the same field, prefer building a reader once
135
+ with ``make_field_reader`` and reusing it.
136
+ """
137
+ reader = make_field_reader(offset_bits, num_bits, is_signed)
138
+ return reader(data, 0)
139
+
140
+
141
+ def write_value(data, offset_bits, num_bits, is_signed, value):
142
+ assert num_bits <= 64, f'Number of bits to write is greater than 64'
143
+
144
+ offset_bytes, offset_extra_bits = divmod(offset_bits, 8)
145
+ total_bytes = (num_bits + 7) // 8
146
+
147
+ if num_bits == 1:
148
+ if value == 1:
149
+ data[offset_bytes] |= 1 << offset_extra_bits
150
+ else:
151
+ data[offset_bytes] &= ~(1 << offset_extra_bits)
152
+ return
153
+
154
+ mask = (1 << num_bits) - 1
155
+ value <<= offset_extra_bits
156
+ value &= mask << offset_extra_bits
157
+ value_in_little_endian = value.to_bytes(total_bytes + 1, byteorder="little", signed=is_signed)
158
+ surrounding_bits = data[offset_bytes] & ((1 << offset_bits) - 1)
159
+
160
+ byte_idx = 0
161
+ data[offset_bytes] = value_in_little_endian[byte_idx]
162
+ data[offset_bytes] |= surrounding_bits
163
+
164
+ byte_idx += 1
165
+ while byte_idx < total_bytes:
166
+ data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx]
167
+ byte_idx += 1
168
+
169
+ bits_written = total_bytes * 8 - offset_extra_bits
170
+ if bits_written < num_bits:
171
+ surrounding_bits = data[offset_bytes + byte_idx] & ~((1 << offset_bits) - 1)
172
+ data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx] & ((1 << (8 - (bits_written % 8))) - 1)
173
+ data[offset_bytes + byte_idx] |= surrounding_bits
174
+
175
+
176
+ def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
177
+ """Read a bit-packed field from all elements at once, returning a numpy array.
178
+
179
+ :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
180
+ :param field_offset_bits: bit offset of the field within each element
181
+ :param field_width_bits: width of the field in bits (max 64)
182
+ :param is_signed: whether to sign-extend the result
183
+ :return: numpy array of field values
184
+ """
185
+ if field_width_bits == 1:
186
+ byte_idx = field_offset_bits // 8
187
+ bit_idx = field_offset_bits % 8
188
+ return ((raw_bytes_2d[:, byte_idx].astype(np.uint64) >> np.uint64(bit_idx)) &
189
+ np.uint64(1))
190
+
191
+ byte_start = field_offset_bits // 8
192
+ bit_shift = field_offset_bits % 8
193
+ bytes_needed = (bit_shift + field_width_bits + 7) // 8
194
+
195
+ # Accumulate up to 8 bytes (little-endian) into a uint64 per element,
196
+ # then shift out the low bits that precede the field.
197
+ result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
198
+ for b in range(min(bytes_needed, 8)):
199
+ result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
200
+ result >>= np.uint64(bit_shift)
201
+
202
+ # If the field spans more than 8 bytes (unaligned 64-bit field), merge the extra byte.
203
+ bits_so_far = 8 * min(bytes_needed, 8) - bit_shift
204
+ if bits_so_far < field_width_bits and bytes_needed > 8:
205
+ extra = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
206
+ result |= extra << np.uint64(bits_so_far)
207
+
208
+ if field_width_bits < 64:
209
+ result &= np.uint64((1 << field_width_bits) - 1)
210
+
211
+ if is_signed:
212
+ if field_width_bits == 64:
213
+ return result.view(np.int64)
214
+ sign_bit = np.uint64(1 << (field_width_bits - 1))
215
+ offset = -(1 << field_width_bits)
216
+ signed = result.astype(np.int64) + np.int64(offset)
217
+ result = np.where(result & sign_bit, signed, result.astype(np.int64))
218
+
219
+ return result
@@ -0,0 +1,112 @@
1
+ '''
2
+ Copyright (c) 2017 HERE Europe B.V.
3
+ See the LICENSE file in the root of this project for license details.
4
+ '''
5
+
6
+ import numpy as np
7
+
8
+ # Sign bits cache for the value reading.
9
+ _SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]
10
+
11
+
12
+ def read_value(data, offset_bits, num_bits, is_signed):
13
+ offset_bytes, offset_extra_bits = divmod(offset_bits, 8)
14
+ total_bytes = (num_bits + 7) // 8
15
+
16
+ if num_bits == 1:
17
+ return int((data[offset_bytes] & (1 << offset_extra_bits)) != 0)
18
+
19
+ result = int.from_bytes(data[offset_bytes: offset_bytes + total_bytes], byteorder="little")
20
+ result >>= offset_extra_bits
21
+ if (total_bytes * 8 - offset_extra_bits) < num_bits:
22
+ remainder = data[offset_bytes + total_bytes]
23
+ result |= remainder << (total_bytes * 8 - offset_extra_bits)
24
+
25
+ if num_bits < 64 or offset_extra_bits > 0:
26
+ result = result & ((1 << num_bits) - 1)
27
+
28
+ if not is_signed:
29
+ return result
30
+
31
+ return (result & (_SIGN_BITS[num_bits] - 1)) - (result & _SIGN_BITS[num_bits])
32
+
33
+
34
+ def write_value(data, offset_bits, num_bits, is_signed, value):
35
+ assert num_bits <= 64, f'Number of bits to write is greater than 64'
36
+
37
+ offset_bytes, offset_extra_bits = divmod(offset_bits, 8)
38
+ total_bytes = (num_bits + 7) // 8
39
+
40
+ if num_bits == 1:
41
+ if value == 1:
42
+ data[offset_bytes] |= 1 << offset_extra_bits
43
+ else:
44
+ data[offset_bytes] &= ~(1 << offset_extra_bits)
45
+ return
46
+
47
+ mask = (1 << num_bits) - 1
48
+ value <<= offset_extra_bits
49
+ value &= mask << offset_extra_bits
50
+ value_in_little_endian = value.to_bytes(total_bytes + 1, byteorder="little", signed=is_signed)
51
+ surrounding_bits = data[offset_bytes] & ((1 << offset_bits) - 1)
52
+
53
+ byte_idx = 0
54
+ data[offset_bytes] = value_in_little_endian[byte_idx]
55
+ data[offset_bytes] |= surrounding_bits
56
+
57
+ byte_idx += 1
58
+ while byte_idx < total_bytes:
59
+ data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx]
60
+ byte_idx += 1
61
+
62
+ bits_written = total_bytes * 8 - offset_extra_bits
63
+ if bits_written < num_bits:
64
+ surrounding_bits = data[offset_bytes + byte_idx] & ~((1 << offset_bits) - 1)
65
+ data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx] & ((1 << (8 - (bits_written % 8))) - 1)
66
+ data[offset_bytes + byte_idx] |= surrounding_bits
67
+
68
+
69
+ def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
70
+ """Read a bit-packed field from all elements at once, returning a numpy array.
71
+
72
+ :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
73
+ :param field_offset_bits: bit offset of the field within each element
74
+ :param field_width_bits: width of the field in bits (max 64)
75
+ :param is_signed: whether to sign-extend the result
76
+ :return: numpy array of field values
77
+ """
78
+ if field_width_bits == 1:
79
+ byte_idx = field_offset_bits // 8
80
+ bit_idx = field_offset_bits % 8
81
+ return ((raw_bytes_2d[:, byte_idx].astype(np.uint64) >> np.uint64(bit_idx)) &
82
+ np.uint64(1))
83
+
84
+ byte_start = field_offset_bits // 8
85
+ bit_shift = field_offset_bits % 8
86
+ bytes_needed = (bit_shift + field_width_bits + 7) // 8
87
+
88
+ # Accumulate up to 8 bytes (little-endian) into a uint64 per element,
89
+ # then shift out the low bits that precede the field.
90
+ result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
91
+ for b in range(min(bytes_needed, 8)):
92
+ result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
93
+ result >>= np.uint64(bit_shift)
94
+
95
+ # If the field spans more than 8 bytes (unaligned 64-bit field), merge the extra byte.
96
+ bits_so_far = 8 * min(bytes_needed, 8) - bit_shift
97
+ if bits_so_far < field_width_bits and bytes_needed > 8:
98
+ extra = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
99
+ result |= extra << np.uint64(bits_so_far)
100
+
101
+ if field_width_bits < 64:
102
+ result &= np.uint64((1 << field_width_bits) - 1)
103
+
104
+ if is_signed:
105
+ if field_width_bits == 64:
106
+ return result.view(np.int64)
107
+ sign_bit = np.uint64(1 << (field_width_bits - 1))
108
+ offset = -(1 << field_width_bits)
109
+ signed = result.astype(np.int64) + np.int64(offset)
110
+ result = np.where(result & sign_bit, signed, result.astype(np.int64))
111
+
112
+ return result
@@ -0,0 +1,168 @@
1
+ '''
2
+ Copyright (c) 2017 HERE Europe B.V.
3
+ See the LICENSE file in the root of this project for license details.
4
+ '''
5
+
6
+ import numpy as np
7
+
8
+ # Sign bits cache for the value reading.
9
+ _SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]
10
+
11
+
12
def make_field_reader(offset_bits, num_bits, is_signed):
    """Create a reader closure specialised for one bit-packed field.

    The returned callable has the signature ``reader(data, pos)`` and yields
    the value of the field for the structure that starts at byte position
    ``pos`` in ``data``. Byte offset, shift, masks and sign handling are all
    computed here, once, so each call only slices, shifts and masks.

    :param offset_bits: bit offset of the field inside the structure
    :param num_bits: width of the field in bits
    :param is_signed: whether the value is decoded as two's complement
        (ignored for one-bit fields, which always read as 0 or 1)
    :return: function ``reader(data, pos)`` returning the decoded value
    """
    first, shift = divmod(offset_bits, 8)
    nbytes = (num_bits + 7) // 8
    stop = first + nbytes
    value_mask = (1 << num_bits) - 1
    # Bits of the field that the ``nbytes``-long slice delivers once the
    # ``shift`` leading bits are discarded; fewer than ``num_bits`` means the
    # field spills into one more byte.
    avail = nbytes * 8 - shift
    spills = avail < num_bits

    if num_bits == 1:
        probe = 1 << shift

        def reader(data, pos):
            return int((data[pos + first] & probe) != 0)

        return reader

    if not is_signed:
        if spills:
            def reader(data, pos):
                raw = int.from_bytes(data[pos + first: pos + stop], byteorder="little")
                raw = (raw >> shift) | (data[pos + stop] << avail)
                return raw & value_mask
        elif shift:
            def reader(data, pos):
                raw = int.from_bytes(data[pos + first: pos + stop], byteorder="little")
                return (raw >> shift) & value_mask
        else:
            def reader(data, pos):
                raw = int.from_bytes(data[pos + first: pos + stop], byteorder="little")
                return raw & value_mask
        return reader

    # Signed fields: value = (bits below the sign bit) - (sign bit weight).
    top = _SIGN_BITS[num_bits]
    below_top = top - 1

    if spills:
        def reader(data, pos):
            raw = int.from_bytes(data[pos + first: pos + stop], byteorder="little")
            raw = ((raw >> shift) | (data[pos + stop] << avail)) & value_mask
            return (raw & below_top) - (raw & top)
    elif shift:
        def reader(data, pos):
            raw = int.from_bytes(data[pos + first: pos + stop], byteorder="little")
            raw = (raw >> shift) & value_mask
            return (raw & below_top) - (raw & top)
    else:
        def reader(data, pos):
            raw = int.from_bytes(data[pos + first: pos + stop], byteorder="little")
            raw &= value_mask
            return (raw & below_top) - (raw & top)
    return reader
77
+
78
+
79
def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
    """Decode one bit-packed field for every element in a single numpy pass.

    :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
    :param field_offset_bits: bit offset of the field within each element
    :param field_width_bits: width of the field in bits (max 64)
    :param is_signed: whether to sign-extend the result
    :return: numpy array of field values; uint64 for unsigned and one-bit
        fields, int64 for signed fields wider than one bit
    """
    first_byte, shift = divmod(field_offset_bits, 8)

    if field_width_bits == 1:
        # One-bit fields are returned as 0/1 and never sign-extended.
        column = raw_bytes_2d[:, first_byte].astype(np.uint64)
        return (column >> np.uint64(shift)) & np.uint64(1)

    # Number of bytes the field touches, counted from ``first_byte``.
    span = (shift + field_width_bits + 7) // 8
    low_bytes = min(span, 8)

    # Assemble up to eight bytes little-endian into a uint64 accumulator,
    # then drop the bits that precede the field inside the first byte.
    acc = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
    for i in range(low_bytes):
        acc |= raw_bytes_2d[:, first_byte + i].astype(np.uint64) << np.uint64(8 * i)
    acc >>= np.uint64(shift)

    # A field that straddles nine bytes (e.g. an unaligned 64-bit field) still
    # needs the low bits of the ninth byte merged in above what was gathered.
    gathered = 8 * low_bytes - shift
    if span > 8 and gathered < field_width_bits:
        ninth = raw_bytes_2d[:, first_byte + 8].astype(np.uint64)
        acc |= ninth << np.uint64(gathered)

    if field_width_bits < 64:
        acc &= np.uint64((1 << field_width_bits) - 1)

    if not is_signed:
        return acc
    if field_width_bits == 64:
        # Reinterpreting the bits as int64 is already the two's complement value.
        return acc.view(np.int64)

    # Sign-extend: subtract 2**width wherever the top bit of the field is set.
    top_bit = np.uint64(1 << (field_width_bits - 1))
    as_int64 = acc.astype(np.int64)
    wrapped = as_int64 + np.int64(-(1 << field_width_bits))
    return np.where(acc & top_bit, wrapped, as_int64)
123
+
124
+
125
def read_value(data, offset_bits, num_bits, is_signed):
    """Decode a single bit-packed value located ``offset_bits`` into ``data``.

    Shorthand for building a reader with :func:`make_field_reader` and
    invoking it once at byte position 0. Code that reads the same field many
    times should create the reader a single time with ``make_field_reader``
    and keep calling that instead.
    """
    return make_field_reader(offset_bits, num_bits, is_signed)(data, 0)
134
+
135
+
136
def write_value(data, offset_bits, num_bits, is_signed, value):
    """Store ``value`` as a bit-packed field inside ``data``, in place.

    Only the ``num_bits`` bits that belong to the field are modified; every
    other bit of the touched bytes keeps its previous content, so fields may
    be written in any order and existing values may be overwritten.

    :param data: mutable, integer-indexable byte buffer (e.g. ``bytearray``)
    :param offset_bits: bit offset of the field from the start of ``data``
    :param num_bits: width of the field in bits (at most 64)
    :param is_signed: accepted for interface compatibility; negative values
        are stored as two's complement truncated to ``num_bits`` either way
    :param value: integer to store; bits beyond ``num_bits`` are discarded
    :raises AssertionError: if ``num_bits`` exceeds 64
    """
    assert num_bits <= 64, 'Number of bits to write is greater than 64'

    first_byte, shift = divmod(offset_bits, 8)

    if num_bits == 1:
        # Only the exact value 1 sets the bit; anything else clears it.
        if value == 1:
            data[first_byte] |= 1 << shift
        else:
            data[first_byte] &= ~(1 << shift)
        return

    # Mask and payload are expressed relative to the first touched byte, so
    # both already account for the field's position inside that byte.
    field_mask = ((1 << num_bits) - 1) << shift
    payload = (value << shift) & field_mask

    # Read-modify-write each touched byte: clear the field's bits, keep the
    # neighbours, then merge in the corresponding payload bits.
    touched = (shift + num_bits + 7) // 8
    for i in range(touched):
        byte_mask = (field_mask >> (8 * i)) & 0xFF
        new_bits = (payload >> (8 * i)) & 0xFF
        idx = first_byte + i
        data[idx] = (data[idx] & ~byte_mask) | new_bits
@@ -8,7 +8,7 @@ import json
8
8
  import pandas as pd
9
9
  import numpy as np
10
10
 
11
- from .data_access import read_value
11
+ from .data_access import read_value, read_field_vectorized
12
12
  from .errors import CorruptResourceError
13
13
 
14
14
  SIZE_OFFSET_IN_BITS = 64
@@ -24,6 +24,7 @@ class ResourceBase:
24
24
  self._element_type = element_type
25
25
  self._element_types = [element_type]
26
26
  self._type_size_in_bytes = self._element_type._SIZE_IN_BYTES if self._element_type else 1
27
+ self._raw_numpy_2d = None
27
28
 
28
29
  def size_in_bytes(self):
29
30
  return len(self._mem)
@@ -35,6 +36,20 @@ class ResourceBase:
35
36
  offset = self._item_offset(index)
36
37
  return self._element_type(self._mem, offset)
37
38
 
39
+ def _as_numpy_2d(self):
40
+ """Return the raw data as a 2D numpy uint8 array of shape (n, struct_size).
41
+ Zero-copy via np.frombuffer on the mmap'd memory. Cached after first call.
42
+ """
43
+ if self._raw_numpy_2d is None:
44
+ n = len(self)
45
+ struct_size = self._type_size_in_bytes
46
+ raw = np.frombuffer(
47
+ self._mem[SIZE_OFFSET_IN_BYTES:SIZE_OFFSET_IN_BYTES + n * struct_size],
48
+ dtype=np.uint8,
49
+ )
50
+ self._raw_numpy_2d = raw.reshape(n, struct_size)
51
+ return self._raw_numpy_2d
52
+
38
53
  def _repr_attributes(self):
39
54
  return {
40
55
  "container_type": self.__class__.__name__,
@@ -60,14 +75,18 @@ class _VectorSlice:
60
75
  self._sequence = sequence
61
76
 
62
77
  def to_numpy(self, limit=None):
63
- indices = self._slice.indices(len(self._sequence))
64
- num_items = len(range(*indices)) if not limit else limit
65
- result = np.empty(
66
- shape=num_items,
67
- dtype=self._sequence._element_type.dtype()
68
- )
69
- for index, item in enumerate(self):
70
- result[index] = item.as_tuple()
78
+ raw_2d = self._sequence._as_numpy_2d()
79
+ sliced = raw_2d[self._slice]
80
+ if limit is not None:
81
+ sliced = sliced[:limit]
82
+
83
+ fields = self._sequence._element_type._FIELDS
84
+ dtype = self._sequence._element_type.dtype()
85
+ result = np.empty(sliced.shape[0], dtype=dtype)
86
+ for name, field in fields.items():
87
+ result[name] = read_field_vectorized(
88
+ sliced, field.offset, field.width, field.is_signed
89
+ )
71
90
  return result
72
91
 
73
92
  def to_data_frame(self, limit=None):
@@ -78,7 +97,13 @@ class _VectorSlice:
78
97
  yield self._sequence[i]
79
98
 
80
99
  def __getattr__(self, name):
81
- return pd.DataFrame(data=[[getattr(item, name)] for item in self], columns=[name])
100
+ try:
101
+ field = self._sequence._element_type._FIELDS[name]
102
+ except KeyError:
103
+ raise AttributeError("Field %s not found in structure" % name)
104
+ raw_2d = self._sequence._as_numpy_2d()[self._slice]
105
+ values = read_field_vectorized(raw_2d, field.offset, field.width, field.is_signed)
106
+ return pd.DataFrame(data=values, columns=[name])
82
107
 
83
108
  def __repr__(self):
84
109
  return "Displaying first 100 records:\n" + self.to_data_frame(limit=100).__repr__()
@@ -92,8 +117,20 @@ class Vector(ResourceBase):
92
117
  assert rem == 0, "Malformed vector"
93
118
  self._size = size
94
119
 
120
+ def to_numpy(self):
121
+ """Convert entire vector to a numpy structured array (vectorized)."""
122
+ raw_2d = self._as_numpy_2d()
123
+ fields = self._element_type._FIELDS
124
+ dtype = self._element_type.dtype()
125
+ result = np.empty(self._size, dtype=dtype)
126
+ for name, field in fields.items():
127
+ result[name] = read_field_vectorized(
128
+ raw_2d, field.offset, field.width, field.is_signed
129
+ )
130
+ return result
131
+
95
132
  def to_data_frame(self):
96
- return self[:].to_data_frame()
133
+ return pd.DataFrame(data=self.to_numpy())
97
134
 
98
135
  def __getitem__(self, index):
99
136
  if isinstance(index, slice):
@@ -106,11 +143,20 @@ class Vector(ResourceBase):
106
143
  return self._get_item(index)
107
144
 
108
145
  def __iter__(self):
109
- for i in range(len(self)):
110
- yield self._get_item(i)
146
+ mem = self._mem
147
+ element_type = self._element_type
148
+ size_bytes = self._type_size_in_bytes
149
+ for i in range(self._size):
150
+ yield element_type(mem, SIZE_OFFSET_IN_BYTES + size_bytes * i)
111
151
 
112
152
  def __getattr__(self, name):
113
- return pd.DataFrame(data=[[getattr(item, name)] for item in self], columns=[name])
153
+ try:
154
+ field = self._element_type._FIELDS[name]
155
+ except KeyError:
156
+ raise AttributeError("Field %s not found in structure" % name)
157
+ raw_2d = self._as_numpy_2d()
158
+ values = read_field_vectorized(raw_2d, field.offset, field.width, field.is_signed)
159
+ return pd.DataFrame(data=values, columns=[name])
114
160
 
115
161
  def __len__(self):
116
162
  return self._size
@@ -2,26 +2,33 @@ from collections import namedtuple
2
2
  import json
3
3
  import numpy as np
4
4
 
5
- from .data_access import read_value
5
+ from .data_access import make_field_reader
6
6
 
7
7
  FieldSignature = namedtuple(
8
8
  "FieldSignature", ["offset", "width", "is_signed", "dtype"])
9
9
 
10
10
 
11
11
  class Structure:
12
+ __slots__ = ('_mem', '_pos')
13
+ _READERS = {}
14
+
15
+ def __init_subclass__(cls, **kwargs):
16
+ super().__init_subclass__(**kwargs)
17
+ fields = cls.__dict__.get('_FIELDS')
18
+ if fields is not None:
19
+ cls._READERS = {name: make_field_reader(f.offset, f.width, f.is_signed)
20
+ for name, f in fields.items()}
21
+
12
22
  def __init__(self, mem, pos):
13
23
  self._mem = mem
14
24
  self._pos = pos
15
25
 
16
26
  def __getattr__(self, name):
17
27
  try:
18
- field = self._FIELDS[name]
28
+ reader = self._READERS[name]
19
29
  except KeyError:
20
30
  raise AttributeError("Field %s not found in structure" % name)
21
- return self._get_value(field)
22
-
23
- def _get_value(self, field):
24
- return read_value(self._mem, self._pos * 8 + field.offset, field.width, field.is_signed)
31
+ return reader(self._mem, self._pos)
25
32
 
26
33
  def __dir__(self):
27
34
  return self._FIELD_KEYS
@@ -31,20 +38,24 @@ class Structure:
31
38
  yield getattr(self, name)
32
39
 
33
40
  def as_dict(self):
34
- return {name: self._get_value(field) for name, field in self._FIELDS.items()}
41
+ mem, pos = self._mem, self._pos
42
+ return {name: reader(mem, pos) for name, reader in self._READERS.items()}
35
43
 
36
44
  def as_list(self):
37
- return [self._get_value(field) for field in self._FIELDS.values()]
45
+ mem, pos = self._mem, self._pos
46
+ return [reader(mem, pos) for reader in self._READERS.values()]
38
47
 
39
48
  def as_tuple(self):
40
- return tuple(self._get_value(field) for field in self._FIELDS.values())
49
+ mem, pos = self._mem, self._pos
50
+ return tuple(reader(mem, pos) for reader in self._READERS.values())
41
51
 
42
52
  @classmethod
43
53
  def dtype(cls):
44
54
  return [(name, np.dtype(field.dtype)) for name, field in cls._FIELDS.items()]
45
55
 
46
56
  def as_nparray(self):
47
- return np.array([tuple(self._get_value(field) for name, field in self._FIELDS.items())],
57
+ mem, pos = self._mem, self._pos
58
+ return np.array([tuple(reader(mem, pos) for reader in self._READERS.values())],
48
59
  dtype=self.dtype())
49
60
 
50
61
  def schema(self):
@@ -0,0 +1,92 @@
1
+ from collections import namedtuple
2
+ import json
3
+ import numpy as np
4
+
5
+ from .data_access import make_field_reader
6
+
7
+ FieldSignature = namedtuple(
8
+ "FieldSignature", ["offset", "width", "is_signed", "dtype"])
9
+
10
+
11
class Structure:
    """View of one bit-packed structure located inside a memory buffer.

    Subclasses are expected to provide ``_FIELDS`` (a mapping from field name
    to an object exposing ``offset``, ``width``, ``is_signed`` and ``dtype``),
    ``_FIELD_KEYS`` and ``_SCHEMA``; this class reads those attributes but
    does not define them. Field values are decoded on attribute access by
    per-class reader functions.
    """
    __slots__ = ('_mem', '_pos')

    # Field name -> reader(mem, pos). Replaced for every subclass that
    # declares its own ``_FIELDS``.
    _READERS = {}

    def __init_subclass__(cls, **kwargs):
        """Build one reader per field when the subclass declares ``_FIELDS``."""
        super().__init_subclass__(**kwargs)
        # Inspect only the subclass' own namespace so that a subclass without
        # its own ``_FIELDS`` keeps the readers it inherits.
        fields = cls.__dict__.get('_FIELDS')
        if fields is not None:
            cls._READERS = {name: make_field_reader(f.offset, f.width, f.is_signed)
                            for name, f in fields.items()}

    def __init__(self, mem, pos):
        """Bind the view to buffer ``mem`` at position ``pos``.

        ``pos`` is handed unchanged to the field readers as their position
        argument.
        """
        self._mem = mem
        self._pos = pos

    def __getattr__(self, name):
        """Return the value of field ``name``.

        :raises AttributeError: if the structure has no such field
        """
        try:
            reader = self._READERS[name]
        except KeyError:
            raise AttributeError("Field %s not found in structure" % name)
        return reader(self._mem, self._pos)

    def __dir__(self):
        """List the field names so they show up in ``dir()``."""
        return self._FIELD_KEYS

    def __iter__(self):
        """Yield the field values in ``_FIELD_KEYS`` order."""
        for name in self._FIELD_KEYS:
            yield getattr(self, name)

    def as_dict(self):
        """Return all field values as a ``{name: value}`` dictionary."""
        mem, pos = self._mem, self._pos
        return {name: reader(mem, pos) for name, reader in self._READERS.items()}

    def as_list(self):
        """Return all field values as a list, in reader order."""
        mem, pos = self._mem, self._pos
        return [reader(mem, pos) for reader in self._READERS.values()]

    def as_tuple(self):
        """Return all field values as a tuple, in reader order."""
        mem, pos = self._mem, self._pos
        return tuple(reader(mem, pos) for reader in self._READERS.values())

    @classmethod
    def dtype(cls):
        """Return a numpy structured-dtype description: ``[(name, dtype), ...]``."""
        return [(name, np.dtype(field.dtype)) for name, field in cls._FIELDS.items()]

    def as_nparray(self):
        """Return the structure as a one-element numpy structured array."""
        mem, pos = self._mem, self._pos
        return np.array([tuple(reader(mem, pos) for reader in self._READERS.values())],
                        dtype=self.dtype())

    def schema(self):
        """Return the ``_SCHEMA`` attribute provided by the subclass."""
        return self._SCHEMA

    @classmethod
    def _repr_attributes(cls):
        """Describe the structure layout (name, doc, per-field bit layout)."""
        return {
            "name": cls.__name__,
            "doc": cls.__doc__,
            "attributes": [
                {
                    "name": name,
                    "offset": signature.offset,
                    "width": signature.width,
                    "is_signed": signature.is_signed
                }
                for name, signature in cls._FIELDS.items()]
        }

    def __repr__(self):
        """Render the structure name and its current field values as JSON."""
        return json.dumps({
            "name": self.__class__.__name__,
            "attributes":
                {name: getattr(self, name) for name in self._FIELDS}
        }, indent=4)
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "flatdata-py"
7
- version = "0.4.10"
7
+ version = "0.4.11"
8
8
  description = "Python 3 implementation of Flatdata"
9
9
  readme = "README.md"
10
10
  authors = [
@@ -16,7 +16,7 @@ classifiers = [
16
16
  "Programming Language :: Python :: 3",
17
17
  ]
18
18
  dependencies = [
19
- "flatdata-generator==0.4.10",
19
+ "flatdata-generator==0.4.11",
20
20
  "numpy",
21
21
  "pandas",
22
22
  ]
File without changes