starfix 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- starfix/__init__.py +3 -0
- starfix/_version.py +24 -0
- starfix/arrow_digester.py +755 -0
- starfix/py.typed +0 -0
- starfix-0.1.0.dist-info/METADATA +53 -0
- starfix-0.1.0.dist-info/RECORD +7 -0
- starfix-0.1.0.dist-info/WHEEL +4 -0
starfix/__init__.py
ADDED
starfix/_version.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
# don't change, don't track in version control
from __future__ import annotations

# Public names of this module: each value is exposed under a dunder name
# and under a plain alias.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

version: str
__version__: str
__version_tuple__: tuple[int | str, ...]
version_tuple: tuple[int | str, ...]
commit_id: str | None
__commit_id__: str | None

# Version of this build as a string.
__version__ = version = '0.1.0'
# The same version split into its components.
__version_tuple__ = version_tuple = (0, 1, 0)

# No commit id is recorded for this build.
__commit_id__ = commit_id = None
|
|
@@ -0,0 +1,755 @@
|
|
|
1
|
+
"""Pure-Python implementation of the starfix Arrow logical hasher.
|
|
2
|
+
|
|
3
|
+
Implements the byte-layout specification defined in the starfix Rust crate
|
|
4
|
+
(``nauticalab/starfix docs/byte-layout-spec.md``).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import json
|
|
11
|
+
import struct
|
|
12
|
+
from collections import OrderedDict
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
import pyarrow as pa
|
|
17
|
+
|
|
18
|
+
# Three-byte prefix prepended to the digests returned by
# ArrowDigester.finalize() and ArrowDigester.hash_schema().
VERSION_BYTES = b"\x00\x00\x01"
# Separator used when composing entry paths for struct children and list levels.
DELIMITER = "/"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Bit-vector helper (LSB-first packing, matching bitvec<u8, Lsb0>)
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
class _BitVec:
|
|
27
|
+
"""Minimal LSB-first u8 bit vector compatible with Rust bitvec<u8, Lsb0>.
|
|
28
|
+
|
|
29
|
+
Matches Arrow's native validity bitmap layout.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
__slots__ = ("_bytes", "_len")
|
|
33
|
+
|
|
34
|
+
def __init__(self) -> None:
|
|
35
|
+
self._bytes = bytearray()
|
|
36
|
+
self._len = 0
|
|
37
|
+
|
|
38
|
+
def push(self, bit: bool) -> None:
|
|
39
|
+
byte_idx = self._len >> 3
|
|
40
|
+
bit_idx = self._len & 7 # LSB-first: bit 0 is least significant
|
|
41
|
+
if byte_idx >= len(self._bytes):
|
|
42
|
+
self._bytes.append(0)
|
|
43
|
+
if bit:
|
|
44
|
+
self._bytes[byte_idx] |= 1 << bit_idx
|
|
45
|
+
self._len += 1
|
|
46
|
+
|
|
47
|
+
def extend_true(self, count: int) -> None:
|
|
48
|
+
for _ in range(count):
|
|
49
|
+
self.push(True)
|
|
50
|
+
|
|
51
|
+
def __len__(self) -> int:
|
|
52
|
+
return self._len
|
|
53
|
+
|
|
54
|
+
def raw_bytes(self) -> bytes:
|
|
55
|
+
return bytes(self._bytes)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
# Schema / DataType serialization (spec Section 2)
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
|
|
62
|
+
def _data_type_to_value(dt: pa.DataType) -> object:
    """Build the JSON-compatible description of a pyarrow DataType.

    The result follows the canonical form of spec Section 2.1:

    - Dictionary types are replaced by the description of their value type.
    - Struct fields are listed sorted by name.
    - List and LargeList are both reported as ``LargeList``.
    - Everything that is not a nested type is delegated to
      ``_primitive_data_type_string`` (which also maps Utf8/Binary to their
      large variants).

    NOTE(review): the Map branch describes only the key field (renamed to
    "entries") and always reports ``False`` as the second element — confirm
    this against the reference serializer.
    """
    import pyarrow as pa

    kinds = pa.types

    if kinds.is_dictionary(dt):
        return _data_type_to_value(dt.value_type)

    if kinds.is_struct(dt):
        members = sorted(
            (dt.field(pos) for pos in range(dt.num_fields)),
            key=lambda member: member.name,
        )
        return {"Struct": [_inner_field_to_value(member) for member in members]}

    if kinds.is_list(dt) or kinds.is_large_list(dt):
        element = _element_type_to_value(dt.value_field)
        return {"LargeList": element}

    if kinds.is_fixed_size_list(dt):
        element = _element_type_to_value(dt.value_field)
        return {"FixedSizeList": [element, dt.list_size]}

    if kinds.is_map(dt):
        entries_field = dt.key_field.with_name("entries")
        return {"Map": [_inner_field_to_value(entries_field), False]}

    # Leaf types: must match the Arrow-Rust serde output.
    return _primitive_data_type_string(dt)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _primitive_data_type_string(dt: pa.DataType) -> object:
    """Return the serde_json-style value for a non-nested Arrow type.

    Parameterless types map to a plain string (Utf8/Binary are reported as
    their large variants); parameterized types (decimals, times, timestamps,
    durations, fixed-size binary) map to a single-key dict.

    Raises:
        NotImplementedError: if ``dt`` is none of the handled types.
    """
    import pyarrow as pa

    kinds = pa.types

    plain_names = {
        pa.bool_(): "Boolean",
        pa.int8(): "Int8",
        pa.uint8(): "UInt8",
        pa.int16(): "Int16",
        pa.uint16(): "UInt16",
        pa.int32(): "Int32",
        pa.uint32(): "UInt32",
        pa.int64(): "Int64",
        pa.uint64(): "UInt64",
        pa.float16(): "Float16",
        pa.float32(): "Float32",
        pa.float64(): "Float64",
        pa.date32(): "Date32",
        pa.date64(): "Date64",
        pa.utf8(): "LargeUtf8",
        pa.large_utf8(): "LargeUtf8",
        pa.binary(): "LargeBinary",
        pa.large_binary(): "LargeBinary",
    }
    plain = plain_names.get(dt)
    if plain is not None:
        return plain

    if kinds.is_decimal(dt):
        decimal_tags = {
            32: "Decimal32",
            64: "Decimal64",
            128: "Decimal128",
            256: "Decimal256",
        }
        tag = decimal_tags.get(dt.bit_width)
        if tag is not None:
            return {tag: [dt.precision, dt.scale]}
        # Any other width falls through to the remaining checks below.

    if kinds.is_time32(dt):
        if dt.unit == "s":
            return {"Time32": "Second"}
        return {"Time32": "Millisecond"}

    if kinds.is_time64(dt):
        if dt.unit == "us":
            return {"Time64": "Microsecond"}
        return {"Time64": "Nanosecond"}

    unit_names = {
        "s": "Second",
        "ms": "Millisecond",
        "us": "Microsecond",
        "ns": "Nanosecond",
    }

    if kinds.is_timestamp(dt):
        # dt.tz is None for zone-less timestamps, which serializes as null.
        return {"Timestamp": [unit_names[dt.unit], dt.tz]}

    if kinds.is_duration(dt):
        return {"Duration": unit_names[dt.unit]}

    if kinds.is_fixed_size_binary(dt):
        return {"FixedSizeBinary": dt.byte_width}

    raise NotImplementedError(f"Unsupported data type: {dt}")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _inner_field_to_value(field: pa.Field) -> dict:
    """Describe a named field as a dict with ``name``, ``data_type`` and ``nullable``."""
    described: dict = {"name": field.name}
    described["data_type"] = _data_type_to_value(field.type)
    described["nullable"] = field.nullable
    return described
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _element_type_to_value(field: pa.Field) -> dict:
    """Describe a container element field without its name.

    Produces only ``data_type`` and ``nullable``; used for the element of
    list and fixed-size list types, matching Rust ``element_type_to_value``.
    """
    element_type = _data_type_to_value(field.type)
    return {"data_type": element_type, "nullable": field.nullable}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _sort_json_value(value: object) -> object:
|
|
174
|
+
"""Recursively sort JSON object keys (matching Rust ``sort_json_value``)."""
|
|
175
|
+
if isinstance(value, dict):
|
|
176
|
+
return OrderedDict(sorted((k, _sort_json_value(v)) for k, v in value.items()))
|
|
177
|
+
if isinstance(value, list):
|
|
178
|
+
return [_sort_json_value(v) for v in value]
|
|
179
|
+
return value
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _serialized_schema(schema: pa.Schema) -> str:
    """Serialize ``schema`` to the compact canonical JSON string that is hashed.

    The result is a JSON object keyed by top-level field name (sorted, like a
    Rust ``BTreeMap``); each value holds the field's ``data_type`` and
    ``nullable`` with nested object keys sorted as well.  If two fields share
    a name, the later one replaces the earlier one.

    Non-ASCII characters (for example in field names or time zones) are
    written as-is rather than as ``\\uXXXX`` escapes, because serde_json emits
    raw UTF-8 and the serialized bytes feed directly into the schema hash.
    """
    by_name: dict[str, object] = {}
    for position in range(len(schema)):
        field = schema.field(position)
        described = {
            "data_type": _data_type_to_value(field.type),
            "nullable": field.nullable,
        }
        by_name[field.name] = _sort_json_value(described)

    # Sort by field name (BTreeMap ordering).
    ordered = OrderedDict(sorted(by_name.items()))
    # ensure_ascii=False keeps the output byte-compatible with serde_json.
    return json.dumps(ordered, separators=(",", ":"), ensure_ascii=False)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _hash_schema(schema: pa.Schema) -> bytes:
    """Return the raw SHA-256 digest of the schema's canonical JSON string."""
    canonical_json = _serialized_schema(schema)
    hasher = hashlib.sha256(canonical_json.encode())
    return hasher.digest()
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# ---------------------------------------------------------------------------
|
|
201
|
+
# DigestBufferType (spec Section 3: null_bits, structural, data)
|
|
202
|
+
#
|
|
203
|
+
# Each entry is a 3-tuple: (BitVec|None, sha256|None, sha256|None)
|
|
204
|
+
# [0] null_bits – present when nullable
|
|
205
|
+
# [1] structural – present for list entries
|
|
206
|
+
# [2] data – present for leaf and list-leaf entries
|
|
207
|
+
# ---------------------------------------------------------------------------
|
|
208
|
+
|
|
209
|
+
def _new_data_only(nullable: bool) -> tuple:
|
|
210
|
+
"""Leaf field entry (spec Section 3 — data-only or validity+data)."""
|
|
211
|
+
return (_BitVec() if nullable else None, None, hashlib.sha256())
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _new_structural_only(nullable: bool) -> tuple:
|
|
215
|
+
"""List-level entry whose value is a struct or nested list (spec Section 3)."""
|
|
216
|
+
return (_BitVec() if nullable else None, hashlib.sha256(), None)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _new_list_leaf(nullable: bool) -> tuple:
|
|
220
|
+
"""List-level entry whose value is a leaf type (spec Section 3)."""
|
|
221
|
+
return (_BitVec() if nullable else None, hashlib.sha256(), hashlib.sha256())
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _new_validity_only() -> tuple:
    """Create the entry for a nullable parent (spec Section 3).

    Returns ``(null_bits, None, None)``: only a validity bit vector, with no
    structural or data hasher.
    """
    null_bits = _BitVec()
    return (null_bits, None, None)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# ---------------------------------------------------------------------------
|
|
230
|
+
# Type decomposition into BTreeMap entries (spec Sections 3.4, 3.5)
|
|
231
|
+
# ---------------------------------------------------------------------------
|
|
232
|
+
|
|
233
|
+
def _extract_type_entries(
    data_type: pa.DataType,
    nullable: bool,
    path: str,
    out: dict[str, tuple],
) -> None:
    """Register in ``out`` the digest entries needed for ``data_type`` at ``path``.

    Implements the recursive decomposition of spec Section 3 on the
    normalized type:

    - Struct: no entry of its own; each child (sorted by name) is registered
      at ``path + "/" + child_name`` (or just the child name if ``path`` is
      empty).
    - List: a validity-only entry at ``path`` when ``nullable``, plus a
      list-level entry at ``path + "/"`` — structural-only when the element
      is a struct or list (followed by recursion into the element at that
      same path), or a list-leaf entry otherwise.
    - Anything else: a data entry at ``path``.

    NOTE(review): when the element is itself a nullable list, the recursive
    call stores a validity-only entry under the same ``path + "/"`` key that
    the structural-only entry was stored under just before, replacing it —
    confirm this matches the reference implementation.
    """
    import pyarrow as pa

    def is_any_list(candidate) -> bool:
        return pa.types.is_large_list(candidate) or pa.types.is_list(candidate)

    canonical = _normalize_data_type(data_type)

    if pa.types.is_struct(canonical):
        # Struct is transparent (spec Section 3.5): only its children get entries.
        members = sorted(
            (canonical.field(pos) for pos in range(canonical.num_fields)),
            key=lambda member: member.name,
        )
        prefix = f"{path}{DELIMITER}" if path else ""
        for member in members:
            _extract_type_entries(
                member.type, member.nullable, f"{prefix}{member.name}", out
            )
        return

    if not is_any_list(canonical):
        # Leaf type: a single data entry.
        out[path] = _new_data_only(nullable)
        return

    # List (spec Section 3.4): optional validity at `path`, list level at path + "/".
    if nullable:
        out[path] = _new_validity_only()

    element = canonical.value_field
    element_path = f"{path}{DELIMITER}"
    element_canonical = _normalize_data_type(element.type)

    if pa.types.is_struct(element_canonical) or is_any_list(element_canonical):
        # Nested element: lengths only here, the element registers its own entries.
        out[element_path] = _new_structural_only(element.nullable)
        _extract_type_entries(element.type, element.nullable, element_path, out)
    else:
        # Leaf element: lengths and data share one entry.
        out[element_path] = _new_list_leaf(element.nullable)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _extract_fields(field: pa.Field, parent: str, out: dict[str, tuple]) -> None:
    """Register the digest entries of a schema field under ``parent`` (record-batch path).

    The field's path is ``parent + "/" + field.name``, or just ``field.name``
    when ``parent`` is empty.
    """
    if parent:
        field_path = f"{parent}{DELIMITER}{field.name}"
    else:
        field_path = field.name
    _extract_type_entries(field.type, field.nullable, field_path, out)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
# ---------------------------------------------------------------------------
|
|
292
|
+
# Type normalization (spec Section 2.1 — type canonicalization)
|
|
293
|
+
# ---------------------------------------------------------------------------
|
|
294
|
+
|
|
295
|
+
def _normalize_data_type(dt: pa.DataType) -> pa.DataType:
    """Return the canonical form of ``dt``, normalizing nested types recursively.

    - Dictionary -> normalized value type
    - Utf8 -> LargeUtf8, Binary -> LargeBinary
    - List / LargeList -> LargeList of the normalized element field
    - Struct, FixedSizeList, Map -> same kind, rebuilt from normalized children
    - anything else is returned unchanged

    For maps only the key and item *types* are carried over, together with
    ``keys_sorted``.
    """
    import pyarrow as pa

    kinds = pa.types

    if kinds.is_dictionary(dt):
        return _normalize_data_type(dt.value_type)

    small_to_large = (
        (pa.utf8(), pa.large_utf8()),
        (pa.binary(), pa.large_binary()),
    )
    for small, large in small_to_large:
        if dt == small:
            return large

    if kinds.is_list(dt) or kinds.is_large_list(dt):
        return pa.large_list(_normalize_field(dt.value_field))

    if kinds.is_struct(dt):
        members = [_normalize_field(dt.field(pos)) for pos in range(dt.num_fields)]
        return pa.struct(members)

    if kinds.is_fixed_size_list(dt):
        # pa.list_ with an explicit size builds a fixed-size list type.
        return pa.list_(_normalize_field(dt.value_field), dt.list_size)

    if kinds.is_map(dt):
        key_type = _normalize_field(dt.key_field).type
        item_type = _normalize_field(dt.item_field).type
        return pa.map_(key_type, item_type, keys_sorted=dt.keys_sorted)

    return dt
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _normalize_field(field: pa.Field) -> pa.Field:
    """Return a new field with the same name and nullability and a normalized type."""
    import pyarrow as pa

    canonical_type = _normalize_data_type(field.type)
    return pa.field(field.name, canonical_type, field.nullable)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
# ---------------------------------------------------------------------------
|
|
328
|
+
# Array normalization helper
|
|
329
|
+
# ---------------------------------------------------------------------------
|
|
330
|
+
|
|
331
|
+
def _normalize_array(data_type, array):
    """Cast small Arrow variants to their large canonical equivalents.

    Utf8, Binary and List arrays are cast to LargeUtf8, LargeBinary and
    LargeList (keeping the list's element field).  Dictionary arrays are cast
    to their value type and then normalized again.  Every other type is
    passed through untouched.

    Returns:
        A ``(effective_data_type, effective_array)`` pair.
    """
    import pyarrow as pa

    kinds = pa.types

    if kinds.is_string(data_type) and not kinds.is_large_string(data_type):
        widened = pa.large_utf8()
        return widened, array.cast(widened)

    if kinds.is_binary(data_type) and not kinds.is_large_binary(data_type):
        widened = pa.large_binary()
        return widened, array.cast(widened)

    if kinds.is_list(data_type) and not kinds.is_large_list(data_type):
        widened = pa.large_list(data_type.value_field)
        return widened, array.cast(widened)

    if kinds.is_dictionary(data_type):
        # Decode the dictionary, then normalize the decoded values as well.
        decoded_type = data_type.value_type
        return _normalize_array(decoded_type, array.cast(decoded_type))

    return data_type, array
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
# ---------------------------------------------------------------------------
|
|
352
|
+
# Recursive traversal — populates BTreeMap entries from array data
|
|
353
|
+
# (spec Sections 3.1–3.5)
|
|
354
|
+
# ---------------------------------------------------------------------------
|
|
355
|
+
|
|
356
|
+
def _combine_null_masks(own_valid, ancestor_valid):
|
|
357
|
+
"""AND-combine two validity lists. Returns None if all valid."""
|
|
358
|
+
if own_valid is None and ancestor_valid is None:
|
|
359
|
+
return None
|
|
360
|
+
if own_valid is None:
|
|
361
|
+
return ancestor_valid
|
|
362
|
+
if ancestor_valid is None:
|
|
363
|
+
return own_valid
|
|
364
|
+
return [a and b for a, b in zip(own_valid, ancestor_valid)]
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _get_validity_list(array):
|
|
368
|
+
"""Return a list of bools (True=valid) or None if no nulls."""
|
|
369
|
+
if array.null_count == 0:
|
|
370
|
+
return None
|
|
371
|
+
return [array[i].is_valid for i in range(len(array))]
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _traverse_and_update(data_type, nullable, array, path, ancestor_nulls, fields):
    """Route one array to the list, struct or leaf traversal.

    The array is first normalized to its large/decoded form and its type
    canonicalized; the canonical type then decides which traversal runs.

    Parameters:
        data_type: Arrow data type of the array
        nullable: whether this position is nullable
        array: the Arrow array to hash
        path: entry key for this position
        ancestor_nulls: validity list inherited from enclosing structs, or None
        fields: mapping from path to digest entry; the entries are updated in place
    """
    import pyarrow as pa

    widened_type, widened_array = _normalize_array(data_type, array)
    canonical = _normalize_data_type(widened_type)

    if pa.types.is_struct(canonical):
        _traverse_struct(widened_array, nullable, path, ancestor_nulls, fields)
        return

    if pa.types.is_large_list(canonical):
        _traverse_list(
            widened_array, canonical.value_field, nullable, path, ancestor_nulls, fields
        )
        return

    _traverse_leaf(widened_type, widened_array, path, ancestor_nulls, fields)
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _traverse_list(array, value_field, nullable, path, ancestor_nulls, fields):
    """Traverse a list array and feed its validity, lengths and elements (spec Section 3.4).

    Steps:

    1. When ``nullable`` and an entry with a bit vector exists at ``path``,
       the effective validity (own nulls AND ancestor nulls) of every row is
       appended to it.
    2. For every effectively valid row, the row's element count is written as
       a little-endian u64 to the structural hasher of the entry at
       ``path + "/"`` (if there is one), and the row's elements are traversed
       recursively at that same path.  Null rows contribute nothing.

    Parameters:
        array: list array to traverse; must expose ``offsets`` and ``values``
        value_field: field describing the list elements
        nullable: whether the list position itself is nullable
        path: entry key of the list
        ancestor_nulls: validity list inherited from enclosing structs, or None
        fields: mapping from path to digest entry; the entries are updated in place
    """
    # The effective validity is needed both for the bit vector and for
    # skipping null rows, so compute it once.
    effective_nulls = _combine_null_masks(_get_validity_list(array), ancestor_nulls)
    row_count = len(array)

    if nullable:
        entry = fields.get(path)
        null_bits = entry[0] if entry is not None else None
        if null_bits is not None:
            if effective_nulls is not None:
                for is_row_valid in effective_nulls:
                    null_bits.push(is_row_valid)
            else:
                null_bits.extend_true(row_count)

    list_path = f"{path}{DELIMITER}"

    # Loop invariants: the list-level entry, the offsets and the flat values.
    list_entry = fields.get(list_path)
    structural = list_entry[1] if list_entry is not None else None
    offsets = array.offsets.to_pylist()
    values = array.values

    for row in range(row_count):
        if effective_nulls is not None and not effective_nulls[row]:
            continue  # null rows write no length and no element data

        start = offsets[row]
        end = offsets[row + 1]
        sub_array = values.slice(start, end - start)

        if structural is not None:
            structural.update(struct.pack("<Q", len(sub_array)))

        _traverse_and_update(
            value_field.type,
            value_field.nullable,
            sub_array,
            list_path,
            None,  # list elements don't inherit ancestor struct nulls
            fields,
        )
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _traverse_struct(array, nullable, path, ancestor_nulls, fields):
    """Traverse a struct array; the struct itself is transparent (spec Section 3.5).

    No entry is written for the struct.  Its children are visited in
    name order at ``path + "/" + child_name`` (or just the child name when
    ``path`` is empty), each receiving the validity mask to inherit.

    When ``nullable`` is true, that mask is the struct's own validity
    AND-combined with ``ancestor_nulls``; otherwise ``ancestor_nulls`` is
    passed down unchanged and the struct's own validity is not consulted.
    """
    if nullable:
        inherited = _combine_null_masks(_get_validity_list(array), ancestor_nulls)
    else:
        inherited = ancestor_nulls

    struct_type = array.type
    # Keep each child's original position so the matching child array can be
    # fetched after sorting by name.
    children = sorted(
        ((pos, struct_type.field(pos)) for pos in range(struct_type.num_fields)),
        key=lambda item: item[1].name,
    )

    for pos, child_field in children:
        child_array = array.field(pos)
        if path:
            child_path = f"{path}{DELIMITER}{child_field.name}"
        else:
            child_path = child_field.name
        _traverse_and_update(
            child_field.type,
            child_field.nullable,
            child_array,
            child_path,
            inherited,
            fields,
        )
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def _traverse_leaf(data_type, array, path, ancestor_nulls, fields):
    """Feed a leaf array into the entry registered at ``path`` (spec Sections 3.1–3.3).

    Does nothing when no entry exists at ``path`` or when that entry has no
    data hasher.  Otherwise the effective validity (own nulls AND ancestor
    nulls) is appended to the entry's bit vector, if it has one, and the
    valid elements are hashed into the data hasher.
    """
    slot = fields.get(path)
    if slot is None:
        return

    validity_bits, _unused_structural, data_hasher = slot
    if data_hasher is None:
        return

    mask = _combine_null_masks(_get_validity_list(array), ancestor_nulls)

    if validity_bits is not None:
        if mask is None:
            # No nulls anywhere: record every element as valid.
            validity_bits.extend_true(len(array))
        else:
            for flag in mask:
                validity_bits.push(flag)

    # Null elements are skipped inside the data hashing routine.
    _hash_leaf_data(data_type, array, data_hasher, mask)
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _hash_leaf_data(data_type, array, data_digest, effective_nulls):
    """Dispatch leaf data hashing by type (spec Sections 3.1–3.3).

    Boolean, LargeBinary and LargeUtf8 arrays have dedicated routines; any
    other type is hashed as fixed-width data when a byte width is known for it.

    Raises:
        NotImplementedError: if the type has no known fixed byte width.
    """
    import pyarrow as pa

    kinds = pa.types

    if kinds.is_boolean(data_type):
        _hash_boolean_data(array, data_digest, effective_nulls)
        return
    if kinds.is_large_binary(data_type):
        _hash_binary_data(array, data_digest, effective_nulls)
        return
    if kinds.is_large_string(data_type):
        _hash_string_data(array, data_digest, effective_nulls)
        return

    width = _element_size_for_type(data_type)
    if width is None:
        raise NotImplementedError(f"Unsupported leaf type: {data_type}")
    _hash_fixed_size_data(array, data_digest, width, effective_nulls)
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
# ---------------------------------------------------------------------------
|
|
524
|
+
# Leaf data hashing (spec Sections 3.1–3.3)
|
|
525
|
+
# ---------------------------------------------------------------------------
|
|
526
|
+
|
|
527
|
+
def _element_size_for_type(dt: pa.DataType) -> int | None:
    """Return the per-element byte width of a fixed-width type, else ``None``.

    Covers the integer, floating-point and date types, Time32 (4 bytes),
    Time64 (8 bytes), decimals (``bit_width // 8``) and fixed-size binary
    (``byte_width``).

    NOTE(review): Timestamp and Duration are accepted by the schema
    serializer but have no width here, so hashing such columns ends in the
    caller's "Unsupported leaf type" error — confirm whether that is intended.
    """
    import pyarrow as pa

    groups = (
        (1, (pa.int8(), pa.uint8())),
        (2, (pa.int16(), pa.uint16(), pa.float16())),
        (4, (pa.int32(), pa.uint32(), pa.float32(), pa.date32())),
        (8, (pa.int64(), pa.uint64(), pa.float64(), pa.date64())),
    )
    widths = {member: width for width, members in groups for member in members}

    known = widths.get(dt)
    if known is not None:
        return known

    kinds = pa.types
    if kinds.is_time32(dt):
        return 4
    if kinds.is_time64(dt):
        return 8
    if kinds.is_decimal(dt):
        return dt.bit_width // 8
    if kinds.is_fixed_size_binary(dt):
        return dt.byte_width
    return None
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def _hash_fixed_size_data(arr, data_digest, element_size: int, effective_nulls) -> None:
|
|
551
|
+
"""Hash a fixed-width array's data bytes (spec Section 3.1)."""
|
|
552
|
+
bufs = arr.buffers()
|
|
553
|
+
data_buf = bufs[1]
|
|
554
|
+
offset = arr.offset
|
|
555
|
+
|
|
556
|
+
raw = data_buf.to_pybytes()
|
|
557
|
+
start = offset * element_size
|
|
558
|
+
|
|
559
|
+
if effective_nulls is None:
|
|
560
|
+
# Non-nullable or all valid: feed entire contiguous buffer
|
|
561
|
+
end = start + len(arr) * element_size
|
|
562
|
+
data_digest.update(raw[start:end])
|
|
563
|
+
else:
|
|
564
|
+
# Nullable: feed only valid elements
|
|
565
|
+
has_nulls = any(not v for v in effective_nulls)
|
|
566
|
+
if has_nulls:
|
|
567
|
+
for i in range(len(arr)):
|
|
568
|
+
if effective_nulls[i]:
|
|
569
|
+
pos = start + i * element_size
|
|
570
|
+
data_digest.update(raw[pos:pos + element_size])
|
|
571
|
+
else:
|
|
572
|
+
end = start + len(arr) * element_size
|
|
573
|
+
data_digest.update(raw[start:end])
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def _hash_boolean_data(arr, data_digest, effective_nulls) -> None:
    """Hash the values of a boolean array as packed bits (spec Section 3.2).

    The values of the valid elements (all elements when ``effective_nulls``
    is None) are packed LSB-first into a fresh bit vector, and its bytes are
    fed to ``data_digest`` in one call.
    """
    packed = _BitVec()
    for index in range(len(arr)):
        if effective_nulls is None or effective_nulls[index]:
            packed.push(arr[index].as_py())
    data_digest.update(packed.raw_bytes())
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def _hash_binary_data(arr, data_digest, effective_nulls) -> None:
|
|
590
|
+
"""Hash binary array data (spec Section 3.3)."""
|
|
591
|
+
if effective_nulls is None:
|
|
592
|
+
for i in range(len(arr)):
|
|
593
|
+
val = arr[i].as_py()
|
|
594
|
+
data_digest.update(struct.pack("<Q", len(val)))
|
|
595
|
+
data_digest.update(val)
|
|
596
|
+
else:
|
|
597
|
+
for i in range(len(arr)):
|
|
598
|
+
if effective_nulls[i]:
|
|
599
|
+
val = arr[i].as_py()
|
|
600
|
+
data_digest.update(struct.pack("<Q", len(val)))
|
|
601
|
+
data_digest.update(val)
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def _hash_string_data(arr, data_digest, effective_nulls) -> None:
|
|
605
|
+
"""Hash string array data (spec Section 3.3)."""
|
|
606
|
+
if effective_nulls is None:
|
|
607
|
+
for i in range(len(arr)):
|
|
608
|
+
val = arr[i].as_py().encode("utf-8")
|
|
609
|
+
data_digest.update(struct.pack("<Q", len(val)))
|
|
610
|
+
data_digest.update(val)
|
|
611
|
+
else:
|
|
612
|
+
for i in range(len(arr)):
|
|
613
|
+
if effective_nulls[i]:
|
|
614
|
+
val = arr[i].as_py().encode("utf-8")
|
|
615
|
+
data_digest.update(struct.pack("<Q", len(val)))
|
|
616
|
+
data_digest.update(val)
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
# ---------------------------------------------------------------------------
|
|
620
|
+
# Finalization (spec Section 4)
|
|
621
|
+
# ---------------------------------------------------------------------------
|
|
622
|
+
|
|
623
|
+
def _finalize_digest(final_digest, entry: tuple) -> None:
|
|
624
|
+
"""Finalize a single BTreeMap entry into the final combining digest (spec Section 4)."""
|
|
625
|
+
null_bits, structural, data = entry
|
|
626
|
+
|
|
627
|
+
# 1. null_bits (if present — nullable entries only)
|
|
628
|
+
if null_bits is not None:
|
|
629
|
+
final_digest.update(struct.pack("<Q", len(null_bits)))
|
|
630
|
+
for b in null_bits.raw_bytes():
|
|
631
|
+
final_digest.update(bytes([b]))
|
|
632
|
+
|
|
633
|
+
# 2. structural (if present — list entries only)
|
|
634
|
+
if structural is not None:
|
|
635
|
+
final_digest.update(structural.digest())
|
|
636
|
+
|
|
637
|
+
# 3. data (if present — leaf and list-leaf entries only)
|
|
638
|
+
if data is not None:
|
|
639
|
+
final_digest.update(data.digest())
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
# ---------------------------------------------------------------------------
|
|
643
|
+
# Public API
|
|
644
|
+
# ---------------------------------------------------------------------------
|
|
645
|
+
|
|
646
|
+
class ArrowDigester:
|
|
647
|
+
"""Pure-Python equivalent of the Rust ``ArrowDigester``.
|
|
648
|
+
|
|
649
|
+
Produces identical SHA-256 hashes with a 3-byte version prefix.
|
|
650
|
+
"""
|
|
651
|
+
|
|
652
|
+
def __init__(self, schema: pa.Schema) -> None:
    """Prepare a digester for record batches of ``schema``.

    Hashes the schema up front and registers one digest entry per path
    derived from the schema's fields, stored in key-sorted order.
    """
    self._schema = schema
    self._schema_digest = _hash_schema(schema)

    entries: dict[str, tuple] = {}
    for position in range(len(schema)):
        _extract_fields(schema.field(position), "", entries)

    # Dicts keep insertion order, so inserting in key order mimics a BTreeMap
    # of path -> (BitVec|None, sha256|None, sha256|None).
    self._fields: dict[str, tuple] = {key: entries[key] for key in sorted(entries)}
|
|
661
|
+
|
|
662
|
+
def update(self, record_batch: pa.RecordBatch) -> None:
    """Feed a RecordBatch into the running digest (spec Sections 3–5).

    Each top-level column is traversed recursively using the field
    definitions of the batch's own schema, and the data is routed into the
    entries registered at construction time.
    """
    batch_schema = record_batch.schema
    for position in range(len(batch_schema)):
        column_field = batch_schema.field(position)
        column = record_batch.column(position)
        _traverse_and_update(
            column_field.type,
            column_field.nullable,
            column,
            column_field.name,
            None,  # no ancestor struct nulls at top level
            self._fields,
        )
|
|
678
|
+
|
|
679
|
+
def finalize(self) -> bytes:
    """Return the versioned hash of everything fed so far (spec Section 5).

    The final SHA-256 covers the schema digest followed by every entry in
    path order; the result is prefixed with ``VERSION_BYTES``.
    """
    combined = hashlib.sha256()
    combined.update(self._schema_digest)
    for path in sorted(self._fields):
        _finalize_digest(combined, self._fields[path])
    return VERSION_BYTES + combined.digest()
|
|
686
|
+
|
|
687
|
+
# -- Convenience class methods ------------------------------------------
|
|
688
|
+
|
|
689
|
+
@staticmethod
def hash_schema(schema: pa.Schema) -> bytes:
    """Return the schema's SHA-256 digest prefixed with ``VERSION_BYTES``."""
    schema_digest = _hash_schema(schema)
    return VERSION_BYTES + schema_digest
|
|
692
|
+
|
|
693
|
+
@staticmethod
def hash_record_batch(record_batch: pa.RecordBatch) -> bytes:
    """Hash a single record batch using a digester built from its own schema."""
    digester = ArrowDigester(record_batch.schema)
    digester.update(record_batch)
    return digester.finalize()
|
|
698
|
+
|
|
699
|
+
@staticmethod
def hash_table(table: pa.Table) -> bytes:
    """Hash a full table by feeding each of its batches to one digester.

    The digester is created from ``table.schema`` and updated with every
    batch returned by ``table.to_batches()`` before being finalized.
    """
    digester = ArrowDigester(table.schema)
    batches = table.to_batches()
    for record_batch in batches:
        digester.update(record_batch)
    return digester.finalize()
|
|
706
|
+
|
|
707
|
+
@staticmethod
def hash_array(array: pa.Array) -> bytes:
    """Hash a single array (spec Section 6).

    The digest is seeded with the compact JSON description of the
    normalized data type, then per-path entries are built, populated and
    finalized with the same helpers the record-batch path uses. The result
    is ``VERSION_BYTES`` followed by the SHA-256 digest.
    """
    import pyarrow as pa

    # Dictionary-encoded input is hashed as its plain value type.
    if pa.types.is_dictionary(array.type):
        effective_type = array.type.value_type
        effective_array = array.cast(effective_type)
    else:
        effective_type = array.type
        effective_array = array

    # Step 1: type metadata — normalized type rendered as key-sorted JSON
    # with no whitespace between separators.
    type_description = _sort_json_value(
        _data_type_to_value(_normalize_data_type(effective_type))
    )
    type_json = json.dumps(type_description, separators=(",", ":"))
    hasher = hashlib.sha256(type_json.encode())

    # An array is treated as nullable only when it actually contains nulls.
    has_nulls = effective_array.null_count > 0

    # Step 2: create the per-path entries from the type tree, kept in
    # sorted path order.
    entries: dict[str, tuple] = {}
    _extract_type_entries(effective_type, has_nulls, "", entries)
    entries = dict(sorted(entries.items()))

    # Step 3: walk the array and populate those entries.
    _traverse_and_update(
        effective_type,
        has_nulls,
        effective_array,
        "",
        None,
        entries,
    )

    # Step 4: fold every entry into the digest in ascending path order
    # (paths are unique keys, so sorting keys matches sorting items).
    for path in sorted(entries):
        _finalize_digest(hasher, entries[path])

    return VERSION_BYTES + hasher.digest()
|
starfix/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: starfix
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pure-Python implementation of starfix Arrow logical hasher
|
|
5
|
+
Author: nauticalab
|
|
6
|
+
License-Expression: MIT OR Apache-2.0
|
|
7
|
+
Keywords: arrow,hashing
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# starfix-python
|
|
14
|
+
|
|
15
|
+
Pure-Python implementation of the [starfix](https://github.com/nauticalab/starfix) Arrow logical hasher.
|
|
16
|
+
|
|
17
|
+
Produces stable SHA-256 hashes of Arrow tables, record batches, and arrays that are:
|
|
18
|
+
|
|
19
|
+
- **Column-order independent** — reordering columns does not change the hash
|
|
20
|
+
- **Batch-split independent** — splitting data across batches does not change the hash
|
|
21
|
+
- **Cross-language compatible** — identical hashes to the Rust implementation
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install starfix
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import pyarrow as pa
|
|
33
|
+
from starfix import ArrowDigester
|
|
34
|
+
|
|
35
|
+
schema = pa.schema([
|
|
36
|
+
pa.field("id", pa.int32(), nullable=False),
|
|
37
|
+
pa.field("value", pa.float64(), nullable=True),
|
|
38
|
+
])
|
|
39
|
+
|
|
40
|
+
# Hash a full table
|
|
41
|
+
table = pa.table({"id": [1, 2, 3], "value": [1.1, 2.2, 3.3]}, schema=schema)
|
|
42
|
+
digest = ArrowDigester.hash_table(table)
|
|
43
|
+
|
|
44
|
+
# Streaming: feed record batches incrementally
|
|
45
|
+
digester = ArrowDigester(schema)
|
|
46
|
+
for batch in table.to_batches():
|
|
47
|
+
digester.update(batch)
|
|
48
|
+
digest = digester.finalize()
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
MIT OR Apache-2.0
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
starfix/__init__.py,sha256=UVQ-QdCuc3rRT4GVmu2qcDOooXw7Gy4xIlrgwArD3Jo,78
|
|
2
|
+
starfix/_version.py,sha256=n_5vdJsPNu7wZ57LGuRL585uvll-hiuvZUBWzdG0RQU,520
|
|
3
|
+
starfix/arrow_digester.py,sha256=5iJsRg8pshnbxgwjQbYzjSQ2qd-Y7Or_qwMGS2WGmB0,28179
|
|
4
|
+
starfix/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
starfix-0.1.0.dist-info/METADATA,sha256=2nqMhI6yz1hm5HuvqSwjPzHmhJAYpxjNU8uN4r_-7-A,1386
|
|
6
|
+
starfix-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
7
|
+
starfix-0.1.0.dist-info/RECORD,,
|