starfix 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
starfix/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from starfix.arrow_digester import ArrowDigester
2
+
3
+ __all__ = ["ArrowDigester"]
starfix/_version.py ADDED
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
+ __all__ = [
6
+ "__version__",
7
+ "__version_tuple__",
8
+ "version",
9
+ "version_tuple",
10
+ "__commit_id__",
11
+ "commit_id",
12
+ ]
13
+
14
+ version: str
15
+ __version__: str
16
+ __version_tuple__: tuple[int | str, ...]
17
+ version_tuple: tuple[int | str, ...]
18
+ commit_id: str | None
19
+ __commit_id__: str | None
20
+
21
+ __version__ = version = '0.1.0'
22
+ __version_tuple__ = version_tuple = (0, 1, 0)
23
+
24
+ __commit_id__ = commit_id = None
@@ -0,0 +1,755 @@
1
+ """Pure-Python implementation of the starfix Arrow logical hasher.
2
+
3
+ Implements the byte-layout specification defined in the starfix Rust crate
4
+ (``nauticalab/starfix docs/byte-layout-spec.md``).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import json
11
+ import struct
12
+ from collections import OrderedDict
13
+ from typing import TYPE_CHECKING
14
+
15
+ if TYPE_CHECKING:
16
+ import pyarrow as pa
17
+
18
+ VERSION_BYTES = b"\x00\x00\x01"
19
+ DELIMITER = "/"
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Bit-vector helper (LSB-first packing, matching bitvec<u8, Lsb0>)
24
+ # ---------------------------------------------------------------------------
25
+
26
class _BitVec:
    """Growable bit sequence packed into bytes, least-significant bit first.

    Bit ``n`` is stored in byte ``n // 8`` at bit position ``n % 8``; the
    unused high bits of the last byte stay zero.  The layout is intended to
    mirror Rust ``bitvec<u8, Lsb0>``.
    """

    __slots__ = ("_bytes", "_len")

    def __init__(self) -> None:
        self._len = 0
        self._bytes = bytearray()

    def push(self, bit: bool) -> None:
        """Append one bit, adding a zeroed byte when the current one is full."""
        byte_pos, bit_pos = divmod(self._len, 8)
        if byte_pos >= len(self._bytes):
            self._bytes.append(0)
        if bit:
            self._bytes[byte_pos] |= 1 << bit_pos
        self._len += 1

    def extend_true(self, count: int) -> None:
        """Append ``count`` set bits."""
        for _ in range(count):
            self.push(True)

    def __len__(self) -> int:
        """Return the number of bits pushed so far (not the byte count)."""
        return self._len

    def raw_bytes(self) -> bytes:
        """Return an immutable copy of the packed bytes."""
        return bytes(self._bytes)
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Schema / DataType serialization (spec Section 2)
60
+ # ---------------------------------------------------------------------------
61
+
62
def _data_type_to_value(dt: pa.DataType) -> object:
    """Build the JSON-compatible canonical description of an Arrow data type.

    Normalization applied while converting:

    - dictionary types are replaced by their value type;
    - ``List`` and ``LargeList`` both serialize as ``"LargeList"``;
    - struct children are emitted sorted by field name;
    - everything else is delegated to ``_primitive_data_type_string``.

    Returns either a plain string (parameterless types) or a single-key dict.
    """
    import pyarrow as pa

    types = pa.types

    # Dictionary encoding is transparent: describe the decoded value type.
    if types.is_dictionary(dt):
        return _data_type_to_value(dt.value_type)

    if types.is_struct(dt):
        ordered = sorted(
            (dt.field(i) for i in range(dt.num_fields)),
            key=lambda child: child.name,
        )
        return {"Struct": [_inner_field_to_value(child) for child in ordered]}

    if types.is_list(dt) or types.is_large_list(dt):
        return {"LargeList": _element_type_to_value(dt.value_field)}

    if types.is_fixed_size_list(dt):
        element = _element_type_to_value(dt.value_field)
        return {"FixedSizeList": [element, dt.list_size]}

    if types.is_map(dt):
        # NOTE(review): only the key field (renamed "entries") is serialized;
        # the item type and ``keys_sorted`` are not part of the output, so maps
        # differing only in value type serialize identically.  Confirm against
        # the Rust implementation.
        entries = dt.key_field.with_name("entries")
        return {"Map": [_inner_field_to_value(entries), False]}

    # Leaf types.
    return _primitive_data_type_string(dt)
90
+
91
+
92
def _primitive_data_type_string(dt: pa.DataType) -> object:
    """Describe a leaf Arrow type as a string or a single-key dict.

    Parameterless types map to a bare name (``"Int32"``); ``utf8``/``binary``
    are reported under their large names.  Parameterized types map to a
    one-entry dict such as ``{"Timestamp": ["Second", None]}``.

    Raises:
        NotImplementedError: if ``dt`` is none of the handled types.
    """
    import pyarrow as pa

    types = pa.types
    time_units = {
        "s": "Second",
        "ms": "Millisecond",
        "us": "Microsecond",
        "ns": "Nanosecond",
    }
    parameterless = {
        pa.bool_(): "Boolean",
        pa.int8(): "Int8",
        pa.uint8(): "UInt8",
        pa.int16(): "Int16",
        pa.uint16(): "UInt16",
        pa.int32(): "Int32",
        pa.uint32(): "UInt32",
        pa.int64(): "Int64",
        pa.uint64(): "UInt64",
        pa.float16(): "Float16",
        pa.float32(): "Float32",
        pa.float64(): "Float64",
        pa.date32(): "Date32",
        pa.date64(): "Date64",
        pa.utf8(): "LargeUtf8",
        pa.large_utf8(): "LargeUtf8",
        pa.binary(): "LargeBinary",
        pa.large_binary(): "LargeBinary",
    }

    plain_name = parameterless.get(dt)
    if plain_name is not None:
        return plain_name

    if types.is_decimal(dt):
        decimal_names = {
            32: "Decimal32",
            64: "Decimal64",
            128: "Decimal128",
            256: "Decimal256",
        }
        tag = decimal_names.get(dt.bit_width)
        if tag is not None:
            return {tag: [dt.precision, dt.scale]}
        # An unrecognised width falls through to the final error.

    if types.is_time32(dt):
        # Anything other than "s" is reported as milliseconds.
        return {"Time32": "Second" if dt.unit == "s" else "Millisecond"}

    if types.is_time64(dt):
        # Anything other than "us" is reported as nanoseconds.
        return {"Time64": "Microsecond" if dt.unit == "us" else "Nanosecond"}

    if types.is_timestamp(dt):
        # ``dt.tz`` is None for zone-less timestamps, which serializes as null.
        return {"Timestamp": [time_units[dt.unit], dt.tz]}

    if types.is_duration(dt):
        return {"Duration": time_units[dt.unit]}

    if types.is_fixed_size_binary(dt):
        return {"FixedSizeBinary": dt.byte_width}

    raise NotImplementedError(f"Unsupported data type: {dt}")
151
+
152
+
153
def _inner_field_to_value(field: pa.Field) -> dict:
    """Describe a named field as ``{"name", "data_type", "nullable"}``."""
    described = {"name": field.name}
    described["data_type"] = _data_type_to_value(field.type)
    described["nullable"] = field.nullable
    return described
160
+
161
+
162
def _element_type_to_value(field: pa.Field) -> dict:
    """Describe a container element as ``{"data_type", "nullable"}``.

    The element's name is deliberately left out; this form is used for the
    value field of list and fixed-size list types.
    """
    described = {"data_type": _data_type_to_value(field.type)}
    described["nullable"] = field.nullable
    return described
171
+
172
+
173
def _sort_json_value(value: object) -> object:
    """Return ``value`` with every nested dict rebuilt in sorted-key order.

    Dicts become ``OrderedDict`` instances ordered by key, lists are rebuilt
    element by element (their order is kept), and any other value is returned
    unchanged.
    """
    if isinstance(value, list):
        return [_sort_json_value(item) for item in value]
    if not isinstance(value, dict):
        return value
    return OrderedDict((key, _sort_json_value(value[key])) for key in sorted(value))
180
+
181
+
182
def _serialized_schema(schema: pa.Schema) -> str:
    """Serialize a schema to its canonical compact JSON string.

    The result is a JSON object keyed by top-level field name (sorted), each
    value holding the field's canonical ``data_type`` and ``nullable`` flag
    with nested object keys sorted as well.

    Non-ASCII characters (field names, time zones) are written as-is rather
    than as ``\\uXXXX`` escapes: serde_json emits raw UTF-8, so escaping here
    would make the hashed bytes differ from the Rust implementation.
    """
    described: dict[str, object] = {}
    for i in range(len(schema)):
        field = schema.field(i)
        described[field.name] = _sort_json_value(
            {
                "data_type": _data_type_to_value(field.type),
                "nullable": field.nullable,
            }
        )
    # Sort by field name (code-point order equals UTF-8 byte order).
    ordered = OrderedDict(sorted(described.items()))
    return json.dumps(ordered, separators=(",", ":"), ensure_ascii=False)
194
+
195
+
196
def _hash_schema(schema: pa.Schema) -> bytes:
    """Return the raw SHA-256 digest of the schema's canonical JSON string."""
    canonical_json = _serialized_schema(schema)
    hasher = hashlib.sha256()
    hasher.update(canonical_json.encode())
    return hasher.digest()
198
+
199
+
200
+ # ---------------------------------------------------------------------------
201
+ # DigestBufferType (spec Section 3: null_bits, structural, data)
202
+ #
203
+ # Each entry is a 3-tuple: (BitVec|None, sha256|None, sha256|None)
204
+ # [0] null_bits – present when nullable
205
+ # [1] structural – present for list entries
206
+ # [2] data – present for leaf and list-leaf entries
207
+ # ---------------------------------------------------------------------------
208
+
209
def _new_data_only(nullable: bool) -> tuple:
    """Create a leaf entry: optional validity bits, no structural digest, a data digest."""
    validity = _BitVec() if nullable else None
    data = hashlib.sha256()
    return (validity, None, data)
212
+
213
+
214
def _new_structural_only(nullable: bool) -> tuple:
    """Create a list entry for struct/list elements: optional validity bits and a structural digest only."""
    validity = _BitVec() if nullable else None
    structural = hashlib.sha256()
    return (validity, structural, None)
217
+
218
+
219
def _new_list_leaf(nullable: bool) -> tuple:
    """Create a list entry for leaf elements: optional validity bits plus structural and data digests."""
    validity = _BitVec() if nullable else None
    structural = hashlib.sha256()
    data = hashlib.sha256()
    return (validity, structural, data)
222
+
223
+
224
def _new_validity_only() -> tuple:
    """Create an entry that carries validity bits only (no structural or data digest)."""
    validity = _BitVec()
    return (validity, None, None)
227
+
228
+
229
+ # ---------------------------------------------------------------------------
230
+ # Type decomposition into BTreeMap entries (spec Sections 3.4, 3.5)
231
+ # ---------------------------------------------------------------------------
232
+
233
def _extract_type_entries(
    data_type: pa.DataType,
    nullable: bool,
    path: str,
    out: dict[str, tuple],
) -> None:
    """Register the digest entries that ``data_type`` needs under ``path``.

    Args:
        data_type: Arrow type to decompose (normalized internally).
        nullable: whether the position holding this type is nullable.
        path: entry key for this position; ``""`` at the root.
        out: mapping of path -> entry tuple, filled in place.

    Decomposition rules:

    - struct: no entry of its own; children are visited in name order at
      ``path/<child name>`` (just ``<child name>`` when ``path`` is empty);
    - list / large list: a validity-only entry at ``path`` when nullable, plus
      an entry at ``path + "/"`` for the element level;
    - anything else (including fixed-size list and map types, which are not
      special-cased here): a data entry at ``path``.
    """
    import pyarrow as pa

    def _is_any_list(candidate) -> bool:
        return pa.types.is_large_list(candidate) or pa.types.is_list(candidate)

    canonical = _normalize_data_type(data_type)

    if pa.types.is_struct(canonical):
        ordered = sorted(
            (canonical.field(i) for i in range(canonical.num_fields)),
            key=lambda child: child.name,
        )
        for child in ordered:
            if path:
                child_path = f"{path}{DELIMITER}{child.name}"
            else:
                child_path = child.name
            _extract_type_entries(child.type, child.nullable, child_path, out)
        return

    if not _is_any_list(canonical):
        out[path] = _new_data_only(nullable)
        return

    # List: validity of the list itself lives at ``path`` ...
    if nullable:
        out[path] = _new_validity_only()

    # ... and the element level lives one delimiter deeper.
    list_path = f"{path}{DELIMITER}"
    element = canonical.value_field
    element_canonical = _normalize_data_type(element.type)

    if pa.types.is_struct(element_canonical) or _is_any_list(element_canonical):
        # Nested container: lengths only here, the element type adds its own
        # entries through the recursion.
        out[list_path] = _new_structural_only(element.nullable)
        # NOTE(review): when the element is itself a *nullable list*, this
        # recursion runs with path == list_path and replaces the entry just
        # created with a validity-only one, so no structural digest remains at
        # list_path.  Confirm this matches the Rust implementation.
        _extract_type_entries(element.type, element.nullable, list_path, out)
    else:
        out[list_path] = _new_list_leaf(element.nullable)
283
+
284
+
285
def _extract_fields(field: pa.Field, parent: str, out: dict[str, tuple]) -> None:
    """Register the digest entries for ``field`` below ``parent`` into ``out``.

    The field's path is ``parent/<name>``, or just ``<name>`` when ``parent``
    is empty.
    """
    if parent:
        qualified = f"{parent}{DELIMITER}{field.name}"
    else:
        qualified = field.name
    _extract_type_entries(field.type, field.nullable, qualified, out)
289
+
290
+
291
+ # ---------------------------------------------------------------------------
292
+ # Type normalization (spec Section 2.1 — type canonicalization)
293
+ # ---------------------------------------------------------------------------
294
+
295
def _normalize_data_type(dt: pa.DataType) -> pa.DataType:
    """Return the canonical form of ``dt``, normalizing nested types recursively.

    - dictionary -> its (normalized) value type
    - utf8 -> large_utf8, binary -> large_binary
    - list / large_list -> large_list of the normalized element field
    - struct -> struct of normalized fields (order kept)
    - fixed-size list -> fixed-size list of the normalized element field
    - map -> map of normalized key/item types, keeping ``keys_sorted``
    - anything else is returned unchanged
    """
    import pyarrow as pa

    types = pa.types

    if types.is_dictionary(dt):
        return _normalize_data_type(dt.value_type)

    if dt == pa.utf8():
        return pa.large_utf8()

    if dt == pa.binary():
        return pa.large_binary()

    if types.is_list(dt) or types.is_large_list(dt):
        return pa.large_list(_normalize_field(dt.value_field))

    if types.is_struct(dt):
        members = [_normalize_field(dt.field(i)) for i in range(dt.num_fields)]
        return pa.struct(members)

    if types.is_fixed_size_list(dt):
        # ``pa.list_`` with an explicit size yields a fixed-size list type.
        return pa.list_(_normalize_field(dt.value_field), dt.list_size)

    if types.is_map(dt):
        key_type = _normalize_field(dt.key_field).type
        item_type = _normalize_field(dt.item_field).type
        return pa.map_(key_type, item_type, keys_sorted=dt.keys_sorted)

    return dt
319
+
320
+
321
def _normalize_field(field: pa.Field) -> pa.Field:
    """Return a field with the same name and nullability but a normalized type."""
    import pyarrow as pa

    canonical_type = _normalize_data_type(field.type)
    return pa.field(field.name, canonical_type, field.nullable)
325
+
326
+
327
+ # ---------------------------------------------------------------------------
328
+ # Array normalization helper
329
+ # ---------------------------------------------------------------------------
330
+
331
def _normalize_array(data_type, array):
    """Cast ``array`` to the large/decoded counterpart of ``data_type``.

    Returns:
        ``(effective_data_type, effective_array)``.  Strings, binaries and
        lists are cast to their large variants; dictionary arrays are cast to
        their value type and then normalized again.  Other types are returned
        untouched.
    """
    import pyarrow as pa

    types = pa.types

    if types.is_dictionary(data_type):
        decoded_type = data_type.value_type
        return _normalize_array(decoded_type, array.cast(decoded_type))

    target = None
    if types.is_string(data_type) and not types.is_large_string(data_type):
        target = pa.large_utf8()
    elif types.is_binary(data_type) and not types.is_large_binary(data_type):
        target = pa.large_binary()
    elif types.is_list(data_type) and not types.is_large_list(data_type):
        # Only the list level is widened here; the element field is reused.
        target = pa.large_list(data_type.value_field)

    if target is None:
        return data_type, array
    return target, array.cast(target)
349
+
350
+
351
+ # ---------------------------------------------------------------------------
352
+ # Recursive traversal — populates BTreeMap entries from array data
353
+ # (spec Sections 3.1–3.5)
354
+ # ---------------------------------------------------------------------------
355
+
356
def _combine_null_masks(own_valid, ancestor_valid):
    """AND two validity lists element-wise, treating ``None`` as "all valid".

    Returns the other mask unchanged when one side is ``None`` (so ``None``
    when both are), otherwise a new list truncated to the shorter input.
    """
    if own_valid is None:
        return ancestor_valid
    if ancestor_valid is None:
        return own_valid
    return [mine and inherited for mine, inherited in zip(own_valid, ancestor_valid)]
365
+
366
+
367
def _get_validity_list(array):
    """Return per-element validity as a list of bools, or ``None`` if no nulls.

    Args:
        array: an Arrow array exposing ``null_count`` and ``is_valid()``.

    Returns:
        ``None`` when ``array.null_count`` is zero; otherwise a list with one
        bool per element (``True`` = valid).
    """
    if array.null_count == 0:
        return None
    # One vectorized call instead of boxing every element into a scalar.
    return array.is_valid().to_pylist()
372
+
373
+
374
def _traverse_and_update(data_type, nullable, array, path, ancestor_nulls, fields):
    """Route ``array`` to the list, struct or leaf handler for its type.

    Parameters:
        data_type: Arrow data type of ``array``.
        nullable: whether this position is nullable.
        array: the Arrow array to hash.
        path: entry key for this position.
        ancestor_nulls: validity list inherited from enclosing structs, or None.
        fields: mapping of path -> entry tuple to feed.

    The array is first normalized (large variants, dictionaries decoded).
    Types that normalize to neither a large list nor a struct — including
    fixed-size list and map types — are handed to the leaf handler.
    """
    import pyarrow as pa

    effective_type, effective_array = _normalize_array(data_type, array)
    canonical = _normalize_data_type(effective_type)

    if pa.types.is_struct(canonical):
        _traverse_struct(effective_array, nullable, path, ancestor_nulls, fields)
        return

    if pa.types.is_large_list(canonical):
        _traverse_list(
            effective_array,
            canonical.value_field,
            nullable,
            path,
            ancestor_nulls,
            fields,
        )
        return

    _traverse_leaf(effective_type, effective_array, path, ancestor_nulls, fields)
396
+
397
+
398
def _traverse_list(array, value_field, nullable, path, ancestor_nulls, fields):
    """Feed a list array into its validity, structural and element entries.

    Parameters:
        array: list array (offsets + values) to traverse.
        value_field: field describing the list's elements.
        nullable: whether the list position itself is nullable.
        path: entry key of the list; its element level is ``path + "/"``.
        ancestor_nulls: validity list inherited from enclosing structs, or None.
        fields: mapping of path -> entry tuple to feed.

    For every row that is valid (own validity AND ancestor validity) the row's
    element count is written to the structural digest at the element path as a
    little-endian u64, and the row's slice of values is traversed recursively.
    Rows that are null are skipped entirely.
    """
    # Effective validity is needed both for the validity bits and for skipping
    # null rows; compute it once.
    effective_nulls = _combine_null_masks(_get_validity_list(array), ancestor_nulls)

    # Record list-level validity at ``path`` when an entry with bits exists.
    if nullable:
        entry = fields.get(path)
        null_bits = entry[0] if entry is not None else None
        if null_bits is not None:
            if effective_nulls is None:
                null_bits.extend_true(len(array))
            else:
                for valid in effective_nulls:
                    null_bits.push(valid)

    list_path = f"{path}{DELIMITER}"

    # ``fields`` is only read during traversal, so the lookup can be hoisted.
    list_entry = fields.get(list_path)
    structural = list_entry[1] if list_entry is not None else None

    offsets = array.offsets.to_pylist()
    values = array.values
    for row in range(len(array)):
        if effective_nulls is not None and not effective_nulls[row]:
            continue

        start = offsets[row]
        sub_array = values.slice(start, offsets[row + 1] - start)

        if structural is not None:
            structural.update(struct.pack("<Q", len(sub_array)))

        _traverse_and_update(
            value_field.type,
            value_field.nullable,
            sub_array,
            list_path,
            None,  # list elements don't inherit ancestor struct nulls
            fields,
        )
446
+
447
+
448
def _traverse_struct(array, nullable, path, ancestor_nulls, fields):
    """Traverse a struct array by visiting its children in name order.

    Parameters:
        array: struct array to traverse.
        nullable: whether the struct position is nullable.
        path: entry key prefix for the struct; children use
            ``path/<child name>`` (just ``<child name>`` when ``path`` is empty).
        ancestor_nulls: validity list inherited from enclosing structs, or None.
        fields: mapping of path -> entry tuple to feed.

    The struct has no entry of its own.  When ``nullable`` is true its own
    validity is AND-combined with ``ancestor_nulls`` and handed down to every
    child; otherwise ``ancestor_nulls`` is passed through unchanged.
    """
    if nullable:
        inherited = _combine_null_masks(_get_validity_list(array), ancestor_nulls)
    else:
        inherited = ancestor_nulls

    struct_type = array.type
    # Keep the original index so the matching child array can be fetched
    # after sorting by name.
    children = sorted(
        ((i, struct_type.field(i)) for i in range(struct_type.num_fields)),
        key=lambda pair: pair[1].name,
    )

    for idx, child_field in children:
        if path:
            child_path = f"{path}{DELIMITER}{child_field.name}"
        else:
            child_path = child_field.name
        _traverse_and_update(
            child_field.type,
            child_field.nullable,
            array.field(idx),
            child_path,
            inherited,
            fields,
        )
477
+
478
+
479
def _traverse_leaf(data_type, array, path, ancestor_nulls, fields):
    """Feed a leaf array's validity bits and data bytes into its entry.

    Does nothing when ``fields`` has no entry at ``path`` or the entry has no
    data digest.  Otherwise the effective validity (own AND ancestor) is
    appended to the entry's validity bits, if it has any, and the valid
    elements are hashed into the data digest.
    """
    entry = fields.get(path)
    if entry is None:
        return

    null_bits, _unused_structural, data_digest = entry
    if data_digest is None:
        return

    effective = _combine_null_masks(_get_validity_list(array), ancestor_nulls)

    if null_bits is not None:
        if effective is None:
            # No nulls anywhere: every row is valid.
            null_bits.extend_true(len(array))
        else:
            for valid in effective:
                null_bits.push(valid)

    _hash_leaf_data(data_type, array, data_digest, effective)
503
+
504
+
505
def _hash_leaf_data(data_type, array, data_digest, effective_nulls):
    """Dispatch leaf hashing on ``data_type``.

    Booleans, large binaries and large strings have dedicated routines; every
    other type must have a fixed element width.

    Raises:
        NotImplementedError: if the type has no known fixed width.
    """
    import pyarrow as pa

    if pa.types.is_boolean(data_type):
        _hash_boolean_data(array, data_digest, effective_nulls)
        return
    if pa.types.is_large_binary(data_type):
        _hash_binary_data(array, data_digest, effective_nulls)
        return
    if pa.types.is_large_string(data_type):
        _hash_string_data(array, data_digest, effective_nulls)
        return

    width = _element_size_for_type(data_type)
    if width is None:
        raise NotImplementedError(f"Unsupported leaf type: {data_type}")
    _hash_fixed_size_data(array, data_digest, width, effective_nulls)
521
+
522
+
523
+ # ---------------------------------------------------------------------------
524
+ # Leaf data hashing (spec Sections 3.1–3.3)
525
+ # ---------------------------------------------------------------------------
526
+
527
def _element_size_for_type(dt: pa.DataType) -> int | None:
    """Return the per-element byte width of ``dt``, or ``None`` if not known here.

    Covers integers, floats, dates, time32/time64, decimals (``bit_width // 8``)
    and fixed-size binary (``byte_width``).
    """
    import pyarrow as pa

    # NOTE(review): Timestamp and Duration are serialized by
    # ``_primitive_data_type_string`` but have no width here, so hashing such
    # a column ends in NotImplementedError in ``_hash_leaf_data``.  Confirm
    # whether that is intended.
    widths = {
        pa.int8(): 1,
        pa.uint8(): 1,
        pa.int16(): 2,
        pa.uint16(): 2,
        pa.float16(): 2,
        pa.int32(): 4,
        pa.uint32(): 4,
        pa.float32(): 4,
        pa.date32(): 4,
        pa.int64(): 8,
        pa.uint64(): 8,
        pa.float64(): 8,
        pa.date64(): 8,
    }
    known = widths.get(dt)
    if known is not None:
        return known

    types = pa.types
    if types.is_time32(dt):
        return 4
    if types.is_time64(dt):
        return 8
    if types.is_decimal(dt):
        return dt.bit_width // 8
    if types.is_fixed_size_binary(dt):
        return dt.byte_width
    return None
548
+
549
+
550
def _hash_fixed_size_data(arr, data_digest, element_size: int, effective_nulls) -> None:
    """Hash the raw bytes of a fixed-width array's valid elements.

    Args:
        arr: array exposing ``buffers()`` (data buffer at index 1), ``offset``
            (in elements) and ``len()``.
        data_digest: hashlib-style object receiving the bytes.
        element_size: width of one element in bytes.
        effective_nulls: list of bools (True = valid) or ``None`` for all valid.

    When every element is valid the contiguous byte range is fed in one call;
    otherwise only the valid elements are fed, one at a time.  An empty array
    feeds nothing.
    """
    count = len(arr)
    if count == 0:
        # Nothing to hash; also avoids touching a data buffer that may be absent.
        return

    # Zero-copy view of the data buffer.  This routine is called once per list
    # row on slices sharing one buffer, so copying it each time would be
    # quadratic in the buffer size.
    view = memoryview(arr.buffers()[1]).cast("B")
    base = arr.offset * element_size

    if effective_nulls is None or all(effective_nulls):
        data_digest.update(view[base:base + count * element_size])
        return

    for i in range(count):
        if effective_nulls[i]:
            pos = base + i * element_size
            data_digest.update(view[pos:pos + element_size])
574
+
575
+
576
def _hash_boolean_data(arr, data_digest, effective_nulls) -> None:
    """Hash a boolean array's valid values as LSB-first packed bits.

    Valid values are packed into a fresh bit vector (null positions are
    skipped, not stored as zero) and the packed bytes are fed to the digest.
    """
    # NOTE(review): packing starts from scratch on every call, so the final
    # byte is zero-padded per call and the bytes fed depend on how rows are
    # grouped into calls.  Confirm this matches the byte-layout spec.
    packed = _BitVec()
    for idx in range(len(arr)):
        if effective_nulls is None or effective_nulls[idx]:
            packed.push(arr[idx].as_py())
    data_digest.update(packed.raw_bytes())
587
+
588
+
589
def _hash_binary_data(arr, data_digest, effective_nulls) -> None:
    """Hash a binary array's valid values, each prefixed by its length.

    For every valid element the digest receives the byte length as a
    little-endian u64 followed by the bytes themselves; null positions are
    skipped.
    """
    for idx in range(len(arr)):
        if effective_nulls is not None and not effective_nulls[idx]:
            continue
        payload = arr[idx].as_py()
        data_digest.update(struct.pack("<Q", len(payload)))
        data_digest.update(payload)
602
+
603
+
604
def _hash_string_data(arr, data_digest, effective_nulls) -> None:
    """Hash a string array's valid values, each prefixed by its UTF-8 length.

    For every valid element the digest receives the encoded byte length as a
    little-endian u64 followed by the UTF-8 bytes; null positions are skipped.
    """
    for idx in range(len(arr)):
        if effective_nulls is not None and not effective_nulls[idx]:
            continue
        encoded = arr[idx].as_py().encode("utf-8")
        data_digest.update(struct.pack("<Q", len(encoded)))
        data_digest.update(encoded)
617
+
618
+
619
+ # ---------------------------------------------------------------------------
620
+ # Finalization (spec Section 4)
621
+ # ---------------------------------------------------------------------------
622
+
623
def _finalize_digest(final_digest, entry: tuple) -> None:
    """Fold one entry into the combining digest.

    Args:
        final_digest: hashlib-style object accumulating the overall hash.
        entry: ``(null_bits, structural, data)``; any component may be ``None``.

    Components are fed in this order, skipping absent ones:

    1. null bits: bit count as little-endian u64, then the packed bytes;
    2. structural: the structural digest's current value;
    3. data: the data digest's current value.
    """
    null_bits, structural, data = entry

    if null_bits is not None:
        final_digest.update(struct.pack("<Q", len(null_bits)))
        # Hash updates are streaming, so one call with all the bytes yields
        # the same digest as feeding them one at a time.
        final_digest.update(null_bits.raw_bytes())

    if structural is not None:
        final_digest.update(structural.digest())

    if data is not None:
        final_digest.update(data.digest())
640
+
641
+
642
+ # ---------------------------------------------------------------------------
643
+ # Public API
644
+ # ---------------------------------------------------------------------------
645
+
646
class ArrowDigester:
    """Incremental logical hasher for Arrow data.

    Create it with a schema, feed record batches with ``update`` and obtain
    the result with ``finalize``.  Every hash returned by this class is the
    3-byte ``VERSION_BYTES`` prefix followed by a 32-byte SHA-256 digest.
    Intended to produce the same bytes as the Rust ``ArrowDigester``.
    """

    def __init__(self, schema: pa.Schema) -> None:
        """Hash ``schema`` and prepare one digest entry per decomposed path."""
        self._schema = schema
        self._schema_digest = _hash_schema(schema)
        entries: dict[str, tuple] = {}
        for i in range(len(schema)):
            _extract_fields(schema.field(i), "", entries)
        # Keep entries in sorted-key order (dicts preserve insertion order).
        self._fields: dict[str, tuple] = dict(sorted(entries.items()))

    def update(self, record_batch: pa.RecordBatch) -> None:
        """Feed a RecordBatch into the running digest.

        Columns are matched to entries by field name, using the batch's own
        schema for types and nullability.  Data arriving at a path that has
        no entry is ignored by the traversal.
        """
        schema = record_batch.schema
        for i in range(len(schema)):
            field = schema.field(i)
            _traverse_and_update(
                field.type,
                field.nullable,
                record_batch.column(i),
                field.name,
                None,  # no ancestor struct nulls at top level
                self._fields,
            )

    def finalize(self) -> bytes:
        """Return the versioned hash of everything fed so far.

        Combines the schema digest with every entry in sorted path order.
        The digester's state is only read, not reset or invalidated.
        """
        final_digest = hashlib.sha256()
        final_digest.update(self._schema_digest)
        for _path, entry in sorted(self._fields.items()):
            _finalize_digest(final_digest, entry)
        return VERSION_BYTES + final_digest.digest()

    # -- Convenience class methods ------------------------------------------

    @staticmethod
    def hash_schema(schema: pa.Schema) -> bytes:
        """Return the versioned hash of a schema alone."""
        return VERSION_BYTES + _hash_schema(schema)

    @staticmethod
    def hash_record_batch(record_batch: pa.RecordBatch) -> bytes:
        """Return the versioned hash of a single record batch."""
        d = ArrowDigester(record_batch.schema)
        d.update(record_batch)
        return d.finalize()

    @staticmethod
    def hash_table(table: pa.Table) -> bytes:
        """Return the versioned hash of a table, fed batch by batch."""
        d = ArrowDigester(table.schema)
        for batch in table.to_batches():
            d.update(batch)
        return d.finalize()

    @staticmethod
    def hash_array(array: pa.Array) -> bytes:
        """Return the versioned hash of a single array.

        The digest covers the canonical JSON of the array's normalized type,
        followed by the same per-path entries the record-batch path uses
        (rooted at the empty path).  The root is treated as nullable only
        when the array actually contains nulls.
        """
        import pyarrow as pa

        # Resolve dictionary arrays to their plain value type.
        effective_type = array.type
        effective_array = array
        if pa.types.is_dictionary(effective_type):
            effective_type = effective_type.value_type
            effective_array = array.cast(effective_type)

        normalized_type = _normalize_data_type(effective_type)

        # Step 1: type metadata as compact canonical JSON.  Non-ASCII text
        # (struct field names, time zones) is written as raw UTF-8, as
        # serde_json does, rather than as \uXXXX escapes.
        dt_value = _sort_json_value(_data_type_to_value(normalized_type))
        dt_json = json.dumps(dt_value, separators=(",", ":"), ensure_ascii=False)

        final_digest = hashlib.sha256()
        final_digest.update(dt_json.encode())

        # Arrays with null_count > 0 are treated as nullable.
        nullable = effective_array.null_count > 0

        # Step 2: build the entries from the type tree.
        fields: dict[str, tuple] = {}
        _extract_type_entries(effective_type, nullable, "", fields)
        fields = dict(sorted(fields.items()))

        # Step 3: traverse the data and feed the entries.
        _traverse_and_update(
            effective_type,
            nullable,
            effective_array,
            "",
            None,
            fields,
        )

        # Step 4: fold all entries into the digest in sorted path order.
        for _path, entry in sorted(fields.items()):
            _finalize_digest(final_digest, entry)

        return VERSION_BYTES + final_digest.digest()
starfix/py.typed ADDED
File without changes
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.4
2
+ Name: starfix
3
+ Version: 0.1.0
4
+ Summary: Pure-Python implementation of starfix Arrow logical hasher
5
+ Author: nauticalab
6
+ License-Expression: MIT OR Apache-2.0
7
+ Keywords: arrow,hashing
8
+ Classifier: Programming Language :: Python :: 3
9
+ Requires-Python: >=3.10
10
+ Requires-Dist: pyarrow>=14.0.0
11
+ Description-Content-Type: text/markdown
12
+
13
+ # starfix-python
14
+
15
+ Pure-Python implementation of the [starfix](https://github.com/nauticalab/starfix) Arrow logical hasher.
16
+
17
+ Produces stable SHA-256 hashes of Arrow tables, record batches, and arrays that are:
18
+
19
+ - **Column-order independent** — reordering columns does not change the hash
20
+ - **Batch-split independent** — splitting data across batches does not change the hash
21
+ - **Cross-language compatible** — identical hashes to the Rust implementation
22
+
23
+ ## Installation
24
+
25
+ ```bash
26
+ pip install starfix
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ```python
32
+ import pyarrow as pa
33
+ from starfix import ArrowDigester
34
+
35
+ schema = pa.schema([
36
+ pa.field("id", pa.int32(), nullable=False),
37
+ pa.field("value", pa.float64(), nullable=True),
38
+ ])
39
+
40
+ # Hash a full table
41
+ table = pa.table({"id": [1, 2, 3], "value": [1.1, 2.2, 3.3]}, schema=schema)
42
+ digest = ArrowDigester.hash_table(table)
43
+
44
+ # Streaming: feed record batches incrementally
45
+ digester = ArrowDigester(schema)
46
+ for batch in batches:
47
+ digester.update(batch)
48
+ digest = digester.finalize()
49
+ ```
50
+
51
+ ## License
52
+
53
+ MIT OR Apache-2.0
@@ -0,0 +1,7 @@
1
+ starfix/__init__.py,sha256=UVQ-QdCuc3rRT4GVmu2qcDOooXw7Gy4xIlrgwArD3Jo,78
2
+ starfix/_version.py,sha256=n_5vdJsPNu7wZ57LGuRL585uvll-hiuvZUBWzdG0RQU,520
3
+ starfix/arrow_digester.py,sha256=5iJsRg8pshnbxgwjQbYzjSQ2qd-Y7Or_qwMGS2WGmB0,28179
4
+ starfix/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ starfix-0.1.0.dist-info/METADATA,sha256=2nqMhI6yz1hm5HuvqSwjPzHmhJAYpxjNU8uN4r_-7-A,1386
6
+ starfix-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
7
+ starfix-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any