integer-atlas-algos 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integer_atlas_algos/__init__.py +3 -0
- integer_atlas_algos/_lib/__init__.py +0 -0
- integer_atlas_algos/_lib/blake3_py.py +172 -0
- integer_atlas_algos/_lib/factorization.py +87 -0
- integer_atlas_algos/_lib/multiplicative.py +9 -0
- integer_atlas_algos/context.py +28 -0
- integer_atlas_algos/executor/__init__.py +0 -0
- integer_atlas_algos/executor/__main__.py +6 -0
- integer_atlas_algos/executor/atomicio.py +91 -0
- integer_atlas_algos/executor/backends/__init__.py +24 -0
- integer_atlas_algos/executor/backends/csv_backend.py +67 -0
- integer_atlas_algos/executor/backends/parquet_backend.py +87 -0
- integer_atlas_algos/executor/cli.py +147 -0
- integer_atlas_algos/executor/compute.py +251 -0
- integer_atlas_algos/executor/estimate.py +58 -0
- integer_atlas_algos/executor/manifest.py +87 -0
- integer_atlas_algos/executor/verify.py +77 -0
- integer_atlas_algos/precomputed/primes_le_31623.txt +3401 -0
- integer_atlas_algos/properties/__init__.py +12 -0
- integer_atlas_algos/properties/abs_n.py +7 -0
- integer_atlas_algos/properties/abundance_class.py +13 -0
- integer_atlas_algos/properties/abundancy_index.py +12 -0
- integer_atlas_algos/properties/aliquot_sum.py +12 -0
- integer_atlas_algos/properties/binary_popcount.py +7 -0
- integer_atlas_algos/properties/binary_repr.py +7 -0
- integer_atlas_algos/properties/bit_length.py +7 -0
- integer_atlas_algos/properties/carmichael_lambda.py +19 -0
- integer_atlas_algos/properties/collatz_stopping_time.py +15 -0
- integer_atlas_algos/properties/decimal_digit_count.py +7 -0
- integer_atlas_algos/properties/digit_sum.py +7 -0
- integer_atlas_algos/properties/digital_root.py +8 -0
- integer_atlas_algos/properties/divisor_count.py +13 -0
- integer_atlas_algos/properties/divisor_sum.py +11 -0
- integer_atlas_algos/properties/dyadic_valuation.py +11 -0
- integer_atlas_algos/properties/euler_phi.py +14 -0
- integer_atlas_algos/properties/gcd_sum_pillai.py +17 -0
- integer_atlas_algos/properties/hex_repr.py +7 -0
- integer_atlas_algos/properties/integer_sqrt.py +9 -0
- integer_atlas_algos/properties/is_even.py +7 -0
- integer_atlas_algos/properties/is_fibonacci.py +15 -0
- integer_atlas_algos/properties/is_happy.py +14 -0
- integer_atlas_algos/properties/is_harshad.py +10 -0
- integer_atlas_algos/properties/is_odd.py +7 -0
- integer_atlas_algos/properties/is_palindrome.py +8 -0
- integer_atlas_algos/properties/is_perfect.py +12 -0
- integer_atlas_algos/properties/is_perfect_power.py +17 -0
- integer_atlas_algos/properties/is_powerful.py +10 -0
- integer_atlas_algos/properties/is_practical.py +18 -0
- integer_atlas_algos/properties/is_prime.py +9 -0
- integer_atlas_algos/properties/is_prime_power.py +8 -0
- integer_atlas_algos/properties/is_square.py +10 -0
- integer_atlas_algos/properties/is_squarefree.py +10 -0
- integer_atlas_algos/properties/is_triangular.py +11 -0
- integer_atlas_algos/properties/largest_prime_factor.py +11 -0
- integer_atlas_algos/properties/liouville_lambda.py +8 -0
- integer_atlas_algos/properties/mobius.py +13 -0
- integer_atlas_algos/properties/octal_repr.py +7 -0
- integer_atlas_algos/properties/omega_big.py +8 -0
- integer_atlas_algos/properties/omega_distinct.py +8 -0
- integer_atlas_algos/properties/partition_count.py +33 -0
- integer_atlas_algos/properties/radical.py +12 -0
- integer_atlas_algos/properties/sign.py +7 -0
- integer_atlas_algos/properties/smallest_prime_factor.py +11 -0
- integer_atlas_algos/properties/sum_of_two_squares_count.py +18 -0
- integer_atlas_algos/properties/von_mangoldt.py +14 -0
- integer_atlas_algos/registry.py +53 -0
- integer_atlas_algos-0.1.0.dist-info/METADATA +117 -0
- integer_atlas_algos-0.1.0.dist-info/RECORD +70 -0
- integer_atlas_algos-0.1.0.dist-info/WHEEL +4 -0
- integer_atlas_algos-0.1.0.dist-info/entry_points.txt +2 -0
|
File without changes
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Pure-Python BLAKE3 (hash mode), following the official reference design.
|
|
2
|
+
|
|
3
|
+
This is the dependency-free fallback used when the fast `blake3` package is not
|
|
4
|
+
installed, so a shard's blake3 hash is always populated. It is correct but slow;
|
|
5
|
+
the executor prefers the native library when available. Validated against the
|
|
6
|
+
official BLAKE3 test vectors (see tests/test_blake3.py).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
# Initial chaining value: eight 32-bit words. The first four also seed state
# words 8..11 inside the compression function.
IV = [
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
]

# Reordering applied to the sixteen message words between compression rounds.
MSG_PERMUTATION = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]

# Flag bits OR-ed together and fed to the compression function.
CHUNK_START = 0x01
CHUNK_END = 0x02
PARENT = 0x04
ROOT = 0x08

BLOCK_LEN = 64  # bytes per compression block
CHUNK_LEN = 1024  # bytes per chunk
MASK = 0xFFFFFFFF  # truncates arithmetic to 32 bits
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _rotr(x, n):
    """Rotate the 32-bit word ``x`` right by ``n`` bit positions."""
    shifted_down = x >> n
    wrapped_around = x << (32 - n)
    return (shifted_down | wrapped_around) & MASK
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _g(s, a, b, c, d, mx, my):
    """Mix four state words in place, folding in message words ``mx`` and ``my``.

    ``a``, ``b``, ``c`` and ``d`` are distinct indices into the state list
    ``s``. The mixing alternates 32-bit additions with XOR-and-rotate steps
    using rotation amounts 16, 12, 8 and 7.
    """
    va, vb, vc, vd = s[a], s[b], s[c], s[d]

    # First half: fold in mx.
    va = (va + vb + mx) & MASK
    vd = _rotr(vd ^ va, 16)
    vc = (vc + vd) & MASK
    vb = _rotr(vb ^ vc, 12)

    # Second half: fold in my.
    va = (va + vb + my) & MASK
    vd = _rotr(vd ^ va, 8)
    vc = (vc + vd) & MASK
    vb = _rotr(vb ^ vc, 7)

    s[a], s[b], s[c], s[d] = va, vb, vc, vd
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _round(s, m):
    """Run one round over state ``s`` using the sixteen message words ``m``.

    Viewing the state as a 4x4 grid, the first four mixes work on its columns
    and the last four on its diagonals; mix ``k`` consumes ``m[2k]`` and
    ``m[2k + 1]``.
    """
    lanes = (
        (0, 4, 8, 12), (1, 5, 9, 13), (2, 6, 10, 14), (3, 7, 11, 15),
        (0, 5, 10, 15), (1, 6, 11, 12), (2, 7, 8, 13), (3, 4, 9, 14),
    )
    for k, (a, b, c, d) in enumerate(lanes):
        _g(s, a, b, c, d, m[2 * k], m[2 * k + 1])
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _compress(cv, block_words, counter, block_len, flags):
    """Compress one block and return the full 16-word output state.

    The state starts as the eight ``cv`` words, the first four ``IV`` words,
    the 64-bit ``counter`` split into low and high halves, ``block_len`` and
    ``flags``. Seven rounds are applied, permuting the message words between
    rounds, followed by the final feed-forward XORs.
    """
    state = [cv[i] for i in range(8)]
    state += [IV[0], IV[1], IV[2], IV[3]]
    state += [counter & MASK, (counter >> 32) & MASK, block_len, flags]

    msg = list(block_words)
    last_round = 6
    for round_no in range(last_round + 1):
        _round(state, msg)
        if round_no != last_round:
            # No permutation is needed after the final round.
            msg = [msg[src] for src in MSG_PERMUTATION]

    for lo in range(8):
        hi = lo + 8
        state[lo] ^= state[hi]
        state[hi] ^= cv[lo]
    return state
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _words(block64):
|
|
68
|
+
return [int.from_bytes(block64[i:i + 4], "little") for i in range(0, 64, 4)]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class _Output:
    """The inputs of a pending compression, kept so it can be finished two ways.

    The same inputs yield either an 8-word chaining value or, with the ROOT
    flag added, an arbitrary number of final output bytes.
    """

    __slots__ = ("cv", "block_words", "counter", "block_len", "flags")

    def __init__(self, cv, block_words, counter, block_len, flags):
        self.cv, self.block_words = cv, block_words
        self.counter, self.block_len, self.flags = counter, block_len, flags

    def chaining_value(self):
        """Return the first eight words of the compression output."""
        full_state = _compress(
            self.cv, self.block_words, self.counter, self.block_len, self.flags
        )
        return full_state[:8]

    def root_bytes(self, length):
        """Return ``length`` output bytes (empty when ``length`` is not positive).

        Each compression uses the ROOT flag and a counter that starts at 0 and
        increases by one per 64-byte output block.
        """
        pieces = []
        remaining = length
        block_index = 0
        while remaining > 0:
            words = _compress(
                self.cv, self.block_words, block_index, self.block_len, self.flags | ROOT
            )
            block = b"".join(w.to_bytes(4, "little") for w in words)
            piece = block[:remaining]
            pieces.append(piece)
            remaining -= len(piece)
            block_index += 1
        return b"".join(pieces)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class _ChunkState:
    """Incremental hashing state for a single chunk.

    Input is buffered one 64-byte block at a time. A full block is compressed
    only when more input arrives, so the last block is always still buffered
    when ``output()`` is called.
    """

    def __init__(self, counter):
        self.cv = list(IV)
        self.counter = counter
        self.block = bytearray(BLOCK_LEN)
        self.block_len = 0
        self.blocks_compressed = 0

    def length(self):
        """Return the number of input bytes absorbed so far."""
        return self.blocks_compressed * BLOCK_LEN + self.block_len

    def _start_flag(self):
        """Return CHUNK_START while no block has been compressed yet, else 0."""
        if self.blocks_compressed == 0:
            return CHUNK_START
        return 0

    def update(self, mv):
        """Absorb the bytes of the buffer ``mv`` into this chunk."""
        offset = 0
        total = len(mv)
        while offset < total:
            if self.block_len == BLOCK_LEN:
                # Buffer is full and more input follows: compress it now.
                compressed = _compress(
                    self.cv, _words(self.block), self.counter, BLOCK_LEN, self._start_flag()
                )
                self.cv = compressed[:8]
                self.blocks_compressed += 1
                self.block_len = 0
            take = min(BLOCK_LEN - self.block_len, total - offset)
            end = self.block_len + take
            self.block[self.block_len:end] = mv[offset:offset + take]
            self.block_len = end
            offset += take

    def output(self):
        """Return the pending compression for the zero-padded final block."""
        used = self.block_len
        padded = bytes(self.block[:used]) + b"\x00" * (BLOCK_LEN - used)
        end_flags = self._start_flag() | CHUNK_END
        return _Output(self.cv, _words(padded), self.counter, used, end_flags)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _parent_output(left_cv, right_cv):
    """Return the pending compression for a parent of two child chaining values.

    The two 8-word child values are concatenated into the 16-word message
    block; the node uses a fresh copy of IV, counter 0 and the PARENT flag.
    """
    children = left_cv + right_cv
    return _Output(list(IV), children, 0, BLOCK_LEN, PARENT)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class Blake3:
    """Incremental BLAKE3 hasher (plain hash mode).

    Feed data with ``update()`` (calls may be chained) and read the result with
    ``digest()`` or ``hexdigest()``. Reading the digest does not modify the
    hasher, so more data may be appended afterwards.
    """

    def __init__(self):
        self._chunk = _ChunkState(0)
        # Chaining values of completed subtrees, oldest (largest) first.
        self._stack = []

    def _add_chunk_cv(self, new_cv, total_chunks):
        """Push a finished chunk's chaining value, merging completed subtrees.

        Each trailing zero bit of ``total_chunks`` means a subtree has just
        been completed, so the top of the stack is merged with ``new_cv``.
        """
        while total_chunks & 1 == 0:
            new_cv = _parent_output(self._stack.pop(), new_cv).chaining_value()
            total_chunks >>= 1
        self._stack.append(new_cv)

    def update(self, data):
        """Absorb ``data`` (any bytes-like object) and return ``self``.

        Buffers whose items are wider than one byte, or that have more than
        one dimension, are reinterpreted as their raw bytes. Without this the
        length would be counted in items rather than bytes and only part of
        the buffer would be hashed.
        """
        mv = memoryview(data)
        if mv.ndim != 1 or mv.itemsize != 1:
            mv = mv.cast("B")
        pos, n = 0, len(mv)
        while pos < n:
            if self._chunk.length() == CHUNK_LEN:
                # The current chunk is full and more input follows: finish it.
                cv = self._chunk.output().chaining_value()
                total = self._chunk.counter + 1
                self._add_chunk_cv(cv, total)
                self._chunk = _ChunkState(total)
            take = min(CHUNK_LEN - self._chunk.length(), n - pos)
            self._chunk.update(mv[pos:pos + take])
            pos += take
        return self

    def digest(self, length=32):
        """Return ``length`` bytes of hash output for the data absorbed so far."""
        output = self._chunk.output()
        # Fold the current chunk into the pending subtrees, newest first.
        for cv in reversed(self._stack):
            output = _parent_output(cv, output.chaining_value())
        return output.root_bytes(length)

    def hexdigest(self, length=32):
        """Return ``digest(length)`` as a hexadecimal string."""
        return self.digest(length).hex()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def blake3_hexdigest(data, length=32):
    """Hash ``data`` in one call and return ``length`` digest bytes as hex."""
    hasher = Blake3()
    hasher.update(data)
    return hasher.hexdigest(length)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Factorization backed by a precomputed base-primes table.
|
|
2
|
+
|
|
3
|
+
To factor any n up to BOUND**2 you only need primes up to BOUND. With
|
|
4
|
+
BOUND = 31623 (>= sqrt(1e9)) this covers the whole 0..1e9 range using ~3401
|
|
5
|
+
primes. The table is a deterministic, regenerable resource cached in
|
|
6
|
+
integer_atlas_algos/precomputed/ — not state about any shard or pack. For n beyond BOUND**2
|
|
7
|
+
factorization stays correct by continuing trial division past the table.
|
|
8
|
+
"""
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
# Directory holding the cached prime tables: "precomputed" inside the directory
# one level above the one containing this module.
_PRECOMP_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "precomputed")
DEFAULT_BOUND = 31623  # ceil(sqrt(1e9)); factors any n <= 1e9

# In-memory cache mapping a bound to the list of primes <= that bound.
_primes_cache: dict[int, list[int]] = {}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _sieve(bound: int) -> list[int]:
|
|
19
|
+
flags = bytearray([1]) * (bound + 1)
|
|
20
|
+
flags[0] = flags[1] = 0
|
|
21
|
+
i = 2
|
|
22
|
+
while i * i <= bound:
|
|
23
|
+
if flags[i]:
|
|
24
|
+
flags[i * i::i] = bytearray(len(flags[i * i::i]))
|
|
25
|
+
i += 1
|
|
26
|
+
return [i for i in range(2, bound + 1) if flags[i]]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def small_primes(bound: int = DEFAULT_BOUND) -> list[int]:
    """Return the primes ``<= bound``, cached in memory and under precomputed/.

    Lookup order: the in-memory cache, then ``primes_le_<bound>.txt`` in the
    precomputed directory, then a fresh sieve. A freshly sieved table is
    persisted on a best-effort basis; failure to write it is not an error.
    """
    cached = _primes_cache.get(bound)
    if cached is not None:
        return cached
    path = os.path.join(_PRECOMP_DIR, f"primes_le_{bound}.txt")
    if os.path.exists(path):
        with open(path, encoding="utf-8") as f:
            primes = [int(x) for x in f.read().split()]
    else:
        primes = _sieve(bound)
        # Per-process temp name: concurrent processes must not share one temp
        # file, or one could publish it while another is still writing.
        tmp = f"{path}.{os.getpid()}.tmp"
        try:
            os.makedirs(_PRECOMP_DIR, exist_ok=True)
            with open(tmp, "w", encoding="utf-8") as f:
                f.write("\n".join(map(str, primes)))
            os.replace(tmp, path)
        except OSError:
            # read-only filesystem: fall back to in-memory only, and do not
            # leave a partial temp file behind.
            try:
                os.unlink(tmp)
            except OSError:
                pass
    _primes_cache[bound] = primes
    return primes
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def factorize(m: int) -> dict[int, int]:
    """Return ``{prime: exponent}`` for ``abs(m)``; 0 and 1 give an empty dict.

    Trial division runs over the precomputed prime table first. If the table
    is used up while the remaining cofactor may still be composite, division
    continues over odd candidates above the largest table prime.
    """
    m = abs(int(m))
    factors: dict[int, int] = {}
    if m < 2:
        return factors

    table = small_primes()
    table_sufficed = False  # becomes True once a table prime p has p*p > m
    for p in table:
        if p * p > m:
            table_sufficed = True
            break
        if m % p:
            continue
        exponent = 0
        while m % p == 0:
            m //= p
            exponent += 1
        factors[p] = exponent

    if m == 1:
        return factors

    top = table[-1]
    if not table_sufficed and m > top * top:
        # n exceeded the table's reach: keep trial-dividing past the bound
        cand = top + 2
        while cand * cand <= m:
            if m % cand == 0:
                exponent = 0
                while m % cand == 0:
                    m //= cand
                    exponent += 1
                factors[cand] = exponent
            cand += 2
    if m > 1:
        # remaining cofactor is prime (it has no factor <= sqrt(m))
        factors[m] = factors.get(m, 0) + 1
    return factors
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Shared multiplicative-function helpers derived from a factorization."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def sigma(factorization):
    """Return the sum of divisors for a ``prime -> exponent`` mapping.

    Multiplies ``(p**(e + 1) - 1) // (p - 1)`` over every entry; the empty
    mapping gives 1.
    """
    total = 1
    for prime, exponent in factorization.items():
        numerator = prime ** (exponent + 1) - 1
        total *= numerator // (prime - 1)
    return total
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Per-n memoized context passed to every method.
|
|
2
|
+
|
|
3
|
+
Shared, expensive intermediates (factorization, etc.) are computed once per n
|
|
4
|
+
and reused across methods that declare them via `requires`. Cheap methods just
|
|
5
|
+
read ctx.n / ctx.abs_n and ignore the rest.
|
|
6
|
+
"""
|
|
7
|
+
from functools import cached_property
|
|
8
|
+
|
|
9
|
+
from integer_atlas_algos._lib.factorization import factorize
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Context:
    """Holds one integer ``n`` plus lazily computed, cached intermediates."""

    def __init__(self, n: int):
        self.n = n
        self.abs_n = abs(n)

    @cached_property
    def factorization(self) -> dict[int, int]:
        """prime -> exponent for abs(n); empty for 0 and 1."""
        return factorize(self.abs_n)

    @cached_property
    def divisors(self) -> list[int]:
        """Sorted positive divisors of abs(n) (for abs(n) >= 1)."""
        found = [1]
        for prime, exponent in self.factorization.items():
            powers = [prime ** k for k in range(exponent + 1)]
            # Extend each divisor found so far by every power of this prime.
            found = [d * power for d in found for power in powers]
        found.sort()
        return found
|
|
File without changes
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Crash-safe file primitives.
|
|
2
|
+
|
|
3
|
+
Every durable write goes to a temp file in the same directory, is fsync'd, then
|
|
4
|
+
atomically renamed into place (os.replace). The containing directory is fsync'd
|
|
5
|
+
so the rename itself survives a crash. A partial temp file is never visible under
|
|
6
|
+
the real name, so a reader either sees the previous version or the new one.
|
|
7
|
+
"""
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import tempfile
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _fsync_file(path):
|
|
14
|
+
fd = os.open(path, os.O_RDONLY)
|
|
15
|
+
try:
|
|
16
|
+
os.fsync(fd)
|
|
17
|
+
finally:
|
|
18
|
+
os.close(fd)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _fsync_dir(d):
|
|
22
|
+
try:
|
|
23
|
+
fd = os.open(d, os.O_RDONLY)
|
|
24
|
+
try:
|
|
25
|
+
os.fsync(fd)
|
|
26
|
+
finally:
|
|
27
|
+
os.close(fd)
|
|
28
|
+
except OSError:
|
|
29
|
+
# Some platforms disallow fsync on a directory; the rename is still atomic.
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _safe_unlink(path):
|
|
34
|
+
try:
|
|
35
|
+
os.unlink(path)
|
|
36
|
+
except FileNotFoundError:
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def atomic_write_bytes(path, data: bytes):
    """Write ``data`` to ``path`` via a temp file in the same directory.

    The temp file is flushed and fsync'd, renamed over ``path`` with
    ``os.replace``, and the directory is then fsync'd. On any exception the
    temp file is removed and the exception is re-raised.
    """
    target_dir = os.path.dirname(os.path.abspath(path))
    handle, scratch = tempfile.mkstemp(suffix=".part", prefix=".tmp-", dir=target_dir)
    try:
        with os.fdopen(handle, "wb") as out:
            out.write(data)
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch, path)
        _fsync_dir(target_dir)
    except BaseException:
        _safe_unlink(scratch)
        raise
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def atomic_write_text(path, text: str):
    """Atomically write ``text`` to ``path``, encoded as UTF-8."""
    payload = text.encode("utf-8")
    atomic_write_bytes(path, payload)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def atomic_write_json(path, obj):
    """Atomically write ``obj`` to ``path`` as JSON (2-space indent, sorted keys)."""
    rendered = json.dumps(obj, indent=2, sort_keys=True)
    atomic_write_text(path, rendered)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def atomic_produce(path, writer):
    """Atomically create ``path`` from a file written by ``writer(tmp_path)``.

    ``writer`` receives the name of an empty temp file in the destination
    directory and must write the complete file there. The result is fsync'd,
    renamed over ``path``, and the directory is fsync'd. On any exception the
    temp file is removed and the exception is re-raised.

    Used for backends (e.g. Parquet) that write their own file format.
    """
    parent = os.path.dirname(os.path.abspath(path))
    handle, scratch = tempfile.mkstemp(suffix=".part", prefix=".tmp-", dir=parent)
    os.close(handle)  # the writer opens the file itself, by name
    try:
        writer(scratch)
        _fsync_file(scratch)
        os.replace(scratch, path)
        _fsync_dir(parent)
    except BaseException:
        _safe_unlink(scratch)
        raise
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded as UTF-8, matching what ``atomic_write_text`` writes,
    rather than with the platform's locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def cleanup_tmp(directory):
    """Remove every entry in ``directory`` whose name starts with ``.tmp-``.

    Does nothing when ``directory`` is not an existing directory.
    """
    if os.path.isdir(directory):
        leftovers = [entry for entry in os.listdir(directory) if entry.startswith(".tmp-")]
        for entry in leftovers:
            _safe_unlink(os.path.join(directory, entry))
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Pluggable shard file backends.
|
|
2
|
+
|
|
3
|
+
The resume/checkpoint/atomic machinery in compute.py is format-agnostic; a
|
|
4
|
+
backend only knows how to write/read/count one file. `csv` is stdlib and always
|
|
5
|
+
available (handy for dev and tests); `parquet` is the real shard format and
|
|
6
|
+
needs pyarrow.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_backend(name):
    """Return the backend module for format ``name`` ("csv" or "parquet").

    Backend modules are imported on demand.

    Raises:
        RuntimeError: if "parquet" is requested but pyarrow cannot be imported.
        ValueError: if ``name`` is not a known format.
    """
    if name == "parquet":
        try:
            import pyarrow  # noqa: F401
        except ImportError as exc:
            raise RuntimeError(
                "parquet format requires pyarrow (pip install pyarrow). "
                "For a dependency-free run use --format csv."
            ) from exc
        from . import parquet_backend

        return parquet_backend
    if name == "csv":
        from . import csv_backend

        return csv_backend
    raise ValueError(f"unknown format: {name}")
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Stdlib CSV backend. Deterministic output; types coerced via the schema."""
|
|
2
|
+
import csv
|
|
3
|
+
|
|
4
|
+
# File extension for files in this format.
EXT = ".csv"
# Compression label for this format; the files are written as plain text.
COMPRESSION = "none"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _fmt(v, dtype):
|
|
9
|
+
if dtype == "bool":
|
|
10
|
+
return "true" if v else "false"
|
|
11
|
+
return str(v)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _parse(s, dtype):
|
|
15
|
+
if dtype == "bool":
|
|
16
|
+
return s == "true"
|
|
17
|
+
if dtype in ("double", "float64"):
|
|
18
|
+
return float(s)
|
|
19
|
+
if dtype == "string":
|
|
20
|
+
return s
|
|
21
|
+
return int(s) # int*, uint*, and bigint (Python ints are unbounded)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def write_table(path, schema, rows):
    """Write ``rows`` to ``path`` as CSV in a single call.

    ``schema`` is a sequence of ``(name, dtype)`` pairs. The header row lists
    the names in schema order; each row mapping is then written in that order,
    with cells formatted by ``_fmt``.
    """
    header = [name for name, _ in schema]
    with open(path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(
            [_fmt(row[name], dtype) for name, dtype in schema] for row in rows
        )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Writer:
    """Streaming CSV writer.

    The header row is written when the writer is created; each
    ``write_table`` call appends rows, and ``close`` closes the file.
    """

    def __init__(self, path, schema):
        self.schema = schema
        self.f = open(path, "w", newline="")
        self.w = csv.writer(self.f)
        header = [name for name, _ in schema]
        self.w.writerow(header)

    def write_table(self, rows):
        """Append ``rows`` (mappings keyed by column name) in schema order."""
        for row in rows:
            cells = [_fmt(row[name], dtype) for name, dtype in self.schema]
            self.w.writerow(cells)

    def close(self):
        """Close the underlying file."""
        self.f.close()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def open_writer(path, schema):
    """Create and return a streaming ``Writer`` for ``path`` with ``schema``."""
    writer = Writer(path, schema)
    return writer
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def read_table(path, schema):
    """Read the CSV at ``path`` into a list of ``{column: value}`` dicts.

    Cells are converted with ``_parse`` using the dtype ``schema`` gives for
    the column; columns missing from ``schema`` are parsed as "int64".
    """
    dtype_of = dict(schema)
    with open(path, newline="") as f:
        reader = csv.reader(f)
        header = next(reader)
        return [
            {
                name: _parse(cell, dtype_of.get(name, "int64"))
                for name, cell in zip(header, record)
            }
            for record in reader
        ]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def count_rows(path):
    """Return the number of data records in the CSV at ``path``.

    The header is excluded. Records are counted with ``csv.reader`` rather
    than by physical lines, so a quoted field containing a line break counts
    as one row. An empty file yields 0.
    """
    with open(path, newline="") as f:
        records = sum(1 for _ in csv.reader(f))
    return max(0, records - 1)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Parquet backend (pyarrow). The real shard format.
|
|
2
|
+
|
|
3
|
+
Exercised only when pyarrow is installed; the control flow it plugs into is
|
|
4
|
+
covered by the CSV-backed tests.
|
|
5
|
+
"""
|
|
6
|
+
# File extension for files in this format.
EXT = ".parquet"
# Codec name passed as `compression=` to the pyarrow Parquet writers in this module.
COMPRESSION = "zstd"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _arrow_types():
    """Return the mapping from schema dtype names to pyarrow data types."""
    import pyarrow as pa

    mapping = {}
    for prefix in ("int", "uint"):
        for bits in (8, 16, 32, 64):
            key = f"{prefix}{bits}"
            mapping[key] = getattr(pa, key)()
    mapping["bool"] = pa.bool_()
    mapping["double"] = pa.float64()
    mapping["float64"] = pa.float64()
    mapping["string"] = pa.string()
    # bigint values can exceed any fixed width, so store them as decimal strings.
    mapping["bigint"] = pa.string()
    return mapping
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _rows_to_table(schema, rows):
    """Build a pyarrow table from row mappings, with columns in schema order.

    ``schema`` is a sequence of ``(name, dtype)`` pairs. "bigint" values are
    stored as decimal strings. A ``None`` bigint is kept as ``None`` so it is
    stored as a null and not as the text "None", which ``read_table`` could
    not convert back with ``int``.
    """
    import pyarrow as pa

    arrow_types = _arrow_types()
    columns = {name: [] for name, _ in schema}
    for row in rows:
        for name, dtype in schema:
            value = row[name]
            if dtype == "bigint" and value is not None:
                value = str(value)
            columns[name].append(value)
    arrays = [pa.array(columns[name], type=arrow_types[dtype]) for name, dtype in schema]
    return pa.table(arrays, names=[name for name, _ in schema])
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def write_table(path, schema, rows):
    """Write all ``rows`` to a Parquet file at ``path`` in a single call."""
    import pyarrow.parquet as pq

    table = _rows_to_table(schema, rows)
    pq.write_table(table, path, compression=COMPRESSION)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Writer:
    """Streaming writer: each part becomes a row group; memory stays bounded.

    The underlying ``ParquetWriter`` is created lazily, on the first
    ``write_table`` call or on ``close``. Closing a writer that never received
    a part therefore still produces a valid Parquet file with the right
    columns and zero rows.
    """

    def __init__(self, path, schema):
        import pyarrow.parquet as pq

        self.schema = schema
        self._pq = pq
        self._writer = None
        self._path = path

    def _ensure_writer(self, arrow_schema):
        """Create the ParquetWriter on first use and return it."""
        if self._writer is None:
            self._writer = self._pq.ParquetWriter(self._path, arrow_schema,
                                                  compression=COMPRESSION)
        return self._writer

    def write_table(self, rows):
        """Append ``rows`` (mappings keyed by column name) as one table write."""
        table = _rows_to_table(self.schema, rows)
        self._ensure_writer(table.schema).write_table(table)

    def close(self):
        """Finish the file, writing a schema-only file if no part was written."""
        if self._writer is None:
            # Derive the arrow schema from an empty table so the columns match
            # what write_table would have produced.
            self._ensure_writer(_rows_to_table(self.schema, []).schema)
        self._writer.close()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def open_writer(path, schema):
    """Create and return a streaming ``Writer`` for ``path`` with ``schema``."""
    writer = Writer(path, schema)
    return writer
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def read_table(path, schema):
    """Read the Parquet file at ``path`` into a list of ``{column: value}`` dicts.

    Non-null values of columns that ``schema`` marks as "bigint" are converted
    from their stored decimal strings back to ``int``.
    """
    import pyarrow.parquet as pq

    columns = pq.read_table(path).to_pydict()
    if not columns:
        return []
    bigint_columns = {name for name, dtype in dict(schema).items() if dtype == "bigint"}
    names = list(columns)
    row_count = len(columns[names[0]])
    records = []
    for i in range(row_count):
        record = {}
        for name in names:
            value = columns[name][i]
            if value is not None and name in bigint_columns:
                value = int(value)  # stored as decimal string; restore exact integer
            record[name] = value
        records.append(record)
    return records
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def count_rows(path):
    """Return the row count recorded in the Parquet file's metadata."""
    import pyarrow.parquet as pq

    parquet_file = pq.ParquetFile(path)
    return parquet_file.metadata.num_rows
|