integer-atlas-algos 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. integer_atlas_algos/__init__.py +3 -0
  2. integer_atlas_algos/_lib/__init__.py +0 -0
  3. integer_atlas_algos/_lib/blake3_py.py +172 -0
  4. integer_atlas_algos/_lib/factorization.py +87 -0
  5. integer_atlas_algos/_lib/multiplicative.py +9 -0
  6. integer_atlas_algos/context.py +28 -0
  7. integer_atlas_algos/executor/__init__.py +0 -0
  8. integer_atlas_algos/executor/__main__.py +6 -0
  9. integer_atlas_algos/executor/atomicio.py +91 -0
  10. integer_atlas_algos/executor/backends/__init__.py +24 -0
  11. integer_atlas_algos/executor/backends/csv_backend.py +67 -0
  12. integer_atlas_algos/executor/backends/parquet_backend.py +87 -0
  13. integer_atlas_algos/executor/cli.py +147 -0
  14. integer_atlas_algos/executor/compute.py +251 -0
  15. integer_atlas_algos/executor/estimate.py +58 -0
  16. integer_atlas_algos/executor/manifest.py +87 -0
  17. integer_atlas_algos/executor/verify.py +77 -0
  18. integer_atlas_algos/precomputed/primes_le_31623.txt +3401 -0
  19. integer_atlas_algos/properties/__init__.py +12 -0
  20. integer_atlas_algos/properties/abs_n.py +7 -0
  21. integer_atlas_algos/properties/abundance_class.py +13 -0
  22. integer_atlas_algos/properties/abundancy_index.py +12 -0
  23. integer_atlas_algos/properties/aliquot_sum.py +12 -0
  24. integer_atlas_algos/properties/binary_popcount.py +7 -0
  25. integer_atlas_algos/properties/binary_repr.py +7 -0
  26. integer_atlas_algos/properties/bit_length.py +7 -0
  27. integer_atlas_algos/properties/carmichael_lambda.py +19 -0
  28. integer_atlas_algos/properties/collatz_stopping_time.py +15 -0
  29. integer_atlas_algos/properties/decimal_digit_count.py +7 -0
  30. integer_atlas_algos/properties/digit_sum.py +7 -0
  31. integer_atlas_algos/properties/digital_root.py +8 -0
  32. integer_atlas_algos/properties/divisor_count.py +13 -0
  33. integer_atlas_algos/properties/divisor_sum.py +11 -0
  34. integer_atlas_algos/properties/dyadic_valuation.py +11 -0
  35. integer_atlas_algos/properties/euler_phi.py +14 -0
  36. integer_atlas_algos/properties/gcd_sum_pillai.py +17 -0
  37. integer_atlas_algos/properties/hex_repr.py +7 -0
  38. integer_atlas_algos/properties/integer_sqrt.py +9 -0
  39. integer_atlas_algos/properties/is_even.py +7 -0
  40. integer_atlas_algos/properties/is_fibonacci.py +15 -0
  41. integer_atlas_algos/properties/is_happy.py +14 -0
  42. integer_atlas_algos/properties/is_harshad.py +10 -0
  43. integer_atlas_algos/properties/is_odd.py +7 -0
  44. integer_atlas_algos/properties/is_palindrome.py +8 -0
  45. integer_atlas_algos/properties/is_perfect.py +12 -0
  46. integer_atlas_algos/properties/is_perfect_power.py +17 -0
  47. integer_atlas_algos/properties/is_powerful.py +10 -0
  48. integer_atlas_algos/properties/is_practical.py +18 -0
  49. integer_atlas_algos/properties/is_prime.py +9 -0
  50. integer_atlas_algos/properties/is_prime_power.py +8 -0
  51. integer_atlas_algos/properties/is_square.py +10 -0
  52. integer_atlas_algos/properties/is_squarefree.py +10 -0
  53. integer_atlas_algos/properties/is_triangular.py +11 -0
  54. integer_atlas_algos/properties/largest_prime_factor.py +11 -0
  55. integer_atlas_algos/properties/liouville_lambda.py +8 -0
  56. integer_atlas_algos/properties/mobius.py +13 -0
  57. integer_atlas_algos/properties/octal_repr.py +7 -0
  58. integer_atlas_algos/properties/omega_big.py +8 -0
  59. integer_atlas_algos/properties/omega_distinct.py +8 -0
  60. integer_atlas_algos/properties/partition_count.py +33 -0
  61. integer_atlas_algos/properties/radical.py +12 -0
  62. integer_atlas_algos/properties/sign.py +7 -0
  63. integer_atlas_algos/properties/smallest_prime_factor.py +11 -0
  64. integer_atlas_algos/properties/sum_of_two_squares_count.py +18 -0
  65. integer_atlas_algos/properties/von_mangoldt.py +14 -0
  66. integer_atlas_algos/registry.py +53 -0
  67. integer_atlas_algos-0.1.0.dist-info/METADATA +117 -0
  68. integer_atlas_algos-0.1.0.dist-info/RECORD +70 -0
  69. integer_atlas_algos-0.1.0.dist-info/WHEEL +4 -0
  70. integer_atlas_algos-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,3 @@
1
+ """Integer Atlas — stateless property methods and shard executor."""
2
+
3
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,172 @@
1
+ """Pure-Python BLAKE3 (hash mode), following the official reference design.
2
+
3
+ This is the dependency-free fallback used when the fast `blake3` package is not
4
+ installed, so a shard's blake3 hash is always populated. It is correct but slow;
5
+ the executor prefers the native library when available. Validated against the
6
+ official BLAKE3 test vectors (see tests/test_blake3.py).
7
+ """
8
+
9
# Initial chaining value: eight 32-bit words. Seeds every chunk/parent chaining
# value and supplies state words 8..11 inside _compress.
IV = [0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19]

# Reordering applied to the 16 message words between compression rounds.
MSG_PERMUTATION = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]

# Flag bits handed to the compression function.
CHUNK_START = 1 << 0  # first block of a chunk
CHUNK_END = 1 << 1    # final block of a chunk
PARENT = 1 << 2       # node that merges two child chaining values
ROOT = 1 << 3         # compression that produces digest output

BLOCK_LEN = 64     # bytes buffered per compression block
CHUNK_LEN = 1024   # bytes per chunk
MASK = 0xFFFFFFFF  # truncates arithmetic to 32 bits
22
+
23
+
24
def _rotr(x, n):
    """Rotate the 32-bit word `x` right by `n` bits."""
    shifted_down = x >> n
    wrapped_around = x << (32 - n)
    return (shifted_down | wrapped_around) & MASK
26
+
27
+
28
def _g(s, a, b, c, d, mx, my):
    """Mix state words s[a], s[b], s[c], s[d] in place using message words mx, my."""
    # The two half-steps differ only in the message word and the rotation pair.
    for word, rot_d, rot_b in ((mx, 16, 12), (my, 8, 7)):
        s[a] = (s[a] + s[b] + word) & MASK
        s[d] = _rotr(s[d] ^ s[a], rot_d)
        s[c] = (s[c] + s[d]) & MASK
        s[b] = _rotr(s[b] ^ s[c], rot_b)
37
+
38
+
39
def _round(s, m):
    """Run one full round on state `s`: four column mixes, then four diagonal mixes."""
    lanes = (
        (0, 4, 8, 12), (1, 5, 9, 13), (2, 6, 10, 14), (3, 7, 11, 15),  # columns
        (0, 5, 10, 15), (1, 6, 11, 12), (2, 7, 8, 13), (3, 4, 9, 14),  # diagonals
    )
    # Lane k consumes message words 2k and 2k + 1.
    for k, (a, b, c, d) in enumerate(lanes):
        _g(s, a, b, c, d, m[2 * k], m[2 * k + 1])
48
+
49
+
50
def _compress(cv, block_words, counter, block_len, flags):
    """Compress one block and return the full 16-word output state.

    The state is built from the 8-word chaining value `cv`, the first four IV
    words, the 64-bit `counter` split into two 32-bit halves, `block_len` and
    `flags`. Seven rounds are run, permuting the message words between rounds.
    """
    state = [cv[i] for i in range(8)]
    state.extend(IV[:4])
    state.extend((counter & MASK, (counter >> 32) & MASK, block_len, flags))

    msg = list(block_words)
    final_round = 6
    for rnd in range(final_round + 1):
        _round(state, msg)
        if rnd != final_round:
            msg = [msg[src] for src in MSG_PERMUTATION]

    # Feed-forward: fold the upper half into the lower, and cv into the upper.
    for lo in range(8):
        hi = lo + 8
        state[lo] ^= state[hi]
        state[hi] ^= cv[lo]
    return state
65
+
66
+
67
+ def _words(block64):
68
+ return [int.from_bytes(block64[i:i + 4], "little") for i in range(0, 64, 4)]
69
+
70
+
71
+ class _Output:
72
+ __slots__ = ("cv", "block_words", "counter", "block_len", "flags")
73
+
74
+ def __init__(self, cv, block_words, counter, block_len, flags):
75
+ self.cv = cv
76
+ self.block_words = block_words
77
+ self.counter = counter
78
+ self.block_len = block_len
79
+ self.flags = flags
80
+
81
+ def chaining_value(self):
82
+ return _compress(self.cv, self.block_words, self.counter,
83
+ self.block_len, self.flags)[:8]
84
+
85
+ def root_bytes(self, length):
86
+ out = bytearray()
87
+ counter = 0
88
+ while length > 0:
89
+ words = _compress(self.cv, self.block_words, counter,
90
+ self.block_len, self.flags | ROOT)
91
+ block = b"".join(w.to_bytes(4, "little") for w in words)
92
+ take = min(len(block), length)
93
+ out += block[:take]
94
+ length -= take
95
+ counter += 1
96
+ return bytes(out)
97
+
98
+
99
+ class _ChunkState:
100
+ def __init__(self, counter):
101
+ self.cv = list(IV)
102
+ self.counter = counter
103
+ self.block = bytearray(BLOCK_LEN)
104
+ self.block_len = 0
105
+ self.blocks_compressed = 0
106
+
107
+ def length(self):
108
+ return BLOCK_LEN * self.blocks_compressed + self.block_len
109
+
110
+ def _start_flag(self):
111
+ return CHUNK_START if self.blocks_compressed == 0 else 0
112
+
113
+ def update(self, mv):
114
+ pos, n = 0, len(mv)
115
+ while pos < n:
116
+ if self.block_len == BLOCK_LEN:
117
+ self.cv = _compress(self.cv, _words(self.block), self.counter,
118
+ BLOCK_LEN, self._start_flag())[:8]
119
+ self.blocks_compressed += 1
120
+ self.block_len = 0
121
+ take = min(BLOCK_LEN - self.block_len, n - pos)
122
+ self.block[self.block_len:self.block_len + take] = mv[pos:pos + take]
123
+ self.block_len += take
124
+ pos += take
125
+
126
+ def output(self):
127
+ padded = bytes(self.block[:self.block_len]) + b"\x00" * (BLOCK_LEN - self.block_len)
128
+ flags = self._start_flag() | CHUNK_END
129
+ return _Output(self.cv, _words(padded), self.counter, self.block_len, flags)
130
+
131
+
132
def _parent_output(left_cv, right_cv):
    """Build the pending compression for a parent node over two child chaining values."""
    # The two 8-word chaining values together form the parent's 16 message words.
    block_words = left_cv + right_cv
    return _Output(list(IV), block_words, 0, BLOCK_LEN, PARENT)
134
+
135
+
136
class Blake3:
    """Incremental BLAKE3 hasher (hash mode)."""

    def __init__(self):
        self._chunk = _ChunkState(0)
        self._stack = []

    def _add_chunk_cv(self, new_cv, total_chunks):
        """Push a finished chunk's chaining value, merging completed subtrees.

        Each trailing zero bit of `total_chunks` marks a subtree that is now
        complete, so the top of the stack is merged with the new value.
        """
        merged = new_cv
        remaining = total_chunks
        while remaining % 2 == 0:
            merged = _parent_output(self._stack.pop(), merged).chaining_value()
            remaining >>= 1
        self._stack.append(merged)

    def update(self, data):
        """Absorb `data` (any buffer accepted by memoryview) and return self."""
        view = memoryview(data)
        size = len(view)
        done = 0
        while done < size:
            chunk = self._chunk
            if chunk.length() == CHUNK_LEN:
                # The chunk is full and more input follows: finalize it.
                finished = chunk.counter + 1
                self._add_chunk_cv(chunk.output().chaining_value(), finished)
                chunk = self._chunk = _ChunkState(finished)
            step = min(CHUNK_LEN - chunk.length(), size - done)
            chunk.update(view[done:done + step])
            done += step
        return self

    def digest(self, length=32):
        """Return `length` digest bytes without modifying the hasher state."""
        node = self._chunk.output()
        # Fold the current chunk up through the pending subtree roots, newest first.
        for left_cv in self._stack[::-1]:
            node = _parent_output(left_cv, node.chaining_value())
        return node.root_bytes(length)

    def hexdigest(self, length=32):
        """Return the digest of `length` bytes as a lowercase hex string."""
        raw = self.digest(length)
        return raw.hex()
169
+
170
+
171
def blake3_hexdigest(data, length=32):
    """One-shot helper: hash `data` and return `length` digest bytes as hex."""
    hasher = Blake3()
    hasher.update(data)
    return hasher.hexdigest(length)
@@ -0,0 +1,87 @@
1
+ """Factorization backed by a precomputed base-primes table.
2
+
3
+ To factor any n up to BOUND**2 you only need primes up to BOUND. With
4
+ BOUND = 31623 (>= sqrt(1e9)) this covers the whole 0..1e9 range using ~3401
5
+ primes. The table is a deterministic, regenerable resource cached in
6
+ integer_atlas_algos/precomputed/ — not state about any shard or pack. For n beyond BOUND**2
7
+ factorization stays correct by continuing trial division past the table.
8
+ """
9
+ import os
10
+
11
# Directory of persisted prime tables: "precomputed" under the parent of the
# directory that contains this module.
_PRECOMP_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "precomputed")
DEFAULT_BOUND = 31623  # ceil(sqrt(1e9)); factors any n <= 1e9

# In-memory memo used by small_primes: bound -> list of primes <= bound.
_primes_cache: dict[int, list[int]] = {}
16
+
17
+
18
+ def _sieve(bound: int) -> list[int]:
19
+ flags = bytearray([1]) * (bound + 1)
20
+ flags[0] = flags[1] = 0
21
+ i = 2
22
+ while i * i <= bound:
23
+ if flags[i]:
24
+ flags[i * i::i] = bytearray(len(flags[i * i::i]))
25
+ i += 1
26
+ return [i for i in range(2, bound + 1) if flags[i]]
27
+
28
+
29
def small_primes(bound: int = DEFAULT_BOUND) -> list[int]:
    """Primes <= bound, cached in memory and persisted under precomputed/.

    Lookup order: the in-memory cache, then the on-disk table
    ``primes_le_<bound>.txt``, then a fresh sieve. A missing, unreadable or
    unparsable table is regenerated instead of raising. Persisting is
    best-effort: the table goes to a uniquely named temp file in the same
    directory and is renamed into place, so concurrent processes cannot
    publish a half-written table.

    Returns the cached list object itself; callers must not mutate it.
    """
    cached = _primes_cache.get(bound)
    if cached is not None:
        return cached
    path = os.path.join(_PRECOMP_DIR, f"primes_le_{bound}.txt")
    try:
        with open(path, encoding="ascii") as f:
            primes = [int(x) for x in f.read().split()]
    except (OSError, ValueError):
        # No usable table on disk (absent, unreadable or corrupt): rebuild it.
        primes = _sieve(bound)
        import tempfile  # local import: only needed on this rare path

        tmp = None
        try:
            os.makedirs(_PRECOMP_DIR, exist_ok=True)
            fd, tmp = tempfile.mkstemp(dir=_PRECOMP_DIR, prefix=".tmp-", suffix=".part")
            with os.fdopen(fd, "w", encoding="ascii") as f:
                f.write("\n".join(map(str, primes)))
            os.replace(tmp, path)
        except OSError:
            # Read-only filesystem or failed write: keep the table in memory
            # only, and do not leave a partial temp file behind.
            if tmp is not None:
                try:
                    os.unlink(tmp)
                except OSError:
                    pass
    _primes_cache[bound] = primes
    return primes
50
+
51
+
52
def factorize(m: int) -> dict[int, int]:
    """prime -> exponent for abs(m); empty dict for 0 and 1.

    Trial-divides by the precomputed prime table first. If the remaining
    cofactor is still too large for the table to prove it prime, division
    continues over odd candidates beyond the table's largest prime.
    """
    rest = abs(int(m))
    factors: dict[int, int] = {}
    if rest < 2:
        return factors

    table = small_primes()
    stopped_early = False
    for prime in table:
        if prime * prime > rest:
            stopped_early = True
            break
        exponent = 0
        while rest % prime == 0:
            rest //= prime
            exponent += 1
        if exponent:
            factors[prime] = exponent

    if rest <= 1:
        return factors

    if stopped_early or rest <= table[-1] * table[-1]:
        # No prime <= sqrt(rest) divides the cofactor, so it is itself prime.
        factors[rest] = factors.get(rest, 0) + 1
        return factors

    # The cofactor is beyond the table's reach: continue with odd candidates.
    candidate = table[-1] + 2
    while candidate * candidate <= rest:
        exponent = 0
        while rest % candidate == 0:
            rest //= candidate
            exponent += 1
        if exponent:
            factors[candidate] = exponent
        candidate += 2
    if rest > 1:
        factors[rest] = factors.get(rest, 0) + 1
    return factors
@@ -0,0 +1,9 @@
1
+ """Shared multiplicative-function helpers derived from a factorization."""
2
+
3
+
4
def sigma(factorization):
    """Sum of divisors from a prime -> exponent mapping (1 when the mapping is empty).

    Each prime power p**e contributes the closed-form geometric sum
    1 + p + ... + p**e = (p**(e + 1) - 1) // (p - 1).
    """
    total = 1
    for prime in factorization:
        numerator = prime ** (factorization[prime] + 1) - 1
        total *= numerator // (prime - 1)
    return total
@@ -0,0 +1,28 @@
1
+ """Per-n memoized context passed to every method.
2
+
3
+ Shared, expensive intermediates (factorization, etc.) are computed once per n
4
+ and reused across methods that declare them via `requires`. Cheap methods just
5
+ read ctx.n / ctx.abs_n and ignore the rest.
6
+ """
7
+ from functools import cached_property
8
+
9
+ from integer_atlas_algos._lib.factorization import factorize
10
+
11
+
12
class Context:
    """Per-n holder of the integer and its lazily computed, memoized intermediates."""

    def __init__(self, n: int):
        self.n = n
        self.abs_n = abs(n)

    @cached_property
    def factorization(self) -> dict[int, int]:
        """prime -> exponent for abs(n); empty for 0 and 1."""
        return factorize(self.abs_n)

    @cached_property
    def divisors(self) -> list[int]:
        """Sorted positive divisors of abs(n) (for abs(n) >= 1)."""
        found = [1]
        # Extend the divisor set by every power of each prime in turn.
        for prime, exponent in self.factorization.items():
            powers = [prime ** k for k in range(exponent + 1)]
            found = [d * power for d in found for power in powers]
        found.sort()
        return found
File without changes
@@ -0,0 +1,6 @@
1
+ import sys
2
+
3
+ from integer_atlas_algos.executor.cli import main
4
+
5
# Entry point when this package is executed as a module: pass main()'s return
# value to sys.exit so it becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,91 @@
1
+ """Crash-safe file primitives.
2
+
3
+ Every durable write goes to a temp file in the same directory, is fsync'd, then
4
+ atomically renamed into place (os.replace). The containing directory is fsync'd
5
+ so the rename itself survives a crash. A partial temp file is never visible under
6
+ the real name, so a reader either sees the previous version or the new one.
7
+ """
8
+ import json
9
+ import os
10
+ import tempfile
11
+
12
+
13
+ def _fsync_file(path):
14
+ fd = os.open(path, os.O_RDONLY)
15
+ try:
16
+ os.fsync(fd)
17
+ finally:
18
+ os.close(fd)
19
+
20
+
21
+ def _fsync_dir(d):
22
+ try:
23
+ fd = os.open(d, os.O_RDONLY)
24
+ try:
25
+ os.fsync(fd)
26
+ finally:
27
+ os.close(fd)
28
+ except OSError:
29
+ # Some platforms disallow fsync on a directory; the rename is still atomic.
30
+ pass
31
+
32
+
33
+ def _safe_unlink(path):
34
+ try:
35
+ os.unlink(path)
36
+ except FileNotFoundError:
37
+ pass
38
+
39
+
40
def atomic_write_bytes(path, data: bytes):
    """Write `data` to `path` atomically.

    The bytes go to a temp file in the destination directory, are fsync'd,
    and the temp file is then renamed over `path`. On any failure the temp
    file is removed and the exception re-raised.
    """
    target_dir = os.path.dirname(os.path.abspath(path))
    handle, scratch = tempfile.mkstemp(dir=target_dir, prefix=".tmp-", suffix=".part")
    try:
        with os.fdopen(handle, "wb") as out:
            out.write(data)
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch, path)
        _fsync_dir(target_dir)
    except BaseException:
        # Covers KeyboardInterrupt too: never leave a stray temp file behind.
        _safe_unlink(scratch)
        raise
53
+
54
+
55
def atomic_write_text(path, text: str):
    """Atomically write `text` to `path`, encoded as UTF-8."""
    payload = text.encode("utf-8")
    atomic_write_bytes(path, payload)
57
+
58
+
59
def atomic_write_json(path, obj):
    """Atomically write `obj` to `path` as JSON with 2-space indent and sorted keys."""
    rendered = json.dumps(obj, indent=2, sort_keys=True)
    atomic_write_text(path, rendered)
61
+
62
+
63
def atomic_produce(path, writer):
    """Atomically create `path` via writer(tmp_path), which writes the whole file.

    Used for backends (e.g. Parquet) that write their own file format. The
    temp file is created in the destination directory, handed to `writer` by
    name, fsync'd and renamed into place; on any failure it is removed and
    the exception re-raised.
    """
    target_dir = os.path.dirname(os.path.abspath(path))
    handle, scratch = tempfile.mkstemp(dir=target_dir, prefix=".tmp-", suffix=".part")
    # The writer reopens the file by name, so the descriptor is not needed.
    os.close(handle)
    try:
        writer(scratch)
        _fsync_file(scratch)
        os.replace(scratch, path)
        _fsync_dir(target_dir)
    except BaseException:
        _safe_unlink(scratch)
        raise
79
+
80
+
81
def read_json(path):
    """Load and return the JSON document stored at `path`.

    The file is decoded as UTF-8, matching the encoding atomic_write_text
    uses, rather than the platform's locale encoding.

    Raises OSError if the file cannot be opened and json.JSONDecodeError if
    its content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
84
+
85
+
86
def cleanup_tmp(directory):
    """Remove leftover ".tmp-" files from `directory`; no-op if it is not a directory."""
    if os.path.isdir(directory):
        leftovers = [entry for entry in os.listdir(directory) if entry.startswith(".tmp-")]
        for entry in leftovers:
            _safe_unlink(os.path.join(directory, entry))
@@ -0,0 +1,24 @@
1
+ """Pluggable shard file backends.
2
+
3
+ The resume/checkpoint/atomic machinery in compute.py is format-agnostic; a
4
+ backend only knows how to write/read/count one file. `csv` is stdlib and always
5
+ available (handy for dev and tests); `parquet` is the real shard format and
6
+ needs pyarrow.
7
+ """
8
+
9
+
10
def get_backend(name):
    """Return the backend module for format `name` ("csv" or "parquet").

    Raises ValueError for an unknown format, and RuntimeError when "parquet"
    is requested but pyarrow cannot be imported.
    """
    if name not in ("csv", "parquet"):
        raise ValueError(f"unknown format: {name}")

    if name == "csv":
        from . import csv_backend
        return csv_backend

    # Parquet: check for pyarrow up front so the failure carries an actionable message.
    try:
        import pyarrow  # noqa: F401
    except ImportError as e:
        raise RuntimeError(
            "parquet format requires pyarrow (pip install pyarrow). "
            "For a dependency-free run use --format csv."
        ) from e
    from . import parquet_backend
    return parquet_backend
@@ -0,0 +1,67 @@
1
+ """Stdlib CSV backend. Deterministic output; types coerced via the schema."""
2
+ import csv
3
+
4
# Backend descriptors; the code that reads them lives outside this module.
EXT = ".csv"  # file suffix for this format
COMPRESSION = "none"  # compression label for this format
6
+
7
+
8
+ def _fmt(v, dtype):
9
+ if dtype == "bool":
10
+ return "true" if v else "false"
11
+ return str(v)
12
+
13
+
14
+ def _parse(s, dtype):
15
+ if dtype == "bool":
16
+ return s == "true"
17
+ if dtype in ("double", "float64"):
18
+ return float(s)
19
+ if dtype == "string":
20
+ return s
21
+ return int(s) # int*, uint*, and bigint (Python ints are unbounded)
22
+
23
+
24
def write_table(path, schema, rows):
    """Write a header line plus one CSV line per row to `path`.

    `schema` is a sequence of (column name, dtype) pairs; each row is indexed
    by column name and its cells are formatted per the column's dtype.
    """
    header = [name for name, _ in schema]
    with open(path, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(
            [_fmt(row[name], dtype) for name, dtype in schema] for row in rows
        )
31
+
32
+
33
class Writer:
    """Streaming writer: write parts one at a time without buffering the whole file.

    The header row is written on construction. The writer can also be used as
    a context manager, which closes the file on exit even when an error occurs.
    """

    def __init__(self, path, schema):
        self.schema = schema
        self.f = open(path, "w", newline="")
        try:
            self.w = csv.writer(self.f)
            self.w.writerow([n for n, _ in schema])
        except BaseException:
            # Do not leak the open handle if the header cannot be written.
            self.f.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False  # never suppress the caller's exception

    def write_table(self, rows):
        """Append one CSV line per row, formatting cells per the schema dtypes."""
        for r in rows:
            self.w.writerow([_fmt(r[n], dt) for n, dt in self.schema])

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.f.close()
48
+
49
+
50
def open_writer(path, schema):
    """Create a streaming CSV Writer for `path`; its header row is written immediately."""
    writer = Writer(path, schema)
    return writer
52
+
53
+
54
def read_table(path, schema):
    """Read a CSV shard into a list of {column: value} dicts.

    Cells are converted using the dtype that `schema` gives for the column
    name found in the file header; columns absent from `schema` are parsed
    as "int64". A completely empty file (no header line) yields an empty
    list, matching count_rows, instead of leaking StopIteration.
    """
    dtypes = dict(schema)
    with open(path, newline="") as f:
        rd = csv.reader(f)
        header = next(rd, None)
        if header is None:
            return []
        return [
            {n: _parse(v, dtypes.get(n, "int64")) for n, v in zip(header, row)}
            for row in rd
        ]
63
+
64
+
65
def count_rows(path):
    """Count data rows in the CSV at `path`: physical lines minus the header, never below 0."""
    line_count = 0
    with open(path) as f:
        for _ in f:
            line_count += 1
    if line_count == 0:
        return 0
    return line_count - 1
@@ -0,0 +1,87 @@
1
+ """Parquet backend (pyarrow). The real shard format.
2
+
3
+ Exercised only when pyarrow is installed; the control flow it plugs into is
4
+ covered by the CSV-backed tests.
5
+ """
6
# Backend descriptors. COMPRESSION is passed to pyarrow as the Parquet codec
# by the writers in this module.
EXT = ".parquet"  # file suffix for this format
COMPRESSION = "zstd"
8
+
9
+
10
def _arrow_types():
    """Return the mapping from schema dtype name to pyarrow DataType.

    pyarrow is imported lazily so this module can be imported without it.
    """
    import pyarrow as pa

    pairs = [
        ("int8", pa.int8()), ("int16", pa.int16()),
        ("int32", pa.int32()), ("int64", pa.int64()),
        ("uint8", pa.uint8()), ("uint16", pa.uint16()),
        ("uint32", pa.uint32()), ("uint64", pa.uint64()),
        ("bool", pa.bool_()),
        ("double", pa.float64()), ("float64", pa.float64()),
        ("string", pa.string()),
        # bigint values can exceed any fixed width, so they are stored as decimal strings.
        ("bigint", pa.string()),
    ]
    return dict(pairs)
19
+
20
+
21
def _rows_to_table(schema, rows):
    """Pivot row dicts into a pyarrow Table whose columns follow `schema` order.

    `rows` is consumed in a single pass; bigint cells are converted to their
    decimal string form before the arrays are built.
    """
    import pyarrow as pa

    arrow_type = _arrow_types()
    names = [name for name, _ in schema]
    columns = {name: [] for name in names}
    for record in rows:
        for name, dtype in schema:
            cell = record[name]
            columns[name].append(str(cell) if dtype == "bigint" else cell)
    arrays = [pa.array(columns[name], type=arrow_type[dtype]) for name, dtype in schema]
    return pa.table(arrays, names=names)
31
+
32
+
33
def write_table(path, schema, rows):
    """Write all `rows` to `path` as one Parquet file compressed with COMPRESSION."""
    import pyarrow.parquet as pq

    table = _rows_to_table(schema, rows)
    pq.write_table(table, path, compression=COMPRESSION)
37
+
38
+
39
class Writer:
    """Streaming writer: each part becomes a row group; memory stays bounded.

    The underlying ParquetWriter is created lazily on the first write_table
    call, using the Arrow schema of that first part.
    """

    def __init__(self, path, schema):
        import pyarrow.parquet as pq

        self._pq = pq
        self._path = path
        self._writer = None
        self.schema = schema

    def write_table(self, rows):
        """Convert `rows` to an Arrow table and append it to the file."""
        table = _rows_to_table(self.schema, rows)
        writer = self._writer
        if writer is None:
            writer = self._writer = self._pq.ParquetWriter(
                self._path, table.schema, compression=COMPRESSION)
        writer.write_table(table)

    def close(self):
        """Close the ParquetWriter if one was ever opened; otherwise do nothing."""
        if self._writer is None:
            return
        self._writer.close()
60
+
61
+
62
def open_writer(path, schema):
    """Create a streaming Parquet Writer for `path`; the file is opened on the first write."""
    writer = Writer(path, schema)
    return writer
64
+
65
+
66
def read_table(path, schema):
    """Read a Parquet shard into a list of {column: value} dicts.

    Columns that `schema` declares as bigint are stored as decimal strings
    and are converted back to exact Python ints; None cells are left as-is.
    """
    import pyarrow.parquet as pq

    columns = pq.read_table(path).to_pydict()
    if not columns:
        return []
    bigint_columns = {name for name, dtype in dict(schema).items() if dtype == "bigint"}
    row_count = len(next(iter(columns.values())))

    records = []
    for idx in range(row_count):
        record = {}
        for name, values in columns.items():
            cell = values[idx]
            if cell is not None and name in bigint_columns:
                cell = int(cell)
            record[name] = cell
        records.append(record)
    return records
82
+
83
+
84
def count_rows(path):
    """Return the row count recorded in the Parquet footer of `path`.

    Only the file metadata is read, via pq.read_metadata, so no ParquetFile
    object is left holding the file open after the call.
    """
    import pyarrow.parquet as pq

    return pq.read_metadata(path).num_rows