hyperglyph-codec 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hyperglyph/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """Hyper Glyph package."""
2
+
3
+ from .codec import CompressedModel, CompressedTensor, CompressionReport, HyperGlyphCodec
4
+ from .config import HyperGlyphConfig
5
+ from .serialization import load_compressed, save_compressed
6
+ from .torch_adapter import compress_state_dict, decompress_state_dict
7
+
8
+ __all__ = [
9
+ "HyperGlyphCodec",
10
+ "HyperGlyphConfig",
11
+ "CompressionReport",
12
+ "CompressedModel",
13
+ "CompressedTensor",
14
+ "compress_state_dict",
15
+ "decompress_state_dict",
16
+ "save_compressed",
17
+ "load_compressed",
18
+ ]
19
+ __version__ = "0.1.0"
hyperglyph/blocks.py ADDED
@@ -0,0 +1,49 @@
1
+ """Helpers for splitting arrays into blocks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+
8
def pad_tensor_to_blocks(array: np.ndarray, block_size: int) -> tuple[np.ndarray, tuple[int, ...]]:
    """Flatten ``array`` and zero-pad it to a multiple of ``block_size``.

    The result is always a fresh one-dimensional ``float32`` array, whether
    or not padding was needed, so callers get the same dtype on both paths.

    Args:
        array: Array (or array-like) to flatten and pad.
        block_size: Number of elements per block; must be positive.

    Returns:
        A ``(padded, original_shape)`` pair, where ``padded`` is the flat,
        zero-padded copy and ``original_shape`` is the shape of ``array``.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError("block_size must be positive")
    source = np.asarray(array)
    flattened = source.reshape(-1)
    # ``-size % block_size`` is the number of trailing zeros needed; it is 0
    # when the size is already a multiple of block_size.
    padding = -flattened.size % block_size
    padded = np.zeros(flattened.size + padding, dtype=np.float32)
    padded[: flattened.size] = flattened
    return padded, tuple(source.shape)
19
+
20
+
21
def split_array_blocks(array: np.ndarray, block_size: int) -> list[np.ndarray]:
    """Cut ``array`` into consecutive one-dimensional blocks of ``block_size`` elements.

    The array is first flattened and padded by ``pad_tensor_to_blocks``, so
    every returned block has exactly ``block_size`` elements. Each block is a
    slice of that padded copy. An empty input yields an empty list.
    """
    padded_flat = pad_tensor_to_blocks(array, block_size)[0].reshape(-1)
    starts = range(0, padded_flat.size, block_size)
    return [padded_flat[start : start + block_size] for start in starts]
28
+
29
+
30
def merge_array_blocks(
    blocks: list[np.ndarray],
    original_shape: tuple[int, ...],
    padded_shape: tuple[int, ...],
    block_size: int,
) -> np.ndarray:
    """Concatenate ``blocks`` and reshape the result to ``original_shape``.

    Trailing elements beyond the element count of ``original_shape`` are
    dropped before reshaping. ``padded_shape`` and ``block_size`` are accepted
    but not used by this function.
    """
    joined = np.concatenate(blocks, axis=0)
    element_count = int(np.prod(original_shape))
    return joined[:element_count].reshape(original_shape)
40
+
41
+
42
def flatten_tensor_for_blocks(array: np.ndarray) -> np.ndarray:
    """Return ``array`` as a one-dimensional ``float32`` array."""
    as_float32 = np.asarray(array, dtype=np.float32)
    return as_float32.reshape(-1)
45
+
46
+
47
def restore_tensor_shape(array: np.ndarray, original_shape: tuple[int, ...]) -> np.ndarray:
    """Return ``array`` as ``float32`` reshaped to ``original_shape``."""
    as_float32 = np.asarray(array, dtype=np.float32)
    return as_float32.reshape(original_shape)
hyperglyph/cli.py ADDED
@@ -0,0 +1,112 @@
1
+ """Command-line interface for Hyper Glyph."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ from typing import Sequence
8
+
9
+ try:
10
+ import torch # type: ignore
11
+ except ImportError: # pragma: no cover - optional dependency path
12
+ torch = None
13
+
14
+ from .codec import HyperGlyphCodec
15
+ from .config import HyperGlyphConfig
16
+ from .serialization import load_compressed, save_compressed
17
+
18
+
19
def build_parser() -> argparse.ArgumentParser:
    """Create the ``hyperglyph`` argument parser.

    The parser has four required sub-commands, stored in ``args.command``:
    ``compress``, ``decompress``, ``inspect`` and ``benchmark``.
    """
    root = argparse.ArgumentParser(prog="hyperglyph", description="Hyper Glyph compression CLI")
    commands = root.add_subparsers(dest="command", required=True)

    compress = commands.add_parser("compress")
    compress.add_argument("input", help="Input torch state dict file (.pt)")
    compress.add_argument("output", help="Output .hwz file")
    # Integer codec options, kept in their original declaration order.
    integer_options = (
        ("--block-size", 16),
        ("--hdc-dim", 4096),
        ("--n-buckets", 16),
        ("--n-prototypes", 128),
        ("--residual-k", 8),
        ("--seed", 42),
    )
    for flag, default in integer_options:
        compress.add_argument(flag, type=int, default=default)
    compress.add_argument("--compress-bias", action="store_true")
    compress.add_argument("--min-tensor-size", type=int, default=256)

    decompress = commands.add_parser("decompress")
    decompress.add_argument("input", help="Input .hwz file")
    decompress.add_argument("output", help="Output torch state dict file (.pt)")

    inspect = commands.add_parser("inspect")
    inspect.add_argument("input", help="Input .hwz file")

    benchmark = commands.add_parser("benchmark")
    benchmark.add_argument("input", help="Input torch state dict file (.pt)")

    return root
47
+
48
+
49
+ def main(argv: Sequence[str] | None = None) -> int:
50
+ """Run the CLI."""
51
+ parser = build_parser()
52
+ args = parser.parse_args(list(argv) if argv is not None else None)
53
+
54
+ if args.command == "compress":
55
+ if torch is None:
56
+ raise SystemExit("PyTorch is required for the compress/decompress CLI commands")
57
+ state_dict = torch.load(args.input, map_location="cpu")
58
+ config = HyperGlyphConfig(
59
+ hdc_dim=args.hdc_dim,
60
+ block_size=args.block_size,
61
+ n_buckets=args.n_buckets,
62
+ n_prototypes=args.n_prototypes,
63
+ residual_k=args.residual_k,
64
+ seed=args.seed,
65
+ compress_bias=args.compress_bias,
66
+ min_tensor_size=args.min_tensor_size,
67
+ )
68
+ codec = HyperGlyphCodec(config)
69
+ compressed = codec.compress_state_dict(state_dict)
70
+ save_compressed(compressed, args.output)
71
+ print(f"Saved compressed model to {args.output}")
72
+ return 0
73
+
74
+ if args.command == "decompress":
75
+ if torch is None:
76
+ raise SystemExit("PyTorch is required for the compress/decompress CLI commands")
77
+ compressed = load_compressed(args.input)
78
+ restored = HyperGlyphCodec().decompress_state_dict(compressed)
79
+ torch.save(restored, args.output)
80
+ print(f"Restored state dict to {args.output}")
81
+ return 0
82
+
83
+ if args.command == "inspect":
84
+ compressed = load_compressed(args.input)
85
+ print(
86
+ json.dumps(
87
+ {
88
+ "format_version": compressed.format_version,
89
+ "tensor_count": len(compressed.tensors),
90
+ },
91
+ indent=2,
92
+ )
93
+ )
94
+ return 0
95
+
96
+ if args.command == "benchmark":
97
+ if torch is None:
98
+ raise SystemExit("PyTorch is required for benchmark CLI commands")
99
+ state_dict = torch.load(args.input, map_location="cpu")
100
+ codec = HyperGlyphCodec()
101
+ compressed = codec.compress_state_dict(state_dict)
102
+ restored = codec.decompress_state_dict(compressed)
103
+ report = codec.report(compressed, state_dict, restored)
104
+ print(report)
105
+ return 0
106
+
107
+ parser.error("unknown command")
108
+ return 2
109
+
110
+
111
+ if __name__ == "__main__":
112
+ raise SystemExit(main())
hyperglyph/codec.py ADDED
@@ -0,0 +1,200 @@
1
+ """Main compression codec."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass, field
7
+ from typing import Any, Mapping
8
+
9
+ import numpy as np
10
+
11
+ from .blocks import restore_tensor_shape, split_array_blocks
12
+ from .config import HyperGlyphConfig
13
+ from .metrics import (
14
+ compressed_size_bytes,
15
+ compression_ratio,
16
+ mae,
17
+ max_abs_error,
18
+ mse,
19
+ original_size_bytes,
20
+ )
21
+ from .prototypes import assign_prototypes, learn_prototypes, reconstruct_from_prototypes
22
+ from .residual import apply_residual, compute_topk_residual, serialize_residual
23
+
24
+
25
+ @dataclass(slots=True)
26
+ class CompressedTensor:
27
+ """Compressed representation for a single tensor."""
28
+
29
+ name: str
30
+ shape: tuple[int, ...]
31
+ block_size: int
32
+ prototype_ids: list[int]
33
+ scales: list[float]
34
+ residuals: list[dict[str, Any]]
35
+ prototype_matrix: np.ndarray
36
+ seed: int
37
+ codec_config: dict[str, Any]
38
+
39
+
40
+ @dataclass(slots=True)
41
+ class CompressedModel:
42
+ """Compressed representation of a model state_dict."""
43
+
44
+ tensors: dict[str, CompressedTensor]
45
+ payload: bytes = field(default_factory=bytes)
46
+ format_version: str = "0.1"
47
+
48
+
49
+ @dataclass(slots=True)
50
+ class CompressionReport:
51
+ """Report about a compression run."""
52
+
53
+ original_bytes: int
54
+ compressed_bytes: int
55
+ compression_ratio: float
56
+ tensors_compressed: int
57
+ tensors_skipped: int
58
+ total_mse: float
59
+ total_mae: float
60
+ max_abs_error: float
61
+
62
+
63
class HyperGlyphCodec:
    """Experimental block codec for weight tensors.

    Each tensor is split into fixed-size blocks. Every block is stored as the
    id of its nearest learned prototype, a scale factor, and a sparse list of
    the largest remaining errors.
    """

    def __init__(self, config: HyperGlyphConfig | None = None) -> None:
        """Store ``config``, falling back to a default ``HyperGlyphConfig``."""
        self.config = config if config else HyperGlyphConfig()

    def compress_array(self, name: str, array: np.ndarray) -> CompressedTensor:
        """Compress one array into a ``CompressedTensor``.

        Args:
            name: Name recorded on the result.
            array: Values to compress; converted to ``float32``.

        Raises:
            ValueError: If the array has fewer than ``min_tensor_size``
                elements, or if splitting yields no blocks.
        """
        cfg = self.config
        array = np.asarray(array, dtype=np.float32)
        if array.size < cfg.min_tensor_size:
            raise ValueError("tensor too small to compress")

        blocks = split_array_blocks(array, cfg.block_size)
        if not blocks:
            raise ValueError("no blocks available")

        block_matrix = np.stack([np.asarray(item, dtype=np.float32) for item in blocks], axis=0)
        prototypes = learn_prototypes(block_matrix, cfg.n_prototypes, cfg.seed)
        assignments = assign_prototypes(block_matrix, prototypes)
        matched_prototypes = reconstruct_from_prototypes(assignments, prototypes)

        scales: list[float] = []
        residuals: list[dict[str, Any]] = []
        for block, proto in zip(blocks, matched_prototypes):
            # Rescale the prototype to the block's norm; the floor keeps the
            # division safe for an all-zero prototype.
            proto_norm = max(float(np.linalg.norm(proto)), 1e-6)
            scale = float(np.linalg.norm(block)) / proto_norm
            scales.append(scale)
            residual = compute_topk_residual(block, proto * scale, cfg.residual_k)
            residuals.append(serialize_residual(residual))

        recorded_keys = (
            "hdc_dim",
            "block_size",
            "n_buckets",
            "n_prototypes",
            "residual_k",
            "seed",
            "dtype",
            "device",
        )
        return CompressedTensor(
            name=name,
            shape=tuple(array.shape),
            block_size=cfg.block_size,
            prototype_ids=[int(proto_id) for proto_id in assignments],
            scales=scales,
            residuals=residuals,
            prototype_matrix=prototypes,
            seed=cfg.seed,
            codec_config={key: getattr(cfg, key) for key in recorded_keys},
        )

    def decompress_array(self, compressed: CompressedTensor) -> np.ndarray:
        """Rebuild a ``float32`` array of ``compressed.shape``.

        An empty prototype matrix yields an all-zero array.
        """
        if compressed.prototype_matrix.size == 0:
            return np.zeros(compressed.shape, dtype=np.float32)

        prototypes: np.ndarray = compressed.prototype_matrix.astype(np.float32)
        rebuilt_blocks: list[np.ndarray] = [
            apply_residual(
                prototypes[proto_id] * compressed.scales[position],
                compressed.residuals[position],
            )
            for position, proto_id in enumerate(compressed.prototype_ids)
        ]

        flat = np.concatenate(rebuilt_blocks, axis=0)
        # Drop the padding that block splitting appended to the last block.
        element_count = int(np.prod(compressed.shape))
        return restore_tensor_shape(flat[:element_count], compressed.shape)

    def compress_state_dict(self, state_dict: Mapping[str, Any]) -> CompressedModel:
        """Compress every eligible entry of ``state_dict``.

        Entries rejected by ``_should_compress`` are left out of the result.
        The payload is a JSON document listing the compressed tensor names.
        """
        packed: dict[str, CompressedTensor] = {
            name: self.compress_array(name, np.asarray(tensor, dtype=np.float32))
            for name, tensor in state_dict.items()
            if self._should_compress(name, tensor)
        }
        payload = json.dumps({"tensors": list(packed)}).encode("utf-8")
        return CompressedModel(tensors=packed, payload=payload)

    def decompress_state_dict(self, compressed_model: CompressedModel) -> dict[str, np.ndarray]:
        """Decompress each tensor stored in ``compressed_model.tensors``."""
        return {
            name: self.decompress_array(entry)
            for name, entry in compressed_model.tensors.items()
        }

    def report(
        self,
        compressed_model: CompressedModel,
        original_state_dict: Mapping[str, Any] | None = None,
        restored_state_dict: Mapping[str, Any] | None = None,
    ) -> CompressionReport:
        """Summarise size and reconstruction error of a compression run.

        Error figures are accumulated only when both state dicts are given,
        and only over tensors present in all three inputs. ``total_mse`` and
        ``total_mae`` are sums of the per-tensor values.
        """
        original_bytes = original_size_bytes(original_state_dict or {})
        compressed_bytes = compressed_size_bytes(compressed_model)

        skipped = 0
        if original_state_dict is not None:
            skipped = sum(
                1 for name in original_state_dict if name not in compressed_model.tensors
            )

        mse_sum = 0.0
        mae_sum = 0.0
        worst_error = 0.0
        if original_state_dict is not None and restored_state_dict is not None:
            for name in compressed_model.tensors:
                if name not in original_state_dict or name not in restored_state_dict:
                    continue
                reference = np.asarray(original_state_dict[name], dtype=np.float32)
                candidate = np.asarray(restored_state_dict[name], dtype=np.float32)
                mse_sum += mse(reference, candidate)
                mae_sum += mae(reference, candidate)
                worst_error = max(worst_error, max_abs_error(reference, candidate))

        return CompressionReport(
            original_bytes=original_bytes,
            compressed_bytes=compressed_bytes,
            compression_ratio=compression_ratio(original_bytes, compressed_bytes),
            tensors_compressed=len(compressed_model.tensors),
            tensors_skipped=skipped,
            total_mse=mse_sum,
            total_mae=mae_sum,
            max_abs_error=worst_error,
        )

    def _should_compress(self, name: str, tensor: Any) -> bool:
        """Decide whether the entry ``name`` is eligible for compression.

        Objects without a ``shape`` are never eligible. Others need at least
        ``min_tensor_size`` elements. Unless ``compress_bias`` is set, names
        containing "bias" (case-insensitive) are excluded.
        """
        if not hasattr(tensor, "shape"):
            return False
        large_enough = int(np.prod(tensor.shape)) >= self.config.min_tensor_size
        if self.config.compress_bias:
            return large_enough
        return "bias" not in name.lower() and large_enough
hyperglyph/config.py ADDED
@@ -0,0 +1,37 @@
1
+ """Configuration dataclasses for Hyper Glyph."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+
8
+ @dataclass(slots=True)
9
+ class HyperGlyphConfig:
10
+ """Configuration for the Hyper Glyph compression codec."""
11
+
12
+ hdc_dim: int = 4096
13
+ block_size: int = 16
14
+ n_buckets: int = 16
15
+ n_prototypes: int = 128
16
+ residual_k: int = 8
17
+ seed: int = 42
18
+ min_tensor_size: int = 256
19
+ compress_bias: bool = False
20
+ dtype: str = "float32"
21
+ device: str = "cpu"
22
+
23
+ def __post_init__(self) -> None:
24
+ if self.hdc_dim <= 0:
25
+ raise ValueError("hdc_dim must be positive")
26
+ if self.block_size <= 0:
27
+ raise ValueError("block_size must be positive")
28
+ if self.n_buckets <= 0:
29
+ raise ValueError("n_buckets must be positive")
30
+ if self.n_prototypes <= 0:
31
+ raise ValueError("n_prototypes must be positive")
32
+ if self.residual_k < 0:
33
+ raise ValueError("residual_k must be non-negative")
34
+ if self.min_tensor_size <= 0:
35
+ raise ValueError("min_tensor_size must be positive")
36
+ if self.dtype not in {"float32", "float64"}:
37
+ raise ValueError("dtype must be 'float32' or 'float64'")
@@ -0,0 +1,9 @@
1
+ """Exceptions for the Hyper Glyph package."""
2
+
3
+
4
class HyperGlyphError(Exception):
    """Root of the Hyper Glyph exception hierarchy."""
6
+
7
+
8
class OptionalDependencyError(HyperGlyphError):
    """Signals that an optional dependency is not available."""
hyperglyph/hdc.py ADDED
@@ -0,0 +1,69 @@
1
+ """Simple deterministic hyperdimensional computing helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from typing import Iterable
7
+
8
+ import numpy as np
9
+
10
+
11
def _hash_seed(name: str, index: int, seed: int) -> int:
    """Derive a 32-bit integer from ``name``, ``index`` and ``seed``.

    The value is the first eight hex digits of the SHA-256 digest of
    ``"name:index:seed"``.
    """
    digest = hashlib.sha256(f"{name}:{index}:{seed}".encode("utf-8")).hexdigest()
    return int(digest[:8], 16)
14
+
15
+
16
def make_role_vector(name: str, index: int, dim: int, seed: int) -> np.ndarray:
    """Return a reproducible ``float32`` vector of ``dim`` entries drawn from {-1, 1}.

    The generator is seeded from ``name``, ``index`` and ``seed``, so equal
    arguments give equal vectors.
    """
    generator = np.random.default_rng(_hash_seed(name, index, seed))
    bipolar = generator.choice([-1.0, 1.0], size=dim)
    return bipolar.astype(np.float32)
21
+
22
+
23
def bind(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return the elementwise product of ``a`` and ``b``."""
    bound = a * b
    return bound
26
+
27
+
28
def bind_many(vectors: Iterable[np.ndarray]) -> np.ndarray:
    """Multiply all ``vectors`` together elementwise.

    Raises:
        ValueError: If ``vectors`` is empty.
    """
    operands = list(vectors)
    if len(operands) == 0:
        raise ValueError("at least one vector is required")
    # Start from the multiplicative identity, shaped like the first operand.
    product = np.ones_like(operands[0], dtype=np.float32)
    for operand in operands:
        product = product * operand
    return product
37
+
38
+
39
def bundle(vectors: Iterable[np.ndarray]) -> np.ndarray:
    """Sum ``vectors`` elementwise and return the sign of the sum.

    Positions where the sum is exactly zero come out as 0.

    Raises:
        ValueError: If ``vectors`` is empty.
    """
    operands = list(vectors)
    if len(operands) == 0:
        raise ValueError("at least one vector is required")
    total = np.zeros_like(operands[0], dtype=np.float32)
    for operand in operands:
        total = total + operand
    return np.sign(total)
48
+
49
+
50
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between ``a`` and ``b``.

    Both inputs are flattened and converted to ``float32``. The result is
    ``0.0`` when either vector has zero norm.
    """
    left = np.asarray(a, dtype=np.float32).ravel()
    right = np.asarray(b, dtype=np.float32).ravel()
    norm_product = np.linalg.norm(left) * np.linalg.norm(right)
    return 0.0 if norm_product == 0 else float(np.dot(left, right) / norm_product)
58
+
59
+
60
def hamming_distance(a: np.ndarray, b: np.ndarray) -> int:
    """Count the positions at which ``a`` and ``b`` differ."""
    mismatches = np.asarray(a) != np.asarray(b)
    return int(np.count_nonzero(mismatches))
65
+
66
+
67
def binarize(v: np.ndarray) -> np.ndarray:
    """Binarize a vector to {-1, 1}.

    Negative entries map to -1. All other entries, including exact zeros,
    map to +1, so the result never contains 0.

    Args:
        v: Array or array-like of numbers.

    Returns:
        An ``int8`` array of the same shape as ``v`` with values in {-1, 1}.
    """
    values = np.asarray(v, dtype=np.float32)
    # np.sign would return 0 for zero inputs; choose +1 for them instead.
    return np.where(values < 0, -1, 1).astype(np.int8)
hyperglyph/metrics.py ADDED
@@ -0,0 +1,60 @@
1
+ """Metrics for compression quality."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Mapping
6
+
7
+ import numpy as np
8
+
9
+
10
def original_size_bytes(state_dict: Mapping[str, np.ndarray]) -> int:
    """Return the summed ``nbytes`` of every value in ``state_dict``."""
    return sum(np.asarray(tensor).nbytes for tensor in state_dict.values())
16
+
17
+
18
def compressed_size_bytes(compressed_model: object) -> int:
    """Estimate the compressed size in bytes from the model's payload.

    Accepts either a mapping with a ``"payload"`` key or an object with a
    ``payload`` attribute (such as the ``CompressedModel`` dataclass).
    Previously only mappings were handled, so attribute-style models always
    reported 0 bytes.

    Args:
        compressed_model: Mapping or object carrying the payload bytes.

    Returns:
        Length of the payload, or 0 when no payload is present.
    """
    # NOTE(review): HyperGlyphCodec.compress_state_dict fills ``payload`` with
    # a JSON list of tensor names only - confirm whether the payload reflects
    # the real serialized size before relying on the resulting ratio.
    if isinstance(compressed_model, Mapping):
        payload = compressed_model.get("payload", b"")
    else:
        payload = getattr(compressed_model, "payload", b"")
    return len(payload) if payload is not None else 0
23
+
24
+
25
def compression_ratio(original_bytes: int, compressed_bytes: int) -> float:
    """Return ``original_bytes / compressed_bytes``.

    A compressed size of zero or less gives positive infinity.
    """
    return float("inf") if compressed_bytes <= 0 else original_bytes / compressed_bytes
30
+
31
+
32
def mse(original: np.ndarray, reconstructed: np.ndarray) -> float:
    """Return the mean of the squared ``float32`` differences."""
    delta = np.asarray(original, dtype=np.float32) - np.asarray(reconstructed, dtype=np.float32)
    return float(np.mean(delta ** 2))
37
+
38
+
39
def mae(original: np.ndarray, reconstructed: np.ndarray) -> float:
    """Return the mean of the absolute ``float32`` differences."""
    delta = np.asarray(original, dtype=np.float32) - np.asarray(reconstructed, dtype=np.float32)
    return float(np.mean(np.abs(delta)))
44
+
45
+
46
def max_abs_error(original: np.ndarray, reconstructed: np.ndarray) -> float:
    """Return the largest absolute ``float32`` difference."""
    delta = np.asarray(original, dtype=np.float32) - np.asarray(reconstructed, dtype=np.float32)
    return float(np.max(np.abs(delta)))
51
+
52
+
53
def cosine_weight_similarity(original: np.ndarray, reconstructed: np.ndarray) -> float:
    """Return the cosine similarity of the two arrays after flattening.

    Both inputs are converted to ``float32``. The result is ``0.0`` when
    either array has zero norm.
    """
    reference = np.asarray(original, dtype=np.float32).ravel()
    candidate = np.asarray(reconstructed, dtype=np.float32).ravel()
    norm_product = np.linalg.norm(reference) * np.linalg.norm(candidate)
    return 0.0 if norm_product == 0 else float(np.dot(reference, candidate) / norm_product)
hyperglyph/prototypes.py ADDED
@@ -0,0 +1,60 @@
1
+ """Prototype learning and reconstruction helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+
8
def _init_prototypes(blocks: np.ndarray, n_prototypes: int, seed: int) -> np.ndarray:
    """Pick distinct rows of ``blocks`` as starting prototypes.

    At most ``n_prototypes`` rows are drawn without replacement, using a
    generator seeded with ``seed``. The result is a ``float32`` copy.
    """
    sample_count = min(n_prototypes, len(blocks))
    chosen = np.random.default_rng(seed).choice(len(blocks), size=sample_count, replace=False)
    return blocks[chosen].astype(np.float32).copy()
12
+
13
+
14
def learn_prototypes(blocks: np.ndarray, n_prototypes: int, seed: int) -> np.ndarray:
    """Fit prototypes to ``blocks`` with a few k-means style refinement passes.

    Prototypes start as rows sampled by ``_init_prototypes`` and are refined
    for at most five passes, stopping early once they no longer move. A
    prototype with no assigned blocks keeps its previous value.

    Returns:
        A ``float32`` array with one prototype per row; it has zero rows when
        ``blocks`` is empty.

    Raises:
        ValueError: If ``blocks`` is non-empty and ``n_prototypes`` is not positive.
    """
    data = np.asarray(blocks, dtype=np.float32)
    if data.size == 0:
        return np.empty((0, data.shape[-1]), dtype=np.float32)
    if n_prototypes <= 0:
        raise ValueError("n_prototypes must be positive")

    centers = _init_prototypes(data, n_prototypes, seed)
    for _ in range(5):
        # Distance from every block to every current prototype.
        gaps = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
        nearest = np.argmin(gaps, axis=1)
        updated = np.empty_like(centers)
        for position, previous in enumerate(centers):
            members = data[nearest == position]
            updated[position] = members.mean(axis=0) if len(members) else previous
        if np.allclose(updated, centers):
            break
        centers = updated
    return centers
37
+
38
+
39
def assign_prototypes(blocks: np.ndarray, prototypes: np.ndarray) -> np.ndarray:
    """Return, for each block, the index of its closest prototype.

    Distance is Euclidean. With no prototypes, every block gets index 0.
    The result is an ``int32`` array with one entry per block.
    """
    data = np.asarray(blocks, dtype=np.float32)
    centers = np.asarray(prototypes, dtype=np.float32)
    if len(centers) == 0:
        return np.zeros(len(data), dtype=np.int32)
    gaps = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
    nearest = np.argmin(gaps, axis=1)
    return nearest.astype(np.int32)
47
+
48
+
49
def reconstruct_from_prototypes(
    assignments: np.ndarray, prototypes: np.ndarray, scales: np.ndarray | None = None
) -> np.ndarray:
    """Look up the prototype row for each assignment, optionally scaled.

    Args:
        assignments: Prototype index per block.
        prototypes: Prototype rows to index into.
        scales: Optional per-block multipliers applied to the selected rows.

    Returns:
        One row per assignment; the rows have zero width when there are no
        prototypes.
    """
    indices = np.asarray(assignments, dtype=np.int32)
    centers = np.asarray(prototypes, dtype=np.float32)
    if len(centers) == 0:
        return np.empty((len(indices), 0), dtype=np.float32)
    selected = centers[indices]
    if scales is None:
        return selected
    return selected * np.asarray(scales, dtype=np.float32)[:, None]
hyperglyph/py.typed ADDED
File without changes
hyperglyph/residual.py ADDED
@@ -0,0 +1,49 @@
1
+ """Sparse residual helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+
9
+
10
def compute_topk_residual(
    original_block: np.ndarray, reconstructed_block: np.ndarray, k: int
) -> dict[str, Any]:
    """Find the ``k`` largest-magnitude errors between two blocks.

    Args:
        original_block: Reference values.
        reconstructed_block: Approximation of the same shape.
        k: Maximum number of entries to keep.

    Returns:
        A dict with flat ``"indices"`` and signed ``"values"`` of the error
        ``original - reconstructed``, ordered from largest magnitude down.
        Both lists are empty when ``k`` is 0 or the blocks are empty.

    Raises:
        ValueError: If ``k`` is negative or the shapes differ.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    if original_block.shape != reconstructed_block.shape:
        raise ValueError("blocks must have the same shape")

    error = np.asarray(original_block, dtype=np.float32) - np.asarray(
        reconstructed_block, dtype=np.float32
    )
    if k == 0 or error.size == 0:
        return {"indices": [], "values": []}

    flat_error = error.reshape(-1)
    ascending = np.argsort(np.abs(flat_error))
    # Last k positions of the ascending order, flipped to descending magnitude.
    strongest = ascending[-k:][::-1]
    return {
        "indices": strongest.tolist(),
        "values": flat_error[strongest].tolist(),
    }
34
+
35
+
36
def apply_residual(block: np.ndarray, residual: dict[str, Any]) -> np.ndarray:
    """Return a ``float32`` copy of ``block`` with the residual values added.

    Each value is added at its flat index. Missing ``"indices"`` or
    ``"values"`` keys are treated as empty. The input block is not modified.
    """
    patched = np.asarray(block, dtype=np.float32).reshape(-1).copy()
    positions = residual.get("indices", [])
    corrections = residual.get("values", [])
    for position, correction in zip(positions, corrections):
        patched[int(position)] += float(correction)
    return patched.reshape(block.shape)
42
+
43
+
44
def serialize_residual(residual: dict[str, Any]) -> dict[str, Any]:
    """Copy the ``"indices"`` and ``"values"`` of ``residual`` into plain lists.

    Missing keys become empty lists.
    """
    return {key: list(residual.get(key, [])) for key in ("indices", "values")}