hsl-embedding 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,190 @@
1
+ """HSL — Holistic Signal Language: a non-learned, byte-level signal embedding (codec + encoder in one).
2
+
3
+ Everything is information — a fluctuation between 0 and 1. HSL turns raw bytes into a compact,
4
+ *change-rate-based* feature signal that any modality (text, image, audio, video, sensor) shares,
5
+ with no tokenizer and no learned parameters. The representation is grounded in a lossless codec,
6
+ so `decode(encode(x)) == x` — the substrate is byte-exact by construction.
7
+
8
+ The 21-D per-byte feature (FEAT_DIM) — the pure change-rate substrate:
9
+ dxor0..7 (8) Δ change-rate — XOR-delta from origin 0 (losslessly encodes the bytes)
10
+ d2xor0..7 (8) Δ² change-rate-of-change-rate — 2nd XOR-delta
11
+ boundary (1) byte-boundary evidence (|Δ| + 0.5|Δ²|, averaged over a ±4-bit window at each byte start; the 0.25·HF term is not applied in this version)
12
+ fft_low/high (2) per-byte spectral amplitude bands
13
+ phase_cos/sin (2) exact complex phasor z = e^{iθ}, θ = 2π·byte/256
14
+
15
+ The raw 8 bits are NOT included by default: Δ-from-origin-0 already encodes the bytes losslessly,
16
+ so the bits are redundant. Pass include_bits=True for the legacy 29-D (raw bits prepended).
17
+
18
+ import hsl_embedding as hsl
19
+ feats, phase = hsl.embed(b"hello") # [L, 21], [L]
20
+ emb = hsl.Embedding(); feats = emb(b"hello")
21
+ assert hsl.decode(hsl.encode(b"hello")) == b"hello"
22
+
23
+ Author: Jinhyun Woo (ggunio5782@gmail.com). MIT-licensed; no learned weights included.
24
+ """
25
+ from __future__ import annotations
26
+ import math
27
+ from dataclasses import dataclass
28
+ from typing import Iterable
29
+
30
+ import numpy as np
31
+ import torch
32
+ import torch.nn as nn
33
+
34
+ __all__ = ["FEAT_DIM", "FEAT_DIM_FULL", "FEAT_NAMES", "FEAT_NAMES_FULL", "feat_names",
35
+ "ORIGIN_BIT", "CLOSURE_BIT", "HSLFrame", "encode", "decode", "embed", "Embedding"]
36
+
37
# Fixed bit values that frame every signal.
ORIGIN_BIT = 0   # the "0": the value XOR-delta starts from, so integration can rebuild the bits
CLOSURE_BIT = 1  # the "1": constant end-of-content bit appended after the payload and checked on decode

# Per-byte feature widths.
FEAT_DIM = 21       # default layout: Δ (8) + Δ² (8) + boundary (1) + Fourier (2) + phase (2)
FEAT_DIM_FULL = 29  # include_bits=True layout: the 8 raw bits followed by the 21 default columns

# Column names, in the order the feature columns are concatenated.
_BITS = ["bit" + str(k) for k in range(8)]
_REST = [
    *(f"dxor{k}" for k in range(8)),    # Δ
    *(f"d2xor{k}" for k in range(8)),   # Δ²
    "boundary",
    "fft_low_ratio",
    "fft_high_ratio",
    "phase_cos",
    "phase_sin",
]
FEAT_NAMES = _REST.copy()            # 21 names (default layout)
FEAT_NAMES_FULL = [*_BITS, *_REST]   # 29 names (raw bits first)
47
+
48
+
49
def feat_names(include_bits: bool = False):
    """Return a new list of feature column names.

    include_bits=False gives the 21 default names; include_bits=True gives
    the 29 names with the raw-bit columns first. The returned list is a
    fresh object, so callers may mutate it freely.
    """
    if include_bits:
        return _BITS + _REST
    return list(_REST)
51
+
52
+
53
+ # ───────────────────────────── codec (numpy, lossless) ─────────────────────────────
54
@dataclass(frozen=True)
class HSLFrame:
    """Immutable result of `encode`: the payload length plus its bit-level signals.

    Equality is value-based: two frames are equal when their payload lengths
    match and every array field has the same shape and elements.
    """
    payload_len_bytes: int            # number of payload bytes that were encoded
    bits: np.ndarray                  # payload bitstream
    signal: np.ndarray                # bits + closure (the 0 → … → 1 journey)
    delta: np.ndarray                 # Δ  (XOR-delta of `signal` from origin 0)
    delta2: np.ndarray                # Δ² (XOR-delta of `delta`)
    byte_boundary_score: np.ndarray   # one boundary score per payload byte

    def __eq__(self, other: object) -> bool:
        # The dataclass-generated __eq__ compares the fields as a tuple, which
        # asks each ndarray comparison for a single truth value and raises
        # ValueError for distinct multi-element arrays. Compare arrays
        # explicitly instead.
        if other.__class__ is not self.__class__:
            return NotImplemented
        if self.payload_len_bytes != other.payload_len_bytes:
            return False
        array_fields = ("bits", "signal", "delta", "delta2", "byte_boundary_score")
        return all(
            np.array_equal(getattr(self, name), getattr(other, name))
            for name in array_fields
        )
62
+
63
+
64
def _bytes_to_bits(data: bytes) -> np.ndarray:
    """Unpack `data` into a flat uint8 array of bits, most significant bit first.

    Empty input yields an empty uint8 array of shape (0,).
    """
    if data:
        raw = np.frombuffer(data, dtype=np.uint8)
        return np.unpackbits(raw, bitorder="big").astype(np.uint8)
    return np.zeros((0,), dtype=np.uint8)
68
+
69
+
70
def _bits_to_bytes(bits: np.ndarray, n: int) -> bytes:
    """Pack the first `n * 8` entries of `bits` (MSB first) into `n` bytes.

    Raises:
        ValueError: if `bits` holds fewer than `n * 8` entries.
    """
    need = 8 * n
    chunk = np.asarray(bits[:need], dtype=np.uint8)
    if chunk.size != need:
        raise ValueError(f"not enough bits: have {chunk.size}, need {need}")
    if not chunk.size:
        # Zero bytes requested: nothing to pack.
        return b""
    return np.packbits(chunk, bitorder="big")[:n].tobytes()
76
+
77
+
78
def _xor_delta(bits: np.ndarray, origin: int = ORIGIN_BIT) -> np.ndarray:
    """XOR each element with its predecessor; the first element is XOR-ed with `origin`.

    Returns a uint8 array with the same length as `bits`.
    """
    cur = np.asarray(bits, dtype=np.uint8)
    if not cur.size:
        # Nothing to difference; hand back an empty uint8 copy.
        return cur.astype(np.uint8)
    prev = np.empty_like(cur)
    prev[0] = origin       # the predecessor of the first element is the origin
    prev[1:] = cur[:-1]    # every other predecessor is the element before it
    return (cur ^ prev).astype(np.uint8)
85
+
86
+
87
def _integrate(delta: Iterable[int], origin: int = ORIGIN_BIT) -> np.ndarray:
    """Invert `_xor_delta`: running XOR of `delta`, seeded with `origin`.

    Element i of the result is `origin ^ delta[0] ^ ... ^ delta[i]`.

    Args:
        delta: 1-D sequence (array, list, or any iterable) of uint8 values.
        origin: seed value that the first delta was taken against.

    Returns:
        A uint8 array with one element per input element (empty for empty input).
    """
    if isinstance(delta, np.ndarray):
        steps = delta.astype(np.uint8, copy=False)
    else:
        # Generators and other plain iterables are materialised once.
        steps = np.fromiter(delta, dtype=np.uint8)
    if steps.size == 0:
        return np.zeros((0,), dtype=np.uint8)
    # A prefix-XOR in one vectorised pass replaces the per-element Python loop;
    # XOR is associative, so the origin can be folded in afterwards.
    return np.bitwise_xor(np.bitwise_xor.accumulate(steps), np.uint8(origin))
93
+
94
+
95
def _hf_energy(values: np.ndarray, radius: int = 4) -> np.ndarray:
    """Absolute deviation of each sample from a centred moving average.

    The average uses a box window of `2 * radius + 1` samples, with samples
    outside the input treated as zero.

    Args:
        values: 1-D sequence of samples (converted to float32).
        radius: half-width of the averaging window.

    Returns:
        A float32 array with the same length as `values`.
    """
    values = np.asarray(values, dtype=np.float32)
    if values.size == 0:
        return values
    width = radius * 2 + 1
    kernel = np.ones((width,), dtype=np.float32) / float(width)
    # mode="same" returns max(len(values), len(kernel)) samples, so inputs
    # shorter than the window came back with the wrong length (or failed to
    # broadcast). Slicing the full convolution keeps the output aligned with
    # the input and matches "same" whenever the input is at least window-sized.
    smooth = np.convolve(values, kernel, mode="full")[radius:radius + values.size]
    return np.abs(values - smooth).astype(np.float32)
101
+
102
+
103
def _byte_boundary(signal: np.ndarray) -> np.ndarray:
    """Score each byte start by the mean transition energy around it.

    Energy per bit is Δ + 0.5·Δ² (first and second XOR-delta of `signal`).
    For each byte start the score is the mean energy over the window
    [start - 4, start + 5), clipped to the signal. No UTF-8 decoding is
    involved.

    Returns:
        A float32 array with `(signal.size - 1) // 8` scores, or an empty
        array when the signal holds less than one full byte plus one bit.
    """
    nbytes = (signal.size - 1) // 8
    if nbytes <= 0:
        # Covers the empty signal (floor division gives -1) and signals too
        # short to contain a whole byte.
        return np.zeros((0,), dtype=np.float32)

    first = _xor_delta(signal)
    second = _xor_delta(first)
    energy = first.astype(np.float32) + 0.5 * second.astype(np.float32)

    starts = np.arange(nbytes) * 8
    lo = np.maximum(0, starts - 4)
    hi = np.minimum(energy.size, starts + 5)
    width = np.maximum(hi - lo, 1)

    # Prefix sums with a leading zero turn each window sum into one subtraction.
    csum = np.concatenate([[0.0], np.cumsum(energy)])
    window_sum = csum[hi] - csum[lo]
    return (window_sum / width).astype(np.float32)
116
+
117
+
118
def encode(data: bytes) -> HSLFrame:
    """Encode `data` into an HSLFrame (lossless; carries the bitstream, Δ, Δ²).

    The payload bits are followed by a single closure bit; Δ is the XOR-delta
    of that signal and Δ² is the XOR-delta of Δ.
    """
    payload_bits = _bytes_to_bits(data)
    closure = np.asarray([CLOSURE_BIT], dtype=np.uint8)
    signal = np.concatenate([payload_bits, closure])
    first_delta = _xor_delta(signal)
    second_delta = _xor_delta(first_delta)
    return HSLFrame(
        payload_len_bytes=len(data),
        bits=payload_bits,
        signal=signal,
        delta=first_delta,
        delta2=second_delta,
        byte_boundary_score=_byte_boundary(signal),
    )
124
+
125
+
126
def decode(frame: HSLFrame) -> bytes:
    """Rebuild the original bytes from an HSLFrame.

    Integrates Δ from origin 0, checks that the bit right after the payload
    is the closure bit, then drops it and packs the payload bits.

    Raises:
        ValueError: if the payload length is negative, the integrated signal
            is too short, or the closure bit is missing.
    """
    n_bytes = frame.payload_len_bytes
    if n_bytes < 0:
        # A negative length would make `need - 1` a negative index and read
        # from the end of the signal instead of failing the length check.
        raise ValueError("closure / length check failed")
    signal = _integrate(frame.delta, origin=ORIGIN_BIT)
    need = n_bytes * 8 + 1  # payload bits plus the closure bit
    if signal.size < need or int(signal[need - 1]) != CLOSURE_BIT:
        raise ValueError("closure / length check failed")
    return _bits_to_bytes(signal[: need - 1], n_bytes)
133
+
134
+
135
+ # ───────────────────────────── embedding (torch) ─────────────────────────────
136
def embed(data: bytes, include_bits: bool = False):
    """Turn bytes into per-byte features and phase angles (deterministic, non-learned).

    Args:
        data: input bytes. Empty input is replaced by a single zero byte, so
            the result always has at least one row.
        include_bits: when True, prepend the 8 raw bits of each byte
            (29 columns); otherwise return the 21-column layout of
            Δ (8), Δ² (8), boundary (1), Fourier low/high ratio (2) and
            phase cos/sin (2).

    Returns:
        (feats, phase): feats has shape [L, 21] or [L, 29]; phase has shape
        [L] and holds θ = 2π·byte/256 for each byte.
    """
    if len(data) == 0:
        data = b"\x00"
    frame = encode(data)
    n_bytes = frame.payload_len_bytes
    n_bits = n_bytes * 8

    def _per_byte(stream: np.ndarray) -> torch.Tensor:
        # Drop anything past the payload (the closure slot) and view as [bytes, 8].
        return torch.from_numpy(stream[:n_bits].reshape(n_bytes, 8).astype(np.float32))

    bits = _per_byte(frame.bits)
    dxor = _per_byte(frame.delta)     # Δ
    d2xor = _per_byte(frame.delta2)   # Δ²
    boundary = torch.from_numpy(
        frame.byte_boundary_score.reshape(n_bytes, 1).astype(np.float32)
    )

    # Per-byte 8-bit spectrum: rfft of 8 samples gives 5 bins, split 3 low / 2 high.
    spectrum = torch.fft.rfft(bits, dim=1).abs()
    low = spectrum[:, :3].sum(1, keepdim=True)
    high = spectrum[:, 3:].sum(1, keepdim=True)
    fourier = torch.cat([low, high], dim=1) / (low + high + 1e-6)

    # Phase: map each byte value onto the unit circle.
    byte_vals = torch.from_numpy(np.frombuffer(data, dtype=np.uint8).astype(np.float32))
    angle = byte_vals / 256.0 * (2.0 * math.pi)
    phasor = torch.stack([torch.cos(angle), torch.sin(angle)], dim=1)

    parts = [dxor, d2xor, boundary, fourier, phasor]
    if include_bits:
        parts.insert(0, bits)
    return torch.cat(parts, dim=1), angle
162
+
163
+
164
class Embedding(nn.Module):
    """Parameter-free nn.Module wrapper around `embed`, usable like nn.Embedding.

        self.hsl = hsl.Embedding()
        feats = self.hsl(b"...")                            # [L, 21] (include_bits=True -> [L, 29])
        feats, phase = self.hsl(b"...", return_phase=True)
    """

    def __init__(self, include_bits: bool = False):
        super().__init__()
        self.include_bits = include_bits
        # Feature width matches what `embed` returns for this setting.
        if include_bits:
            self.out_dim = FEAT_DIM_FULL
        else:
            self.out_dim = FEAT_DIM

    def forward(self, data: bytes, return_phase: bool = False):
        """Embed one byte string; also return the phase angles when `return_phase` is True."""
        feats, phase = embed(data, self.include_bits)
        if return_phase:
            return feats, phase
        return feats

    def pack(self, byte_list: list[bytes], max_len: int):
        """list[bytes] → feats[B,L,out_dim], phase[B,L], mask[B,L] (pad/truncate to max_len).

        Rows shorter than `max_len` are zero-padded and longer rows are
        truncated. `mask` is 1.0 on filled positions and 0.0 on padding.
        """
        batch = len(byte_list)
        feats = torch.zeros(batch, max_len, self.out_dim)
        phase = torch.zeros(batch, max_len)
        mask = torch.zeros(batch, max_len)
        for row, item in enumerate(byte_list):
            item_feats, item_phase = embed(item, self.include_bits)
            keep = min(item_feats.shape[0], max_len)
            feats[row, :keep] = item_feats[:keep]
            phase[row, :keep] = item_phase[:keep]
            mask[row, :keep] = 1.0
        return feats, phase, mask
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: hsl-embedding
3
+ Version: 0.1.0
4
+ Summary: HSL (Holistic Signal Language): a non-learned, byte-level signal encoder for PyTorch — change-rate features, no tokenizer, losslessly invertible.
5
+ Project-URL: Homepage, https://github.com/Woojiggun/holo-hsl
6
+ Project-URL: Paper, https://doi.org/10.5281/zenodo.20581805
7
+ Project-URL: Demo, https://holo-demo-p5txmh4dda-as.a.run.app
8
+ Author-email: Jinhyun Woo <ggunio5782@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: byte-native,change-rate,embedding,multimodal,pytorch,signal,tokenizer-free
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.9
18
+ Requires-Dist: numpy>=1.21
19
+ Requires-Dist: torch>=1.12
20
+ Description-Content-Type: text/markdown
21
+
22
+ # HSL — Holistic Signal Language
23
+
24
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.20581805.svg)](https://doi.org/10.5281/zenodo.20581805)
25
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
26
+
27
+ **A non-learned, byte-level signal encoder for PyTorch.** Instead of splitting text into tokens, it reads
28
+ raw bytes *holistically as signal*: bits, change-rate (Δ, XOR-delta), 2nd-order change (Δ²), boundary,
29
+ Fourier bands, and exact complex phase — 21 dimensions per byte by default (29 with the raw bits), losslessly invertible. One
30
+ modality-agnostic input layer for text, image, audio, video — any byte stream.
31
+
32
+ > Everything is information — a fluctuation between 0 and 1. HSL doesn't ask *what a token means*; it
33
+ > measures *how the signal changes*, with exact formulas, so the same representation works under every modality.
34
+
35
+ ```python
36
+ import hsl_embedding as hsl
37
+
38
+ feats, phase = hsl.embed(b"hello") # -> Tensor [L, 21], Tensor [L]
39
+ emb = hsl.Embedding() # an nn.Module, no parameters (like nn.Embedding)
40
+ feats = emb("강아지".encode()) # -> [L, 21]
41
+ assert hsl.decode(hsl.encode(b"hello")) == b"hello" # lossless, by construction
42
+ ```
43
+
44
+ ## Install
45
+
46
+ ```bash
47
+ pip install hsl-embedding # distribution name; import as `import hsl_embedding as hsl`
48
+ # deps: numpy, torch
49
+ ```
50
+
51
+ ## Why not just `nn.Embedding`?
52
+
53
+ They solve **different problems** — this is *not* a performance claim, it's a "when to use which".
54
+
55
+ | | `torch.nn.Embedding` | `hsl.Embedding` |
56
+ |---|---|---|
57
+ | what it is | a **learned lookup table** (trainable params) | an **exact formula** (zero params, deterministic) |
58
+ | input | a token id (`int`) | raw `bytes` |
59
+ | needs | a tokenizer + fixed vocab + training data | nothing — works on any bytes, day one |
60
+ | dimensions | opaque, learned | **named & interpretable** (Δ / Δ² / boundary / Fourier / phase) |
61
+ | modality | one tokenizer per modality (text ≠ image ≠ audio) | **one substrate for all** (byte-native) |
62
+ | invertible | no | **yes** (`decode(encode(x)) == x`) |
63
+ | new scripts / formats | breaks / out-of-vocab | just bytes — never breaks |
64
+
65
+ **They compose.** HSL is an *input substrate*, not a replacement for learned representations: `nn.Embedding`
66
+ learns *what tokens mean*; HSL gives *exact structural signal* for free. Stack learned layers **on top** of
67
+ HSL features.
68
+
69
+ **Reach for HSL when** you want: tokenizer-free input · one model across modalities · structure/change-aware
70
+ features · exact reconstruction · small-data or from-scratch training · interpretable input channels.
71
+
72
+ ## What each channel captures (and where it's good)
73
+
74
+ HSL is built from **exact formulas**, each chosen to carry information a plain learned embedding tends to
75
+ throw away. The default is **21-D** — the pure change-rate substrate, one row per channel:
76
+
77
+ | channel (dims) | exact formula | captures | especially good for |
78
+ |---|---|---|---|
79
+ | **Δ** `dxor` 0–7 (8) | `XOR(bitₜ, bitₜ₋₁)` from origin 0 | **change / transitions** — *where the signal flips* | edges, topic/region shifts, the modality-shared "rate of change". *Measured: shift-detection AUC **0.725** vs content **0.698**.* |
80
+ | **Δ²** `d2xor` 0–7 (8) | `XOR(Δₜ, Δₜ₋₁)` | **acceleration of change** (2nd order) — *a partial-derivative boundary* | sharp **boundaries / corners / onsets**; where the rate-of-change itself jumps (segment cuts, audio attacks, image corners) |
81
+ | **boundary** (1) | `\|Δ\| + 0.5\|Δ²\|`, averaged over a ±4-bit window at each byte start (the `0.25·HF` term is not applied in 0.1.0) | **transition-energy peaks** | **tokenizer-free segmentation** — natural byte/word/chunk cuts without decoding |
82
+ | **Fourier** low/high (2) | per-byte 8-bit rFFT amplitude bands | **frequency / texture / periodicity** | smooth vs busy, periodic vs random — audio timbre, image texture, repetitive vs novel content |
83
+ | **phase** cos/sin (2) | exact phasor `z = e^{iθ}, θ = 2π·byte/256` | **cyclic relation / angle** — exact `cos(θᵢ−θⱼ)` | **affect / mood** and relative/positional structure. *Measured: phase-variation tracks the audio affect-line **0.912**, better than loudness alone.* |
84
+
85
+ The point: a single learned vector blurs all of this together. HSL keeps **change (Δ), curvature (Δ²),
86
+ spectrum (Fourier), and phase** as separate, exact, interpretable channels — and adds them only where a
87
+ modality needs them.
88
+
89
+ *Legacy 29-D:* `include_bits=True` prepends the 8 raw byte bits. They're **redundant** (Δ-from-origin-0
90
+ already encodes the bytes losslessly), included only to match the original trained HoLo model.
91
+
92
+ ## Lossless by construction
93
+
94
+ The features are grounded in a lossless codec, so the substrate is byte-exact:
95
+
96
+ ```python
97
+ frame = hsl.encode(b"any bytes \x00\xff")
98
+ hsl.decode(frame) == b"any bytes \x00\xff" # True
99
+ ```
100
+ Δ-from-origin-0 *is* the codec's XOR-delta, so it already encodes the bytes losslessly — which is why the
101
+ raw `bits` channel is redundant and can be dropped.
102
+
103
+ ## 21-D (default) vs 29-D (legacy)
104
+
105
+ ```python
106
+ hsl.embed(data) # 21-D (default; pure change-rate, no redundant bits)
107
+ hsl.embed(data, include_bits=True) # 29-D (also prepend the 8 raw bits — original HoLo model)
108
+ hsl.Embedding(include_bits=True).out_dim # 29
109
+ ```
110
+
111
+ ## Batch
112
+
113
+ ```python
114
+ emb = hsl.Embedding()
115
+ feats, phase, mask = emb.pack([b"a", b"abcdef"], max_len=8) # [B, L, D], [B, L], [B, L]
116
+ ```
117
+
118
+ ## Examples
119
+
120
+ ```bash
121
+ python examples/quickstart.py # bytes in, features out; named channels
122
+ python examples/roundtrip_all.py # text / image / audio / video -> embed -> EXACT reconstruction
123
+ python examples/vs_nn_embedding.py # nn.Embedding vs hsl.Embedding — when to use which
124
+ python examples/benchmark_vs_nn.py # honest capability + speed comparison
125
+ ```
126
+
127
+ `roundtrip_all.py` — one modality-agnostic encoder, lossless by construction:
128
+
129
+ ```
130
+ modality bytes feat shape reconstruction
131
+ ----------------------------------------------------------------
132
+ text (utf-8) 98 (98, 21) EXACT ✓
133
+ image (RGB u8) 3072 (3072, 21) EXACT ✓
134
+ audio (PCM i16) 8000 (8000, 21) EXACT ✓
135
+ video (6 frames) 4608 (4608, 21) EXACT ✓
136
+ ```
137
+
138
+ ## Scope (honest)
139
+
140
+ HSL is a **non-learned input substrate** — a possibility-proof from an independent, single-GPU project, not a
141
+ benchmark-beating system. It gives exact structural signal; the *meaning* still comes from a model you stack on
142
+ top. See the paper and live demo:
143
+
144
+ - 📄 Paper: [A Feasibility Study of Change-Rate-Based Multimodal Unification](https://doi.org/10.5281/zenodo.20581805) (Zenodo)
145
+ - 🌐 Live demo: https://holo-demo-p5txmh4dda-as.a.run.app
146
+ - 💻 HoLo project: https://github.com/Woojiggun/holo-hsl
147
+
148
+ ## License & citation
149
+
150
+ **MIT License — © 2026 Jinhyun Woo (ggunio5782@gmail.com).**
151
+ Free to use, modify, and **distribute, including for commercial use** — the only condition is that the
152
+ copyright notice and attribution to **Jinhyun Woo** are kept. See [LICENSE](LICENSE).
153
+
154
+ ```bibtex
155
+ @software{woo_hsl_2026,
156
+ author = {Jinhyun Woo},
157
+ title = {HSL: a byte-native, modality-agnostic signal embedding},
158
+ year = {2026},
159
+ doi = {10.5281/zenodo.20581805},
160
+ url = {https://github.com/Woojiggun/holo-hsl}
161
+ }
162
+ ```
@@ -0,0 +1,5 @@
1
+ hsl_embedding/__init__.py,sha256=iZ6lUOGuAKHpJyFrbb8hIyzvjkPwcVrTYCiSRCyY2_w,9096
2
+ hsl_embedding-0.1.0.dist-info/METADATA,sha256=6VbfCkXpZpZnsAutXsOEOLYBI3CjBOfgYQe_w9dLo38,8133
3
+ hsl_embedding-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
4
+ hsl_embedding-0.1.0.dist-info/licenses/LICENSE,sha256=9lzK2PkfouwlysrhAYqYUVNv0BOPu0EzNyw_tA1R8mY,1068
5
+ hsl_embedding-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jinhyun Woo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.