hsl-embedding 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hsl_embedding-0.1.0/.gitignore +19 -0
- hsl_embedding-0.1.0/LICENSE +21 -0
- hsl_embedding-0.1.0/PKG-INFO +162 -0
- hsl_embedding-0.1.0/README.md +141 -0
- hsl_embedding-0.1.0/examples/benchmark_vs_nn.py +70 -0
- hsl_embedding-0.1.0/examples/quickstart.py +23 -0
- hsl_embedding-0.1.0/examples/roundtrip_all.py +50 -0
- hsl_embedding-0.1.0/examples/vs_nn_embedding.py +45 -0
- hsl_embedding-0.1.0/hsl_embedding/__init__.py +190 -0
- hsl_embedding-0.1.0/pyproject.toml +29 -0
- hsl_embedding-0.1.0/tests/test_hsl.py +62 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jinhyun Woo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hsl-embedding
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: HSL (Holistic Signal Language): a non-learned, byte-level signal encoder for PyTorch — change-rate features, no tokenizer, losslessly invertible.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Woojiggun/holo-hsl
|
|
6
|
+
Project-URL: Paper, https://doi.org/10.5281/zenodo.20581805
|
|
7
|
+
Project-URL: Demo, https://holo-demo-p5txmh4dda-as.a.run.app
|
|
8
|
+
Author-email: Jinhyun Woo <ggunio5782@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: byte-native,change-rate,embedding,multimodal,pytorch,signal,tokenizer-free
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Requires-Dist: numpy>=1.21
|
|
19
|
+
Requires-Dist: torch>=1.12
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# HSL — Holistic Signal Language
|
|
23
|
+
|
|
24
|
+
[DOI: 10.5281/zenodo.20581805](https://doi.org/10.5281/zenodo.20581805)
|
|
25
|
+
[License: MIT](LICENSE)
|
|
26
|
+
|
|
27
|
+
**A non-learned, byte-level signal encoder for PyTorch.** Instead of splitting text into tokens, it reads
|
|
28
|
+
raw bytes *holistically as signal*: bits, change-rate (Δ, XOR-delta), 2nd-order change (Δ²), boundary,
|
|
29
|
+
Fourier bands, and exact complex phase — 21 dimensions per byte by default (29 with the legacy raw bits), losslessly invertible. One
|
|
30
|
+
modality-agnostic input layer for text, image, audio, video — any byte stream.
|
|
31
|
+
|
|
32
|
+
> Everything is information — a fluctuation between 0 and 1. HSL doesn't ask *what a token means*; it
|
|
33
|
+
> measures *how the signal changes*, with exact formulas, so the same representation works under every modality.
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
import hsl_embedding as hsl
|
|
37
|
+
|
|
38
|
+
feats, phase = hsl.embed(b"hello") # -> Tensor [L, 21], Tensor [L]
|
|
39
|
+
emb = hsl.Embedding() # an nn.Module, no parameters (like nn.Embedding)
|
|
40
|
+
feats = emb("강아지".encode()) # -> [L, 21]
|
|
41
|
+
assert hsl.decode(hsl.encode(b"hello")) == b"hello" # lossless, by construction
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Install
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install hsl-embedding # distribution name; import as `import hsl_embedding as hsl`
|
|
48
|
+
# deps: numpy, torch
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Why not just `nn.Embedding`?
|
|
52
|
+
|
|
53
|
+
They solve **different problems** — this is *not* a performance claim, it's a "when to use which".
|
|
54
|
+
|
|
55
|
+
| | `torch.nn.Embedding` | `hsl.Embedding` |
|
|
56
|
+
|---|---|---|
|
|
57
|
+
| what it is | a **learned lookup table** (trainable params) | an **exact formula** (zero params, deterministic) |
|
|
58
|
+
| input | a token id (`int`) | raw `bytes` |
|
|
59
|
+
| needs | a tokenizer + fixed vocab + training data | nothing — works on any bytes, day one |
|
|
60
|
+
| dimensions | opaque, learned | **named & interpretable** (Δ / Δ² / boundary / Fourier / phase) |
|
|
61
|
+
| modality | one tokenizer per modality (text ≠ image ≠ audio) | **one substrate for all** (byte-native) |
|
|
62
|
+
| invertible | no | **yes** (`decode(encode(x)) == x`) |
|
|
63
|
+
| new scripts / formats | breaks / out-of-vocab | just bytes — never breaks |
|
|
64
|
+
|
|
65
|
+
**They compose.** HSL is an *input substrate*, not a replacement for learned representations: `nn.Embedding`
|
|
66
|
+
learns *what tokens mean*; HSL gives *exact structural signal* for free. Stack learned layers **on top** of
|
|
67
|
+
HSL features.
|
|
68
|
+
|
|
69
|
+
**Reach for HSL when** you want: tokenizer-free input · one model across modalities · structure/change-aware
|
|
70
|
+
features · exact reconstruction · small-data or from-scratch training · interpretable input channels.
|
|
71
|
+
|
|
72
|
+
## What each channel captures (and where it's good)
|
|
73
|
+
|
|
74
|
+
HSL is built from **exact formulas**, each chosen to carry information a plain learned embedding tends to
|
|
75
|
+
throw away. The default is **21-D** — the pure change-rate substrate, one row per channel:
|
|
76
|
+
|
|
77
|
+
| channel (dims) | exact formula | captures | especially good for |
|
|
78
|
+
|---|---|---|---|
|
|
79
|
+
| **Δ** `dxor` 0–7 (8) | `XOR(bitₜ, bitₜ₋₁)` from origin 0 | **change / transitions** — *where the signal flips* | edges, topic/region shifts, the modality-shared "rate of change". *Measured: shift-detection AUC **0.725** vs content **0.698**.* |
|
|
80
|
+
| **Δ²** `d2xor` 0–7 (8) | `XOR(Δₜ, Δₜ₋₁)` | **acceleration of change** (2nd order) — *편미분 경계* | sharp **boundaries / corners / onsets**; where the rate-of-change itself jumps (segment cuts, audio attacks, image corners) |
|
|
81
|
+
| **boundary** (1) | `\|Δ\| + 0.5\|Δ²\| + 0.25·HF` | **transition-energy peaks** | **tokenizer-free segmentation** — natural byte/word/chunk cuts without decoding |
|
|
82
|
+
| **Fourier** low/high (2) | per-byte 8-bit rFFT amplitude bands | **frequency / texture / periodicity** | smooth vs busy, periodic vs random — audio timbre, image texture, repetitive vs novel content |
|
|
83
|
+
| **phase** cos/sin (2) | exact phasor `z = e^{iθ}, θ = 2π·byte/256` | **cyclic relation / angle** — exact `cos(θᵢ−θⱼ)` | **affect / mood** and relative/positional structure. *Measured: phase-variation tracks the audio affect-line **0.912**, better than loudness alone.* |
|
|
84
|
+
|
|
85
|
+
The point: a single learned vector blurs all of this together. HSL keeps **change (Δ), curvature (Δ²),
|
|
86
|
+
spectrum (Fourier), and phase** as separate, exact, interpretable channels — and adds them only where a
|
|
87
|
+
modality needs them.
|
|
88
|
+
|
|
89
|
+
*Legacy 29-D:* `include_bits=True` prepends the 8 raw byte bits. They're **redundant** (Δ-from-origin-0
|
|
90
|
+
already encodes the bytes losslessly), included only to match the original trained HoLo model.
|
|
91
|
+
|
|
92
|
+
## Lossless by construction
|
|
93
|
+
|
|
94
|
+
The features are grounded in a lossless codec, so the substrate is byte-exact:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
frame = hsl.encode(b"any bytes \x00\xff")
|
|
98
|
+
hsl.decode(frame) == b"any bytes \x00\xff" # True
|
|
99
|
+
```
|
|
100
|
+
Δ-from-origin-0 *is* the codec's XOR-delta, so it already encodes the bytes losslessly — which is why the
|
|
101
|
+
raw `bits` channel is redundant and can be dropped.
|
|
102
|
+
|
|
103
|
+
## 21-D (default) vs 29-D (legacy)
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
hsl.embed(data) # 21-D (default; pure change-rate, no redundant bits)
|
|
107
|
+
hsl.embed(data, include_bits=True) # 29-D (also prepend the 8 raw bits — original HoLo model)
|
|
108
|
+
hsl.Embedding(include_bits=True).out_dim # 29
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Batch
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
emb = hsl.Embedding()
|
|
115
|
+
feats, phase, mask = emb.pack([b"a", b"abcdef"], max_len=8) # [B, L, D], [B, L], [B, L]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Examples
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
python examples/quickstart.py # bytes in, features out; named channels
|
|
122
|
+
python examples/roundtrip_all.py # text / image / audio / video -> embed -> EXACT reconstruction
|
|
123
|
+
python examples/vs_nn_embedding.py # nn.Embedding vs hsl.Embedding — when to use which
|
|
124
|
+
python examples/benchmark_vs_nn.py # honest capability + speed comparison
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
`roundtrip_all.py` — one modality-agnostic encoder, lossless by construction:
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
modality bytes feat shape reconstruction
|
|
131
|
+
----------------------------------------------------------------
|
|
132
|
+
text (utf-8) 98 (98, 21) EXACT ✓
|
|
133
|
+
image (RGB u8) 3072 (3072, 21) EXACT ✓
|
|
134
|
+
audio (PCM i16) 8000 (8000, 21) EXACT ✓
|
|
135
|
+
video (6 frames) 4608 (4608, 21) EXACT ✓
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Scope (honest)
|
|
139
|
+
|
|
140
|
+
HSL is a **non-learned input substrate** — a possibility-proof from an independent, single-GPU project, not a
|
|
141
|
+
benchmark-beating system. It gives exact structural signal; the *meaning* still comes from a model you stack on
|
|
142
|
+
top. See the paper and live demo:
|
|
143
|
+
|
|
144
|
+
- 📄 Paper: [A Feasibility Study of Change-Rate-Based Multimodal Unification](https://doi.org/10.5281/zenodo.20581805) (Zenodo)
|
|
145
|
+
- 🌐 Live demo: https://holo-demo-p5txmh4dda-as.a.run.app
|
|
146
|
+
- 💻 HoLo project: https://github.com/Woojiggun/holo-hsl
|
|
147
|
+
|
|
148
|
+
## License & citation
|
|
149
|
+
|
|
150
|
+
**MIT License — © 2026 Jinhyun Woo (ggunio5782@gmail.com).**
|
|
151
|
+
Free to use, modify, and **distribute, including for commercial use** — the only condition is that the
|
|
152
|
+
copyright notice and attribution to **Jinhyun Woo** are kept. See [LICENSE](LICENSE).
|
|
153
|
+
|
|
154
|
+
```bibtex
|
|
155
|
+
@software{woo_hsl_2026,
|
|
156
|
+
author = {Jinhyun Woo},
|
|
157
|
+
title = {HSL: a byte-native, modality-agnostic signal embedding},
|
|
158
|
+
year = {2026},
|
|
159
|
+
doi = {10.5281/zenodo.20581805},
|
|
160
|
+
url = {https://github.com/Woojiggun/holo-hsl}
|
|
161
|
+
}
|
|
162
|
+
```
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# HSL — Holistic Signal Language
|
|
2
|
+
|
|
3
|
+
[DOI: 10.5281/zenodo.20581805](https://doi.org/10.5281/zenodo.20581805)
|
|
4
|
+
[License: MIT](LICENSE)
|
|
5
|
+
|
|
6
|
+
**A non-learned, byte-level signal encoder for PyTorch.** Instead of splitting text into tokens, it reads
|
|
7
|
+
raw bytes *holistically as signal*: bits, change-rate (Δ, XOR-delta), 2nd-order change (Δ²), boundary,
|
|
8
|
+
Fourier bands, and exact complex phase — 21 dimensions per byte by default (29 with the legacy raw bits), losslessly invertible. One
|
|
9
|
+
modality-agnostic input layer for text, image, audio, video — any byte stream.
|
|
10
|
+
|
|
11
|
+
> Everything is information — a fluctuation between 0 and 1. HSL doesn't ask *what a token means*; it
|
|
12
|
+
> measures *how the signal changes*, with exact formulas, so the same representation works under every modality.
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
import hsl_embedding as hsl
|
|
16
|
+
|
|
17
|
+
feats, phase = hsl.embed(b"hello") # -> Tensor [L, 21], Tensor [L]
|
|
18
|
+
emb = hsl.Embedding() # an nn.Module, no parameters (like nn.Embedding)
|
|
19
|
+
feats = emb("강아지".encode()) # -> [L, 21]
|
|
20
|
+
assert hsl.decode(hsl.encode(b"hello")) == b"hello" # lossless, by construction
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Install
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install hsl-embedding # distribution name; import as `import hsl_embedding as hsl`
|
|
27
|
+
# deps: numpy, torch
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Why not just `nn.Embedding`?
|
|
31
|
+
|
|
32
|
+
They solve **different problems** — this is *not* a performance claim, it's a "when to use which".
|
|
33
|
+
|
|
34
|
+
| | `torch.nn.Embedding` | `hsl.Embedding` |
|
|
35
|
+
|---|---|---|
|
|
36
|
+
| what it is | a **learned lookup table** (trainable params) | an **exact formula** (zero params, deterministic) |
|
|
37
|
+
| input | a token id (`int`) | raw `bytes` |
|
|
38
|
+
| needs | a tokenizer + fixed vocab + training data | nothing — works on any bytes, day one |
|
|
39
|
+
| dimensions | opaque, learned | **named & interpretable** (Δ / Δ² / boundary / Fourier / phase) |
|
|
40
|
+
| modality | one tokenizer per modality (text ≠ image ≠ audio) | **one substrate for all** (byte-native) |
|
|
41
|
+
| invertible | no | **yes** (`decode(encode(x)) == x`) |
|
|
42
|
+
| new scripts / formats | breaks / out-of-vocab | just bytes — never breaks |
|
|
43
|
+
|
|
44
|
+
**They compose.** HSL is an *input substrate*, not a replacement for learned representations: `nn.Embedding`
|
|
45
|
+
learns *what tokens mean*; HSL gives *exact structural signal* for free. Stack learned layers **on top** of
|
|
46
|
+
HSL features.
|
|
47
|
+
|
|
48
|
+
**Reach for HSL when** you want: tokenizer-free input · one model across modalities · structure/change-aware
|
|
49
|
+
features · exact reconstruction · small-data or from-scratch training · interpretable input channels.
|
|
50
|
+
|
|
51
|
+
## What each channel captures (and where it's good)
|
|
52
|
+
|
|
53
|
+
HSL is built from **exact formulas**, each chosen to carry information a plain learned embedding tends to
|
|
54
|
+
throw away. The default is **21-D** — the pure change-rate substrate, one row per channel:
|
|
55
|
+
|
|
56
|
+
| channel (dims) | exact formula | captures | especially good for |
|
|
57
|
+
|---|---|---|---|
|
|
58
|
+
| **Δ** `dxor` 0–7 (8) | `XOR(bitₜ, bitₜ₋₁)` from origin 0 | **change / transitions** — *where the signal flips* | edges, topic/region shifts, the modality-shared "rate of change". *Measured: shift-detection AUC **0.725** vs content **0.698**.* |
|
|
59
|
+
| **Δ²** `d2xor` 0–7 (8) | `XOR(Δₜ, Δₜ₋₁)` | **acceleration of change** (2nd order) — *편미분 경계* | sharp **boundaries / corners / onsets**; where the rate-of-change itself jumps (segment cuts, audio attacks, image corners) |
|
|
60
|
+
| **boundary** (1) | `\|Δ\| + 0.5\|Δ²\| + 0.25·HF` | **transition-energy peaks** | **tokenizer-free segmentation** — natural byte/word/chunk cuts without decoding |
|
|
61
|
+
| **Fourier** low/high (2) | per-byte 8-bit rFFT amplitude bands | **frequency / texture / periodicity** | smooth vs busy, periodic vs random — audio timbre, image texture, repetitive vs novel content |
|
|
62
|
+
| **phase** cos/sin (2) | exact phasor `z = e^{iθ}, θ = 2π·byte/256` | **cyclic relation / angle** — exact `cos(θᵢ−θⱼ)` | **affect / mood** and relative/positional structure. *Measured: phase-variation tracks the audio affect-line **0.912**, better than loudness alone.* |
|
|
63
|
+
|
|
64
|
+
The point: a single learned vector blurs all of this together. HSL keeps **change (Δ), curvature (Δ²),
|
|
65
|
+
spectrum (Fourier), and phase** as separate, exact, interpretable channels — and adds them only where a
|
|
66
|
+
modality needs them.
|
|
67
|
+
|
|
68
|
+
*Legacy 29-D:* `include_bits=True` prepends the 8 raw byte bits. They're **redundant** (Δ-from-origin-0
|
|
69
|
+
already encodes the bytes losslessly), included only to match the original trained HoLo model.
|
|
70
|
+
|
|
71
|
+
## Lossless by construction
|
|
72
|
+
|
|
73
|
+
The features are grounded in a lossless codec, so the substrate is byte-exact:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
frame = hsl.encode(b"any bytes \x00\xff")
|
|
77
|
+
hsl.decode(frame) == b"any bytes \x00\xff" # True
|
|
78
|
+
```
|
|
79
|
+
Δ-from-origin-0 *is* the codec's XOR-delta, so it already encodes the bytes losslessly — which is why the
|
|
80
|
+
raw `bits` channel is redundant and can be dropped.
|
|
81
|
+
|
|
82
|
+
## 21-D (default) vs 29-D (legacy)
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
hsl.embed(data) # 21-D (default; pure change-rate, no redundant bits)
|
|
86
|
+
hsl.embed(data, include_bits=True) # 29-D (also prepend the 8 raw bits — original HoLo model)
|
|
87
|
+
hsl.Embedding(include_bits=True).out_dim # 29
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Batch
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
emb = hsl.Embedding()
|
|
94
|
+
feats, phase, mask = emb.pack([b"a", b"abcdef"], max_len=8) # [B, L, D], [B, L], [B, L]
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Examples
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
python examples/quickstart.py # bytes in, features out; named channels
|
|
101
|
+
python examples/roundtrip_all.py # text / image / audio / video -> embed -> EXACT reconstruction
|
|
102
|
+
python examples/vs_nn_embedding.py # nn.Embedding vs hsl.Embedding — when to use which
|
|
103
|
+
python examples/benchmark_vs_nn.py # honest capability + speed comparison
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
`roundtrip_all.py` — one modality-agnostic encoder, lossless by construction:
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
modality bytes feat shape reconstruction
|
|
110
|
+
----------------------------------------------------------------
|
|
111
|
+
text (utf-8) 98 (98, 21) EXACT ✓
|
|
112
|
+
image (RGB u8) 3072 (3072, 21) EXACT ✓
|
|
113
|
+
audio (PCM i16) 8000 (8000, 21) EXACT ✓
|
|
114
|
+
video (6 frames) 4608 (4608, 21) EXACT ✓
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Scope (honest)
|
|
118
|
+
|
|
119
|
+
HSL is a **non-learned input substrate** — a possibility-proof from an independent, single-GPU project, not a
|
|
120
|
+
benchmark-beating system. It gives exact structural signal; the *meaning* still comes from a model you stack on
|
|
121
|
+
top. See the paper and live demo:
|
|
122
|
+
|
|
123
|
+
- 📄 Paper: [A Feasibility Study of Change-Rate-Based Multimodal Unification](https://doi.org/10.5281/zenodo.20581805) (Zenodo)
|
|
124
|
+
- 🌐 Live demo: https://holo-demo-p5txmh4dda-as.a.run.app
|
|
125
|
+
- 💻 HoLo project: https://github.com/Woojiggun/holo-hsl
|
|
126
|
+
|
|
127
|
+
## License & citation
|
|
128
|
+
|
|
129
|
+
**MIT License — © 2026 Jinhyun Woo (ggunio5782@gmail.com).**
|
|
130
|
+
Free to use, modify, and **distribute, including for commercial use** — the only condition is that the
|
|
131
|
+
copyright notice and attribution to **Jinhyun Woo** are kept. See [LICENSE](LICENSE).
|
|
132
|
+
|
|
133
|
+
```bibtex
|
|
134
|
+
@software{woo_hsl_2026,
|
|
135
|
+
author = {Jinhyun Woo},
|
|
136
|
+
title = {HSL: a byte-native, modality-agnostic signal embedding},
|
|
137
|
+
year = {2026},
|
|
138
|
+
doi = {10.5281/zenodo.20581805},
|
|
139
|
+
url = {https://github.com/Woojiggun/holo-hsl}
|
|
140
|
+
}
|
|
141
|
+
```
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""HSL vs torch.nn.Embedding — an honest benchmark (capabilities + a few real measurements).
|
|
2
|
+
|
|
3
|
+
This is NOT a "HSL is better" pitch. They are different tools:
|
|
4
|
+
nn.Embedding is a fast learned lookup table; HSL is an exact, invertible, modality-agnostic signal.
|
|
5
|
+
We report what each *can* and *cannot* do, and we're upfront that nn.Embedding is faster at raw lookup.
|
|
6
|
+
"""
|
|
7
|
+
import time

import numpy as np
import torch
import torch.nn as nn

import hsl_embedding as hsl

blob = (np.random.RandomState(0).rand(20000) * 256).astype(np.uint8).tobytes()  # 20,000 random bytes
D = hsl.FEAT_DIM  # 21 (default change-rate substrate)
N_REPEATS = 20  # number of timed passes over `blob` in the throughput section

nn_emb = nn.Embedding(256, D)  # smallest fair vocab = 256 byte values
hsl_emb = hsl.Embedding()


# ---- 1) capability matrix -------------------------------------------------------------
def yn(b):
    """Render a truthy/falsy capability flag as the table cell text."""
    return "yes ✓" if b else "no ✗"


print("capability nn.Embedding hsl.Embedding")
print("-" * 70)
# Count the lookup table's entries from the module itself instead of re-deriving 256 * D by hand.
nn_params = sum(p.numel() for p in nn_emb.parameters())
rows = [
    ("learnable parameters", f"{nn_params:,} (trained)", "0 (formula)"),
    ("needs a tokenizer / vocab", "yes", "no (raw bytes)"),
    ("meaningful before any training", yn(False), yn(True)),
    ("one encoder across modalities", f"{yn(False)} (per-modality)", yn(True)),
    ("handles any of 256 byte values", "only if in vocab", f"{yn(True)} (all)"),
    ("invertible (reconstruct input)", yn(False), f"{yn(True)} (lossless)"),
    ("interpretable dims", f"{yn(False)} (opaque)", f"{yn(True)} (Δ/Δ²/FFT/phase)"),
]
for a, b, c in rows:
    print(f"{a:34} {b:19} {c}")

# ---- 2) reconstruction: can you get the input back? -----------------------------------
restored = hsl.decode(hsl.encode(blob))
# Only claim "(exact)" when the round trip really matched.
recon = "0.0 (exact)" if restored == blob else "1.0 (MISMATCH)"
print(f"\nreconstruction error HSL: {recon} "
      f"nn.Embedding: N/A (a learned vector cannot be inverted to the input)")

# ---- 3) unseen value: nn.Embedding with a smaller vocab breaks; HSL never does --------
small = nn.Embedding(128, D)  # vocab only covers bytes 0..127
try:
    small(torch.tensor([200]))  # byte 200 -> out of range
except (IndexError, RuntimeError):
    # An out-of-range index is the failure being demonstrated; anything else should surface.
    nn_ok = False
else:
    nn_ok = True
hsl.embed(bytes([200]))  # always fine
print(f"unseen byte (200) with vocab=128 nn.Embedding: {'ok' if nn_ok else 'IndexError ✗'} HSL: ok ✓")

# ---- 4) throughput: a one-time input transform (NOT a like-for-like race) -------------
# nn.Embedding does a memory lookup; HSL *computes* an exact signal. These are different jobs,
# so this is not a fair head-to-head — HSL is a feature transform you run once and cache, the way
# you would any preprocessing. We report its throughput for context, not as a competition.
hsl_emb(blob)  # warm up
t0 = time.perf_counter()
for _ in range(N_REPEATS):
    hsl_emb(blob)
per_call = (time.perf_counter() - t0) / N_REPEATS
# Derive MB/s from the real payload size rather than a hard-coded "20 KB".
mbps = len(blob) / per_call / (1024 * 1024)
print(f"\nHSL feature-extraction throughput: ~{mbps:.1f} MB/s (one-time transform; cache and reuse)")
print("nn.Embedding is a table lookup, not a signal computation — speed isn't a meaningful comparison.")

print("""
Takeaway
--------
nn.Embedding -> a fast learned lookup; needs a vocab + training; one per modality.
hsl.Embedding -> zero params, no training, one substrate for every modality, exact & invertible,
interpretable channels. It computes a signal (so it's a one-time input transform,
not a lookup). Use HSL for the input layer; learn meaning on top.
""")
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Quickstart — bytes in, signal features out. No tokenizer, no training."""
|
|
2
|
+
import hsl_embedding as hsl

# 1) functional: any bytes -> [L, 21] features + [L] phase (21-D is the default layout)
feats, phase = hsl.embed("변화율이 공통 언어다".encode())
print("feats", tuple(feats.shape), "| phase", tuple(phase.shape))

# 2) as an nn.Module (no parameters) — drop into a model like nn.Embedding
emb = hsl.Embedding()  # lean 21-D by default; Embedding(include_bits=True) for the legacy 29-D
print("out_dim", emb.out_dim)
x = emb(b"\x89PNG\r\n\x1a\n")  # works on image bytes just the same
print("image bytes ->", tuple(x.shape))

# 3) named channels — read what each dimension means
# The names must describe the same layout as `feats`: embed() above used the default
# (no raw bits), so ask for the default 21 names. Zipping the 29 full names against a
# 21-value row would label every value with the wrong channel.
names = hsl.feat_names()
row0 = feats[0]
for name, val in list(zip(names, row0.tolist()))[:16]:  # Δ (dims 0-7) and Δ² (dims 8-15)
    print(f" {name:8} {val:+.0f}")

# 4) lossless — the substrate is byte-exact
b = b"round trip \x00\xff"
assert hsl.decode(hsl.encode(b)) == b
print("lossless:", True)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""One encoder, any modality, exact reconstruction.
|
|
2
|
+
|
|
3
|
+
HSL reads text / image / audio / video as the SAME thing — bytes — and its substrate is lossless,
|
|
4
|
+
so the original comes back *exactly*. Here we embed each modality and rebuild it straight from the
|
|
5
|
+
embedding's Δ (change-rate) channel. No tokenizer, no per-modality code, no information lost.
|
|
6
|
+
(Self-contained: samples are synthesized with numpy; no extra dependencies.)
|
|
7
|
+
"""
|
|
8
|
+
import numpy as np
|
|
9
|
+
import torch
|
|
10
|
+
import hsl_embedding as hsl
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def restore_from_embedding(feats: torch.Tensor) -> bytes:
    """Rebuild the original bytes from the embedding's Δ channel.

    Δ-from-origin-0 is a lossless re-encoding of the bit stream: each Δ bit is the XOR of a
    bit with its predecessor (the first predecessor being 0), so a running XOR over the
    flattened Δ bits integrates it back to the original bits.

    Args:
        feats: ``[L, D]`` feature tensor whose first 8 columns are the per-bit Δ (``dxor0..7``)
            values. This is the default 21-D layout; with the 29-D layout the first 8 columns
            hold raw bits instead, so slice those off before calling.

    Returns:
        The ``L`` reconstructed bytes.
    """
    # Flatten to one MSB-first bit stream; round() guards against float noise around 0/1.
    dxor = feats[:, 0:8].reshape(-1).round().to(torch.uint8).detach().cpu().numpy()
    # Running XOR from origin 0 — the vectorized form of `prev ^= v; bits[i] = prev`.
    bits = np.bitwise_xor.accumulate(dxor)
    return np.packbits(bits, bitorder="big").tobytes()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# --- one synthetic sample per modality, each kept as its natural raw bytes -------------
text = "변화율은 모든 모달리티의 공통 언어다. Everything is a fluctuation between 0 and 1.".encode("utf-8")
image = (np.add.outer(np.arange(32), np.arange(32)) % 256).astype(np.uint8)  # 32x32 gradient
image = np.stack([image, image[::-1], image.T], -1).astype(np.uint8)  # 32x32x3 RGB
audio = (np.sin(np.linspace(0, 50 * np.pi, 4000)) * 30000).astype(np.int16)  # 4000-sample tone
video = (np.random.RandomState(0).rand(6, 16, 16, 3) * 255).astype(np.uint8)  # 6 frames 16x16 RGB

# label -> (raw bytes fed to the encoder, original object to compare the rebuild against)
samples = {
    "text (utf-8)": (text, text),
    "image (RGB u8)": (image.tobytes(), image),
    "audio (PCM i16)": (audio.tobytes(), audio),
    "video (6 frames)": (video.tobytes(), video),
}

print(f"{'modality':18} {'bytes':>8} {'feat shape':>14} reconstruction")
print("-" * 64)
for name, (raw, original) in samples.items():
    # Same call for every modality; the default embedding is [L, 21].
    feats, _ = hsl.embed(raw)
    # Rebuild the byte stream from the features alone.
    restored = restore_from_embedding(feats)
    exact = restored == raw
    if isinstance(original, np.ndarray):
        # Reinterpret the bytes as the native array and compare element-for-element.
        rebuilt = np.frombuffer(restored, dtype=original.dtype).reshape(original.shape)
        exact = exact and np.array_equal(rebuilt, original)
    if exact:
        verdict = "EXACT ✓"
    else:
        verdict = "MISMATCH ✗"
    print(f"{name:18} {len(raw):>8} {str(tuple(feats.shape)):>14} {verdict}")

print("\nOne modality-agnostic encoder. Lossless by construction — embed, then restore the original exactly.")
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""hsl.Embedding vs torch.nn.Embedding — *when to use which* (not a performance comparison).
|
|
2
|
+
|
|
3
|
+
nn.Embedding : token id -> learned vector. Needs a tokenizer + vocab + training. One per modality.
|
|
4
|
+
hsl.Embedding : raw bytes -> exact signal features. No tokenizer, no params, works across modalities.
|
|
5
|
+
They compose: stack nn layers ON TOP of HSL features.
|
|
6
|
+
"""
|
|
7
|
+
import torch
import torch.nn as nn

import hsl_embedding as hsl

# Three tiny byte payloads standing in for three modalities.
text = "강아지".encode("utf-8")
image_bytes = bytes([0, 0, 5, 250, 255, 250, 5, 0])  # a tiny 1-D "edge"
audio_bytes = bytes([128, 130, 126, 160, 96, 200, 56])  # a tiny "transient"

# --- nn.Embedding: a vocab and a tokenization have to exist first ----------------------
vocab_size = 256
nn_emb = nn.Embedding(vocab_size, 32)  # 256*32 LEARNED params, random until trained
ids = torch.tensor(list(text))  # the tokenization chosen here: one id per byte
nn_shape = tuple(nn_emb(ids).shape)
print("nn.Embedding:", nn_shape, "(learned, random until trained; needs a vocab)")

# --- hsl.Embedding: bytes go straight in -----------------------------------------------
hsl_emb = hsl.Embedding()  # 0 params, deterministic
modalities = (("text", text), ("image", image_bytes), ("audio", audio_bytes))
for name, b in modalities:
    feats = hsl_emb(b)
    print(f"hsl.Embedding({name:5}): {tuple(feats.shape)} - same call, any modality")

# --- named channels that a single learned vector would blur together -------------------
names = hsl.feat_names()  # default 21-D channel names
feats = hsl_emb(audio_bytes)
print("\naudio 'transient' - interpretable channels at byte 3:")
byte3 = feats[3]  # feature row for the fourth byte
for ch in ("dxor0", "d2xor0", "boundary", "fft_high_ratio", "phase_sin"):
    column = names.index(ch)
    print(f" {ch:14} = {byte3[column]:+.3f}")

# --- composing: HSL features feed a learned head ---------------------------------------
layers = [nn.Linear(hsl_emb.out_dim, 64), nn.GELU(), nn.Linear(64, 16)]
head = nn.Sequential(*layers)
out = head(hsl_emb(text))  # learn meaning on top of exact signal
print("\nHSL -> learned head:", tuple(out.shape))

print("""
Rule of thumb
-------------
nn.Embedding -> fixed vocab, lots of data, you want learned semantics.
hsl.Embedding -> tokenizer-free, cross-modal, structure/change-aware input, exact & invertible.
Best together -> HSL for the input substrate, nn layers for the meaning.
""")
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""HSL — Holistic Signal Language: a non-learned, byte-level signal embedding (codec + encoder in one).
|
|
2
|
+
|
|
3
|
+
Everything is information — a fluctuation between 0 and 1. HSL turns raw bytes into a compact,
|
|
4
|
+
*change-rate-based* feature signal that any modality (text, image, audio, video, sensor) shares,
|
|
5
|
+
with no tokenizer and no learned parameters. The representation is grounded in a lossless codec,
|
|
6
|
+
so `decode(encode(x)) == x` — the substrate is byte-exact by construction.
|
|
7
|
+
|
|
8
|
+
The 21-D per-byte feature (FEAT_DIM) — the pure change-rate substrate:
|
|
9
|
+
dxor0..7 (8) Δ change-rate — XOR-delta from origin 0 (losslessly encodes the bytes)
|
|
10
|
+
d2xor0..7 (8) Δ² change-rate-of-change-rate — 2nd XOR-delta
|
|
11
|
+
boundary (1) byte-boundary evidence (|Δ| + 0.5|Δ²| + 0.25·HF)
|
|
12
|
+
fft_low/high (2) per-byte spectral amplitude bands
|
|
13
|
+
phase_cos/sin (2) exact complex phasor z = e^{iθ}, θ = 2π·byte/256
|
|
14
|
+
|
|
15
|
+
The raw 8 bits are NOT included by default: Δ-from-origin-0 already encodes the bytes losslessly,
|
|
16
|
+
so the bits are redundant. Pass include_bits=True for the legacy 29-D (raw bits prepended).
|
|
17
|
+
|
|
18
|
+
import hsl_embedding as hsl
|
|
19
|
+
feats, phase = hsl.embed(b"hello") # [L, 21], [L]
|
|
20
|
+
emb = hsl.Embedding(); feats = emb(b"hello")
|
|
21
|
+
assert hsl.decode(hsl.encode(b"hello")) == b"hello"
|
|
22
|
+
|
|
23
|
+
Author: Jinhyun Woo (ggunio5782@gmail.com). MIT-licensed; no learned weights included.
|
|
24
|
+
"""
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
import math
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
from typing import Iterable
|
|
29
|
+
|
|
30
|
+
import numpy as np
|
|
31
|
+
import torch
|
|
32
|
+
import torch.nn as nn
|
|
33
|
+
|
|
34
|
+
# Public API of the package: the names exported by `from hsl_embedding import *`.
__all__ = [
    "FEAT_DIM",
    "FEAT_DIM_FULL",
    "FEAT_NAMES",
    "FEAT_NAMES_FULL",
    "feat_names",
    "ORIGIN_BIT",
    "CLOSURE_BIT",
    "HSLFrame",
    "encode",
    "decode",
    "embed",
    "Embedding",
]
|
|
36
|
+
|
|
37
|
+
# Bit that precedes the stream: the XOR-delta is taken relative to this origin.
ORIGIN_BIT = 0
# Bit appended after the payload by encode() and verified by decode().
CLOSURE_BIT = 1

# Width of the default per-byte feature vector (Δ, Δ², boundary, Fourier, phase).
FEAT_DIM = 21
# Width when include_bits=True: the 8 raw bits are prepended to the default 21.
FEAT_DIM_FULL = 29

# Channel names for the 8 raw bits of a byte.
_BITS = ["bit" + str(k) for k in range(8)]
# Channel names for the change-rate substrate, in column order.
_REST = [
    *(f"dxor{k}" for k in range(8)),    # Δ
    *(f"d2xor{k}" for k in range(8)),   # Δ²
    "boundary",
    "fft_low_ratio",
    "fft_high_ratio",
    "phase_cos",
    "phase_sin",
]

FEAT_NAMES = _REST.copy()            # 21 names (default layout)
FEAT_NAMES_FULL = [*_BITS, *_REST]   # 29 names (raw bits first)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def feat_names(include_bits: bool = False):
    """Return a fresh list of feature-channel names.

    With ``include_bits`` true the 8 raw-bit names come first, followed by the
    21 change-rate names; otherwise only the 21 change-rate names are returned.
    """
    if include_bits:
        return _BITS + _REST
    return list(_REST)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# ───────────────────────────── codec (numpy, lossless) ─────────────────────────────
|
|
54
|
+
@dataclass(frozen=True, eq=False)
class HSLFrame:
    """Immutable container for the arrays produced by ``encode``.

    Equality is content-based: two frames are equal when their payload length
    matches and every array field is element-wise identical. The
    dataclass-generated ``__eq__`` is disabled because it compares the ndarray
    fields through tuple equality, which raises ``ValueError`` (ambiguous truth
    value) for arrays with more than one element. Defining ``__eq__`` leaves
    the class unhashable.
    """

    payload_len_bytes: int            # number of payload bytes that were encoded
    bits: np.ndarray                  # payload bitstream
    signal: np.ndarray                # bits + closure (the 0 → … → 1 journey)
    delta: np.ndarray                 # Δ (XOR-delta from origin 0)
    delta2: np.ndarray                # Δ²
    byte_boundary_score: np.ndarray   # per-byte boundary score

    def __eq__(self, other: object) -> bool:
        """Compare two frames field by field using ``np.array_equal``."""
        if not isinstance(other, HSLFrame):
            return NotImplemented
        if self.payload_len_bytes != other.payload_len_bytes:
            return False
        array_fields = ("bits", "signal", "delta", "delta2", "byte_boundary_score")
        return all(
            np.array_equal(getattr(self, name), getattr(other, name))
            for name in array_fields
        )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _bytes_to_bits(data: bytes) -> np.ndarray:
|
|
65
|
+
if not data:
|
|
66
|
+
return np.zeros((0,), dtype=np.uint8)
|
|
67
|
+
return np.unpackbits(np.frombuffer(data, dtype=np.uint8), bitorder="big").astype(np.uint8)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _bits_to_bytes(bits: np.ndarray, n: int) -> bytes:
|
|
71
|
+
need = n * 8
|
|
72
|
+
b = np.asarray(bits[:need], dtype=np.uint8)
|
|
73
|
+
if b.size != need:
|
|
74
|
+
raise ValueError(f"not enough bits: have {b.size}, need {need}")
|
|
75
|
+
return np.packbits(b, bitorder="big")[:n].tobytes() if b.size else b""
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _xor_delta(bits: np.ndarray, origin: int = ORIGIN_BIT) -> np.ndarray:
    """Return the XOR of each element with its predecessor.

    The first element is XOR-ed with ``origin``; an empty input gives an empty
    uint8 array.
    """
    current = np.asarray(bits, dtype=np.uint8)
    # Shift the stream one step along axis 0; the wrapped-around first slot is
    # overwritten with the origin below.
    previous = np.roll(current, 1, axis=0)
    if current.size:
        previous[0] = origin
    return (current ^ previous).astype(np.uint8)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _integrate(delta: Iterable[int], origin: int = ORIGIN_BIT) -> np.ndarray:
    """Undo ``_xor_delta``: running XOR of ``delta`` seeded with ``origin``.

    ``out[i] = origin ^ delta[0] ^ ... ^ delta[i]``. This is computed with a
    vectorized XOR scan rather than a per-element Python loop, which matters
    because the stream holds one entry per bit of the payload.

    Args:
        delta: XOR-delta values; an ndarray or any iterable of ints.
        origin: value that preceded the first element when the delta was taken.

    Returns:
        uint8 array of the reconstructed stream (empty for empty input).
    """
    if isinstance(delta, np.ndarray):
        steps = delta.astype(np.uint8, copy=False)
    else:
        # Generic iterables (lists, generators) are materialized in one pass.
        steps = np.fromiter(delta, dtype=np.uint8)
    if steps.size == 0:
        return np.zeros((0,), dtype=np.uint8)
    # XOR is associative, so the origin can be folded in after the scan.
    return np.bitwise_xor.accumulate(steps) ^ np.uint8(origin)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _hf_energy(values: np.ndarray, radius: int = 4) -> np.ndarray:
|
|
96
|
+
values = np.asarray(values, dtype=np.float32)
|
|
97
|
+
if values.size == 0:
|
|
98
|
+
return values
|
|
99
|
+
k = np.ones((radius * 2 + 1,), dtype=np.float32) / float(radius * 2 + 1)
|
|
100
|
+
return np.abs(values - np.convolve(values, k, mode="same")).astype(np.float32)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _byte_boundary(signal: np.ndarray) -> np.ndarray:
    """Per-byte boundary score from transition energy (vectorized, no UTF-8 decoding).

    ``signal`` is the bitstream with one trailing closure bit. For each whole
    byte the score is the mean of ``Δ + 0.5·Δ²`` over the window running from 4
    bits before the byte's first bit to 4 bits after it (clipped to the stream).
    Streams holding less than one whole byte give an empty float32 array.
    """
    byte_count = (signal.size - 1) // 8
    if byte_count <= 0:
        return np.zeros((0,), dtype=np.float32)

    first = _xor_delta(signal)
    second = _xor_delta(first)
    energy = first.astype(np.float32) + 0.5 * second.astype(np.float32)

    byte_starts = np.arange(byte_count) * 8
    win_lo = np.maximum(0, byte_starts - 4)
    win_hi = np.minimum(energy.size, byte_starts + 5)

    # Prefix sums turn every window sum into a single subtraction.
    prefix = np.concatenate([[0.0], np.cumsum(energy)])
    window_sum = prefix[win_hi] - prefix[win_lo]
    window_len = np.maximum(win_hi - win_lo, 1)
    return (window_sum / window_len).astype(np.float32)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def encode(data: bytes) -> HSLFrame:
    """Encode ``data`` into an HSLFrame (lossless; carries the bitstream, Δ, Δ²).

    The payload bits are followed by one closure bit before the XOR-deltas and
    the per-byte boundary scores are derived.
    """
    payload_bits = _bytes_to_bits(data)
    closure = np.asarray([CLOSURE_BIT], dtype=np.uint8)
    journey = np.concatenate([payload_bits, closure])
    first = _xor_delta(journey)
    second = _xor_delta(first)
    scores = _byte_boundary(journey)
    return HSLFrame(
        payload_len_bytes=len(data),
        bits=payload_bits,
        signal=journey,
        delta=first,
        delta2=second,
        byte_boundary_score=scores,
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def decode(frame: HSLFrame) -> bytes:
    """Recover the original bytes from an HSLFrame.

    Integrates ``frame.delta`` from origin 0, verifies that the bit right after
    the payload is the closure bit, then packs the payload bits into bytes.

    Raises:
        ValueError: if the integrated stream is too short or the closure bit
            is not where the payload length says it should be.
    """
    n_bytes = frame.payload_len_bytes
    closure_at = n_bytes * 8  # index of the closure bit in the integrated stream
    recovered = _integrate(frame.delta, origin=ORIGIN_BIT)
    if recovered.size <= closure_at or int(recovered[closure_at]) != CLOSURE_BIT:
        raise ValueError("closure / length check failed")
    return _bits_to_bytes(recovered[:closure_at], n_bytes)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ───────────────────────────── embedding (torch) ─────────────────────────────
|
|
136
|
+
def embed(data: bytes, include_bits: bool = False):
    """Map bytes to ``(feats [L, 21|29], phase [L])``. Deterministic, no learned parameters.

    include_bits=False (default, 21-D): Δ, Δ², boundary, Fourier band ratios, phasor.
    include_bits=True (29-D): the 8 raw bits of each byte are prepended.

    An empty payload is treated as a single zero byte, so ``L`` is at least 1.
    """
    if len(data) == 0:
        data = b"\x00"
    frame = encode(data)
    n = frame.payload_len_bytes

    def _per_byte(stream):
        # Drop the closure position and view the stream as one row of 8 per byte.
        return torch.from_numpy(stream[: n * 8].reshape(n, 8).astype(np.float32))

    raw_bits = _per_byte(frame.bits)
    dxor = _per_byte(frame.delta)     # Δ
    d2xor = _per_byte(frame.delta2)   # Δ²
    boundary = torch.from_numpy(frame.byte_boundary_score.reshape(n, 1).astype(np.float32))

    # Magnitude spectrum of each byte's 8 bits, split into a low and a high band
    # and normalized so the two ratios sum to (almost) 1.
    magnitude = torch.fft.rfft(raw_bits, dim=1).abs()
    low = magnitude[:, :3].sum(1, keepdim=True)
    high = magnitude[:, 3:].sum(1, keepdim=True)
    fourier = torch.cat([low, high], dim=1) / (low + high + 1e-6)

    # Phase θ = 2π·byte/256 and its unit phasor (cos θ, sin θ).
    byte_values = torch.from_numpy(np.frombuffer(data, dtype=np.uint8).astype(np.float32))
    angle = byte_values / 256.0 * (2.0 * math.pi)
    phasor = torch.stack([torch.cos(angle), torch.sin(angle)], dim=1)

    parts = [dxor, d2xor, boundary, fourier, phasor]
    if include_bits:
        parts.insert(0, raw_bits)
    return torch.cat(parts, dim=1), angle   # [n, 29] or [n, 21]
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class Embedding(nn.Module):
    """HSL byte → signal embedding as an nn.Module (no parameters), usable like nn.Embedding.

        self.hsl = hsl.Embedding()
        feats = self.hsl(b"...")                  # [L, 21]  (include_bits=True -> [L, 29])
        feats, phase = self.hsl(b"...", return_phase=True)
    """

    def __init__(self, include_bits: bool = False):
        """Store the layout flag and the matching feature width in ``out_dim``."""
        super().__init__()
        self.include_bits = include_bits
        self.out_dim = FEAT_DIM_FULL if include_bits else FEAT_DIM

    def forward(self, data: bytes, return_phase: bool = False):
        """Embed one byte string; also return the phase when ``return_phase`` is true."""
        feats, phase = embed(data, self.include_bits)
        return (feats, phase) if return_phase else feats

    def pack(self, byte_list: list[bytes], max_len: int):
        """list[bytes] → feats[B,L,out_dim], phase[B,L], mask[B,L] (pad/truncate to max_len).

        Rows past the end of a payload stay zero in all three tensors; ``mask``
        is 1.0 on the rows that hold real features.
        """
        B = len(byte_list)
        feats = torch.zeros(B, max_len, self.out_dim)
        phase = torch.zeros(B, max_len)
        mask = torch.zeros(B, max_len)
        for i, data in enumerate(byte_list):
            # Each feature row depends only on its own byte and the bytes before
            # it, so cutting the payload first gives the same rows as embedding
            # everything and slicing afterwards, without the wasted work on
            # long inputs.
            f, p = embed(data[:max_len], self.include_bits)
            n = min(f.shape[0], max_len)
            feats[i, :n], phase[i, :n], mask[i, :n] = f[:n], p[:n], 1.0
        return feats, phase, mask
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hsl-embedding"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "HSL (Holistic Signal Language): a non-learned, byte-level signal encoder for PyTorch — change-rate features, no tokenizer, losslessly invertible."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Jinhyun Woo", email = "ggunio5782@gmail.com" }]
|
|
13
|
+
keywords = ["embedding", "byte-native", "multimodal", "tokenizer-free", "signal", "change-rate", "pytorch"]
|
|
14
|
+
dependencies = ["numpy>=1.21", "torch>=1.12"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
Homepage = "https://github.com/Woojiggun/holo-hsl"
|
|
25
|
+
Paper = "https://doi.org/10.5281/zenodo.20581805"
|
|
26
|
+
Demo = "https://holo-demo-p5txmh4dda-as.a.run.app"
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = ["hsl_embedding"]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""HSL invariants — run with `pytest`."""
|
|
2
|
+
import torch
|
|
3
|
+
import hsl_embedding as hsl
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Payloads shared by the tests: empty, single bytes, mixed-script UTF-8,
# every byte value once, and a longer repetitive string.
SAMPLES = [
    b"",
    b"\x00",
    b"A",
    "강아지 dog 🐕 0101".encode(),
    bytes(range(256)),
    b"hello world " * 30,
]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_lossless_roundtrip():
    """decode(encode(x)) must return x for every sample, the empty payload included.

    The empty payload is no longer skipped: it encodes to a lone closure bit
    and decodes back to b"", so the invariant is checked at that edge too.
    """
    for payload in SAMPLES:
        assert hsl.decode(hsl.encode(payload)) == payload
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_shapes_and_dims():
    """embed() yields one float32 feature row and one phase value per input byte."""
    features, angles = hsl.embed(b"hello")
    expected_rows = 5
    assert features.shape == (expected_rows, hsl.FEAT_DIM)
    assert angles.shape == (expected_rows,)
    assert features.dtype == torch.float32
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_default_is_lean_21d():
    """Without include_bits the feature width is FEAT_DIM, which is 21."""
    features, _ = hsl.embed(b"hello")  # default drops the raw bits
    width = features.shape[1]
    assert width == hsl.FEAT_DIM
    assert hsl.FEAT_DIM == 21
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_lean_is_full_minus_bits():
    """The 21-D layout equals the 29-D layout with its first 8 columns removed."""
    for sample in SAMPLES:
        payload = sample or b"\x00"
        full, _ = hsl.embed(payload, include_bits=True)
        lean, _ = hsl.embed(payload, include_bits=False)
        assert full.shape[1] == hsl.FEAT_DIM_FULL
        assert hsl.FEAT_DIM_FULL == 29
        assert lean.shape[1] == hsl.FEAT_DIM
        assert hsl.FEAT_DIM == 21
        # The 8 leading columns of the full layout are the raw bits.
        assert torch.allclose(lean, full[:, 8:], atol=1e-6)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_delta_is_lossless_core():
    """Integrating Δ from the origin bit reproduces the signal, so raw bits are redundant."""
    frame = hsl.encode(b"redundancy?")
    rebuilt = hsl._integrate(frame.delta, origin=hsl.ORIGIN_BIT)
    matches = rebuilt == frame.signal
    assert matches.all()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_module_and_pack():
    """Embedding reports the right width, and pack() pads and masks a batch."""
    lean = hsl.Embedding()  # default = 21-D
    assert lean.out_dim == 21
    assert lean(b"test").shape == (4, 21)

    with_bits = hsl.Embedding(include_bits=True)
    assert with_bits.out_dim == 29

    batch = [b"a", b"abcdef"]
    feats, phase, mask = lean.pack(batch, max_len=8)
    assert feats.shape == (2, 8, 21)
    # Each row of the mask sums to the number of real bytes in that payload.
    assert mask.sum(1).tolist() == [1.0, 6.0]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_empty_input_is_safe():
    """An empty payload is embedded as a single zero byte instead of crashing."""
    features, _ = hsl.embed(b"")
    row_count = features.shape[0]
    assert row_count >= 1
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_feat_names():
    """feat_names() returns 29 or 21 names, and the lean layout starts with Δ."""
    with_bits = hsl.feat_names(True)
    lean = hsl.feat_names(False)
    assert len(with_bits) == 29
    assert len(lean) == 21
    # Change-rate is the first channel of the lean substrate.
    assert lean[0] == "dxor0"
|