hsl-embedding 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ # build / packaging
2
+ build/
3
+ dist/
4
+ *.egg-info/
5
+ *.egg
6
+
7
+ # python
8
+ __pycache__/
9
+ *.py[cod]
10
+ .pytest_cache/
11
+ .tox/
12
+ .venv/
13
+ venv/
14
+
15
+ # os / editor
16
+ .DS_Store
17
+ Thumbs.db
18
+ .idea/
19
+ .vscode/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jinhyun Woo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: hsl-embedding
3
+ Version: 0.1.0
4
+ Summary: HSL (Holistic Signal Language): a non-learned, byte-level signal encoder for PyTorch — change-rate features, no tokenizer, losslessly invertible.
5
+ Project-URL: Homepage, https://github.com/Woojiggun/holo-hsl
6
+ Project-URL: Paper, https://doi.org/10.5281/zenodo.20581805
7
+ Project-URL: Demo, https://holo-demo-p5txmh4dda-as.a.run.app
8
+ Author-email: Jinhyun Woo <ggunio5782@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: byte-native,change-rate,embedding,multimodal,pytorch,signal,tokenizer-free
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.9
18
+ Requires-Dist: numpy>=1.21
19
+ Requires-Dist: torch>=1.12
20
+ Description-Content-Type: text/markdown
21
+
22
+ # HSL — Holistic Signal Language
23
+
24
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.20581805.svg)](https://doi.org/10.5281/zenodo.20581805)
25
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
26
+
27
+ **A non-learned, byte-level signal encoder for PyTorch.** Instead of splitting text into tokens, it reads
28
+ raw bytes *holistically as signal*: bits, change-rate (Δ, XOR-delta), 2nd-order change (Δ²), boundary,
29
+ Fourier bands, and exact complex phase — 21 dimensions per byte by default (29 with the legacy raw bits), losslessly invertible. One
30
+ modality-agnostic input layer for text, image, audio, video — any byte stream.
31
+
32
+ > Everything is information — a fluctuation between 0 and 1. HSL doesn't ask *what a token means*; it
33
+ > measures *how the signal changes*, with exact formulas, so the same representation works under every modality.
34
+
35
+ ```python
36
+ import hsl_embedding as hsl
37
+
38
+ feats, phase = hsl.embed(b"hello") # -> Tensor [L, 21], Tensor [L]
39
+ emb = hsl.Embedding() # an nn.Module, no parameters (like nn.Embedding)
40
+ feats = emb("강아지".encode()) # -> [L, 21]
41
+ assert hsl.decode(hsl.encode(b"hello")) == b"hello" # lossless, by construction
42
+ ```
43
+
44
+ ## Install
45
+
46
+ ```bash
47
+ pip install hsl-embedding # distribution name; import as `import hsl_embedding as hsl`
48
+ # deps: numpy, torch
49
+ ```
50
+
51
+ ## Why not just `nn.Embedding`?
52
+
53
+ They solve **different problems** — this is *not* a performance claim, it's a "when to use which".
54
+
55
+ | | `torch.nn.Embedding` | `hsl.Embedding` |
56
+ |---|---|---|
57
+ | what it is | a **learned lookup table** (trainable params) | an **exact formula** (zero params, deterministic) |
58
+ | input | a token id (`int`) | raw `bytes` |
59
+ | needs | a tokenizer + fixed vocab + training data | nothing — works on any bytes, day one |
60
+ | dimensions | opaque, learned | **named & interpretable** (Δ / Δ² / boundary / Fourier / phase) |
61
+ | modality | one tokenizer per modality (text ≠ image ≠ audio) | **one substrate for all** (byte-native) |
62
+ | invertible | no | **yes** (`decode(encode(x)) == x`) |
63
+ | new scripts / formats | breaks / out-of-vocab | just bytes — never breaks |
64
+
65
+ **They compose.** HSL is an *input substrate*, not a replacement for learned representations: `nn.Embedding`
66
+ learns *what tokens mean*; HSL gives *exact structural signal* for free. Stack learned layers **on top** of
67
+ HSL features.
68
+
69
+ **Reach for HSL when** you want: tokenizer-free input · one model across modalities · structure/change-aware
70
+ features · exact reconstruction · small-data or from-scratch training · interpretable input channels.
71
+
72
+ ## What each channel captures (and where it's good)
73
+
74
+ HSL is built from **exact formulas**, each chosen to carry information a plain learned embedding tends to
75
+ throw away. The default is **21-D** — the pure change-rate substrate, one row per channel:
76
+
77
+ | channel (dims) | exact formula | captures | especially good for |
78
+ |---|---|---|---|
79
+ | **Δ** `dxor` 0–7 (8) | `XOR(bitₜ, bitₜ₋₁)` from origin 0 | **change / transitions** — *where the signal flips* | edges, topic/region shifts, the modality-shared "rate of change". *Measured: shift-detection AUC **0.725** vs content **0.698**.* |
80
+ | **Δ²** `d2xor` 0–7 (8) | `XOR(Δₜ, Δₜ₋₁)` | **acceleration of change** (2nd order) — *partial-derivative boundary* | sharp **boundaries / corners / onsets**; where the rate-of-change itself jumps (segment cuts, audio attacks, image corners) |
81
+ | **boundary** (1) | `\|Δ\| + 0.5\|Δ²\| + 0.25·HF` | **transition-energy peaks** | **tokenizer-free segmentation** — natural byte/word/chunk cuts without decoding |
82
+ | **Fourier** low/high (2) | per-byte 8-bit rFFT amplitude bands | **frequency / texture / periodicity** | smooth vs busy, periodic vs random — audio timbre, image texture, repetitive vs novel content |
83
+ | **phase** cos/sin (2) | exact phasor `z = e^{iθ}, θ = 2π·byte/256` | **cyclic relation / angle** — exact `cos(θᵢ−θⱼ)` | **affect / mood** and relative/positional structure. *Measured: phase-variation tracks the audio affect-line **0.912**, better than loudness alone.* |
84
+
85
+ The point: a single learned vector blurs all of this together. HSL keeps **change (Δ), curvature (Δ²),
86
+ spectrum (Fourier), and phase** as separate, exact, interpretable channels — and adds them only where a
87
+ modality needs them.
88
+
89
+ *Legacy 29-D:* `include_bits=True` prepends the 8 raw byte bits. They're **redundant** (Δ-from-origin-0
90
+ already encodes the bytes losslessly), included only to match the original trained HoLo model.
91
+
92
+ ## Lossless by construction
93
+
94
+ The features are grounded in a lossless codec, so the substrate is byte-exact:
95
+
96
+ ```python
97
+ frame = hsl.encode(b"any bytes \x00\xff")
98
+ hsl.decode(frame) == b"any bytes \x00\xff" # True
99
+ ```
100
+ Δ-from-origin-0 *is* the codec's XOR-delta, so it already encodes the bytes losslessly — which is why the
101
+ raw `bits` channel is redundant and can be dropped.
102
+
103
+ ## 21-D (default) vs 29-D (legacy)
104
+
105
+ ```python
106
+ hsl.embed(data) # 21-D (default; pure change-rate, no redundant bits)
107
+ hsl.embed(data, include_bits=True) # 29-D (also prepend the 8 raw bits — original HoLo model)
108
+ hsl.Embedding(include_bits=True).out_dim # 29
109
+ ```
110
+
111
+ ## Batch
112
+
113
+ ```python
114
+ emb = hsl.Embedding()
115
+ feats, phase, mask = emb.pack([b"a", b"abcdef"], max_len=8) # [B, L, D], [B, L], [B, L]
116
+ ```
117
+
118
+ ## Examples
119
+
120
+ ```bash
121
+ python examples/quickstart.py # bytes in, features out; named channels
122
+ python examples/roundtrip_all.py # text / image / audio / video -> embed -> EXACT reconstruction
123
+ python examples/vs_nn_embedding.py # nn.Embedding vs hsl.Embedding — when to use which
124
+ python examples/benchmark_vs_nn.py # honest capability + speed comparison
125
+ ```
126
+
127
+ `roundtrip_all.py` — one modality-agnostic encoder, lossless by construction:
128
+
129
+ ```
130
+ modality bytes feat shape reconstruction
131
+ ----------------------------------------------------------------
132
+ text (utf-8) 98 (98, 21) EXACT ✓
133
+ image (RGB u8) 3072 (3072, 21) EXACT ✓
134
+ audio (PCM i16) 8000 (8000, 21) EXACT ✓
135
+ video (6 frames) 4608 (4608, 21) EXACT ✓
136
+ ```
137
+
138
+ ## Scope (honest)
139
+
140
+ HSL is a **non-learned input substrate** — a possibility-proof from an independent, single-GPU project, not a
141
+ benchmark-beating system. It gives exact structural signal; the *meaning* still comes from a model you stack on
142
+ top. See the paper and live demo:
143
+
144
+ - 📄 Paper: [A Feasibility Study of Change-Rate-Based Multimodal Unification](https://doi.org/10.5281/zenodo.20581805) (Zenodo)
145
+ - 🌐 Live demo: https://holo-demo-p5txmh4dda-as.a.run.app
146
+ - 💻 HoLo project: https://github.com/Woojiggun/holo-hsl
147
+
148
+ ## License & citation
149
+
150
+ **MIT License — © 2026 Jinhyun Woo (ggunio5782@gmail.com).**
151
+ Free to use, modify, and **distribute, including for commercial use** — the only condition is that the
152
+ copyright notice and attribution to **Jinhyun Woo** are kept. See [LICENSE](LICENSE).
153
+
154
+ ```bibtex
155
+ @software{woo_hsl_2026,
156
+ author = {Jinhyun Woo},
157
+ title = {HSL: a byte-native, modality-agnostic signal embedding},
158
+ year = {2026},
159
+ doi = {10.5281/zenodo.20581805},
160
+ url = {https://github.com/Woojiggun/holo-hsl}
161
+ }
162
+ ```
@@ -0,0 +1,141 @@
1
+ # HSL — Holistic Signal Language
2
+
3
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.20581805.svg)](https://doi.org/10.5281/zenodo.20581805)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
5
+
6
+ **A non-learned, byte-level signal encoder for PyTorch.** Instead of splitting text into tokens, it reads
7
+ raw bytes *holistically as signal*: bits, change-rate (Δ, XOR-delta), 2nd-order change (Δ²), boundary,
8
+ Fourier bands, and exact complex phase — 21 dimensions per byte by default (29 with the legacy raw bits), losslessly invertible. One
9
+ modality-agnostic input layer for text, image, audio, video — any byte stream.
10
+
11
+ > Everything is information — a fluctuation between 0 and 1. HSL doesn't ask *what a token means*; it
12
+ > measures *how the signal changes*, with exact formulas, so the same representation works under every modality.
13
+
14
+ ```python
15
+ import hsl_embedding as hsl
16
+
17
+ feats, phase = hsl.embed(b"hello") # -> Tensor [L, 21], Tensor [L]
18
+ emb = hsl.Embedding() # an nn.Module, no parameters (like nn.Embedding)
19
+ feats = emb("강아지".encode()) # -> [L, 21]
20
+ assert hsl.decode(hsl.encode(b"hello")) == b"hello" # lossless, by construction
21
+ ```
22
+
23
+ ## Install
24
+
25
+ ```bash
26
+ pip install hsl-embedding # distribution name; import as `import hsl_embedding as hsl`
27
+ # deps: numpy, torch
28
+ ```
29
+
30
+ ## Why not just `nn.Embedding`?
31
+
32
+ They solve **different problems** — this is *not* a performance claim, it's a "when to use which".
33
+
34
+ | | `torch.nn.Embedding` | `hsl.Embedding` |
35
+ |---|---|---|
36
+ | what it is | a **learned lookup table** (trainable params) | an **exact formula** (zero params, deterministic) |
37
+ | input | a token id (`int`) | raw `bytes` |
38
+ | needs | a tokenizer + fixed vocab + training data | nothing — works on any bytes, day one |
39
+ | dimensions | opaque, learned | **named & interpretable** (Δ / Δ² / boundary / Fourier / phase) |
40
+ | modality | one tokenizer per modality (text ≠ image ≠ audio) | **one substrate for all** (byte-native) |
41
+ | invertible | no | **yes** (`decode(encode(x)) == x`) |
42
+ | new scripts / formats | breaks / out-of-vocab | just bytes — never breaks |
43
+
44
+ **They compose.** HSL is an *input substrate*, not a replacement for learned representations: `nn.Embedding`
45
+ learns *what tokens mean*; HSL gives *exact structural signal* for free. Stack learned layers **on top** of
46
+ HSL features.
47
+
48
+ **Reach for HSL when** you want: tokenizer-free input · one model across modalities · structure/change-aware
49
+ features · exact reconstruction · small-data or from-scratch training · interpretable input channels.
50
+
51
+ ## What each channel captures (and where it's good)
52
+
53
+ HSL is built from **exact formulas**, each chosen to carry information a plain learned embedding tends to
54
+ throw away. The default is **21-D** — the pure change-rate substrate, one row per channel:
55
+
56
+ | channel (dims) | exact formula | captures | especially good for |
57
+ |---|---|---|---|
58
+ | **Δ** `dxor` 0–7 (8) | `XOR(bitₜ, bitₜ₋₁)` from origin 0 | **change / transitions** — *where the signal flips* | edges, topic/region shifts, the modality-shared "rate of change". *Measured: shift-detection AUC **0.725** vs content **0.698**.* |
59
+ | **Δ²** `d2xor` 0–7 (8) | `XOR(Δₜ, Δₜ₋₁)` | **acceleration of change** (2nd order) — *partial-derivative boundary* | sharp **boundaries / corners / onsets**; where the rate-of-change itself jumps (segment cuts, audio attacks, image corners) |
60
+ | **boundary** (1) | `\|Δ\| + 0.5\|Δ²\| + 0.25·HF` | **transition-energy peaks** | **tokenizer-free segmentation** — natural byte/word/chunk cuts without decoding |
61
+ | **Fourier** low/high (2) | per-byte 8-bit rFFT amplitude bands | **frequency / texture / periodicity** | smooth vs busy, periodic vs random — audio timbre, image texture, repetitive vs novel content |
62
+ | **phase** cos/sin (2) | exact phasor `z = e^{iθ}, θ = 2π·byte/256` | **cyclic relation / angle** — exact `cos(θᵢ−θⱼ)` | **affect / mood** and relative/positional structure. *Measured: phase-variation tracks the audio affect-line **0.912**, better than loudness alone.* |
63
+
64
+ The point: a single learned vector blurs all of this together. HSL keeps **change (Δ), curvature (Δ²),
65
+ spectrum (Fourier), and phase** as separate, exact, interpretable channels — and adds them only where a
66
+ modality needs them.
67
+
68
+ *Legacy 29-D:* `include_bits=True` prepends the 8 raw byte bits. They're **redundant** (Δ-from-origin-0
69
+ already encodes the bytes losslessly), included only to match the original trained HoLo model.
70
+
71
+ ## Lossless by construction
72
+
73
+ The features are grounded in a lossless codec, so the substrate is byte-exact:
74
+
75
+ ```python
76
+ frame = hsl.encode(b"any bytes \x00\xff")
77
+ hsl.decode(frame) == b"any bytes \x00\xff" # True
78
+ ```
79
+ Δ-from-origin-0 *is* the codec's XOR-delta, so it already encodes the bytes losslessly — which is why the
80
+ raw `bits` channel is redundant and can be dropped.
81
+
82
+ ## 21-D (default) vs 29-D (legacy)
83
+
84
+ ```python
85
+ hsl.embed(data) # 21-D (default; pure change-rate, no redundant bits)
86
+ hsl.embed(data, include_bits=True) # 29-D (also prepend the 8 raw bits — original HoLo model)
87
+ hsl.Embedding(include_bits=True).out_dim # 29
88
+ ```
89
+
90
+ ## Batch
91
+
92
+ ```python
93
+ emb = hsl.Embedding()
94
+ feats, phase, mask = emb.pack([b"a", b"abcdef"], max_len=8) # [B, L, D], [B, L], [B, L]
95
+ ```
96
+
97
+ ## Examples
98
+
99
+ ```bash
100
+ python examples/quickstart.py # bytes in, features out; named channels
101
+ python examples/roundtrip_all.py # text / image / audio / video -> embed -> EXACT reconstruction
102
+ python examples/vs_nn_embedding.py # nn.Embedding vs hsl.Embedding — when to use which
103
+ python examples/benchmark_vs_nn.py # honest capability + speed comparison
104
+ ```
105
+
106
+ `roundtrip_all.py` — one modality-agnostic encoder, lossless by construction:
107
+
108
+ ```
109
+ modality bytes feat shape reconstruction
110
+ ----------------------------------------------------------------
111
+ text (utf-8) 98 (98, 21) EXACT ✓
112
+ image (RGB u8) 3072 (3072, 21) EXACT ✓
113
+ audio (PCM i16) 8000 (8000, 21) EXACT ✓
114
+ video (6 frames) 4608 (4608, 21) EXACT ✓
115
+ ```
116
+
117
+ ## Scope (honest)
118
+
119
+ HSL is a **non-learned input substrate** — a possibility-proof from an independent, single-GPU project, not a
120
+ benchmark-beating system. It gives exact structural signal; the *meaning* still comes from a model you stack on
121
+ top. See the paper and live demo:
122
+
123
+ - 📄 Paper: [A Feasibility Study of Change-Rate-Based Multimodal Unification](https://doi.org/10.5281/zenodo.20581805) (Zenodo)
124
+ - 🌐 Live demo: https://holo-demo-p5txmh4dda-as.a.run.app
125
+ - 💻 HoLo project: https://github.com/Woojiggun/holo-hsl
126
+
127
+ ## License & citation
128
+
129
+ **MIT License — © 2026 Jinhyun Woo (ggunio5782@gmail.com).**
130
+ Free to use, modify, and **distribute, including for commercial use** — the only condition is that the
131
+ copyright notice and attribution to **Jinhyun Woo** are kept. See [LICENSE](LICENSE).
132
+
133
+ ```bibtex
134
+ @software{woo_hsl_2026,
135
+ author = {Jinhyun Woo},
136
+ title = {HSL: a byte-native, modality-agnostic signal embedding},
137
+ year = {2026},
138
+ doi = {10.5281/zenodo.20581805},
139
+ url = {https://github.com/Woojiggun/holo-hsl}
140
+ }
141
+ ```
@@ -0,0 +1,70 @@
1
+ """HSL vs torch.nn.Embedding — an honest benchmark (capabilities + a few real measurements).
2
+
3
+ This is NOT a "HSL is better" pitch. They are different tools:
4
+ nn.Embedding is a fast learned lookup table; HSL is an exact, invertible, modality-agnostic signal.
5
+ We report what each *can* and *cannot* do, and we're upfront that nn.Embedding is faster at raw lookup.
6
+ """
7
+ import time
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import hsl_embedding as hsl
12
+
13
+ blob = (np.random.RandomState(0).rand(20000) * 256).astype(np.uint8).tobytes() # 20 KB of bytes
14
+ ids = torch.tensor(list(blob))
15
+ D = hsl.FEAT_DIM # 21 (default change-rate substrate)
16
+
17
+ nn_emb = nn.Embedding(256, D) # smallest fair vocab = 256 byte values
18
+ hsl_emb = hsl.Embedding()
19
+
20
+ # ---- 1) capability matrix -------------------------------------------------------------
21
+ def yn(b): return "yes ✓" if b else "no ✗"
22
+ print("capability nn.Embedding hsl.Embedding")
23
+ print("-" * 70)
24
+ rows = [
25
+ ("learnable parameters", f"{256*D:,} (trained)", "0 (formula)"),
26
+ ("needs a tokenizer / vocab", "yes", "no (raw bytes)"),
27
+ ("meaningful before any training", "no ✗", "yes ✓"),
28
+ ("one encoder across modalities", "no ✗ (per-modality)", "yes ✓"),
29
+ ("handles any of 256 byte values", "only if in vocab", "yes ✓ (all)"),
30
+ ("invertible (reconstruct input)", "no ✗", "yes ✓ (lossless)"),
31
+ ("interpretable dims", "no ✗ (opaque)", "yes ✓ (Δ/Δ²/FFT/phase)"),
32
+ ]
33
+ for a, b, c in rows:
34
+ print(f"{a:34} {b:19} {c}")
35
+
36
+ # ---- 2) reconstruction: can you get the input back? -----------------------------------
37
+ restored = hsl.decode(hsl.encode(blob))
38
+ print(f"\nreconstruction error HSL: {0 if restored == blob else 1}.0 (exact) "
39
+ f"nn.Embedding: N/A (a learned vector cannot be inverted to the input)")
40
+
41
+ # ---- 3) unseen value: nn.Embedding with a smaller vocab breaks; HSL never does --------
42
+ small = nn.Embedding(128, D) # vocab only covers bytes 0..127
43
+ try:
44
+ small(torch.tensor([200])) # byte 200 -> out of range
45
+ nn_ok = True
46
+ except Exception:
47
+ nn_ok = False
48
+ hsl.embed(bytes([200])) # always fine
49
+ print(f"unseen byte (200) with vocab=128 nn.Embedding: {'ok' if nn_ok else 'IndexError ✗'} HSL: ok ✓")
50
+
51
# ---- 4) throughput: a one-time input transform (NOT a like-for-like race) -------------
# nn.Embedding does a memory lookup; HSL *computes* an exact signal. These are different jobs,
# so this is not a fair head-to-head — HSL is a feature transform you run once and cache, the way
# you would any preprocessing. We report its throughput for context, not as a competition.
runs = 20
hsl_emb(blob)                                            # warm up
t = time.perf_counter()
for _ in range(runs):
    hsl_emb(blob)
per_run = (time.perf_counter() - t) / runs               # seconds for one pass over blob
# Derive the rate from the real payload size instead of a hard-coded "20" (KB) divided by 1024,
# which mixed decimal and binary units and silently went stale if blob changed. 1 MB = 1e6 bytes.
mbps = len(blob) / per_run / 1e6
print(f"\nHSL feature-extraction throughput: ~{mbps:.1f} MB/s (one-time transform; cache and reuse)")
print("nn.Embedding is a table lookup, not a signal computation — speed isn't a meaningful comparison.")
62
+
63
+ print("""
64
+ Takeaway
65
+ --------
66
+ nn.Embedding -> a fast learned lookup; needs a vocab + training; one per modality.
67
+ hsl.Embedding -> zero params, no training, one substrate for every modality, exact & invertible,
68
+ interpretable channels. It computes a signal (so it's a one-time input transform,
69
+ not a lookup). Use HSL for the input layer; learn meaning on top.
70
+ """)
@@ -0,0 +1,23 @@
1
"""Quickstart — bytes in, signal features out. No tokenizer, no training."""
import hsl_embedding as hsl

# 1) functional: any bytes -> [L, 21] features + [L] phase (21-D is the default layout)
feats, phase = hsl.embed("변화율이 공통 언어다".encode())
print("feats", tuple(feats.shape), "| phase", tuple(phase.shape))

# 2) as an nn.Module (no parameters) — drop into a model like nn.Embedding
emb = hsl.Embedding()                      # 21-D by default; Embedding(include_bits=True) for the legacy 29-D
print("out_dim", emb.out_dim)
x = emb(b"\x89PNG\r\n\x1a\n")             # works on image bytes just the same
print("image bytes ->", tuple(x.shape))

# 3) named channels — read what each dimension means
# feats uses the default 21-D layout, so it must be paired with the matching 21 names.
# Zipping it with the 29-D names (include_bits=True) shifts every label 8 slots against
# its value, so "dxor0" would be printed next to what is really d2xor0.
names = hsl.feat_names()
row0 = feats[0]
for name, val in list(zip(names, row0.tolist()))[:16]:   # Δ (dims 0-7) and Δ² (dims 8-15) channels
    print(f" {name:8} {val:+.0f}")

# 4) lossless — the substrate is byte-exact
b = b"round trip \x00\xff"
assert hsl.decode(hsl.encode(b)) == b
print("lossless:", True)
@@ -0,0 +1,50 @@
1
+ """One encoder, any modality, exact reconstruction.
2
+
3
+ HSL reads text / image / audio / video as the SAME thing — bytes — and its substrate is lossless,
4
+ so the original comes back *exactly*. Here we embed each modality and rebuild it straight from the
5
+ embedding's Δ (change-rate) channel. No tokenizer, no per-modality code, no information lost.
6
+ (Self-contained: samples are synthesized with numpy; no extra dependencies.)
7
+ """
8
+ import numpy as np
9
+ import torch
10
+ import hsl_embedding as hsl
11
+
12
+
13
def restore_from_embedding(feats: torch.Tensor) -> bytes:
    """Rebuild the original bytes straight from the embedding's Δ channel (dxor = first 8 dims).

    Δ-from-origin-0 is a lossless re-encoding: a cumulative XOR integrates it back to the bits.

    Args:
        feats: 2-D tensor with one row per byte; columns 0..7 hold that byte's eight Δ bits
            (values 0/1, possibly stored as floats).

    Returns:
        The reconstructed byte string; ``b""`` when ``feats`` has no rows.
    """
    # detach/cpu make the conversion safe for tensors that carry grad or live on another device
    dxor = feats[:, 0:8].reshape(-1).round().to(torch.uint8).detach().cpu().numpy()
    # A running XOR from origin 0 is exactly a cumulative XOR scan; numpy performs it in C
    # instead of a Python loop over every bit.
    bits = np.bitwise_xor.accumulate(dxor)
    return np.packbits(bits, bitorder="big").tobytes()
22
+
23
+
24
+ # --- synthesize one real sample per modality (as its natural raw bytes) ---------------
25
+ text = "변화율은 모든 모달리티의 공통 언어다. Everything is a fluctuation between 0 and 1.".encode("utf-8")
26
+ image = (np.add.outer(np.arange(32), np.arange(32)) % 256).astype(np.uint8) # 32x32 gradient
27
+ image = np.stack([image, image[::-1], image.T], -1).astype(np.uint8) # 32x32x3 RGB
28
+ audio = (np.sin(np.linspace(0, 50 * np.pi, 4000)) * 30000).astype(np.int16) # 4000-sample tone
29
+ video = (np.random.RandomState(0).rand(6, 16, 16, 3) * 255).astype(np.uint8) # 6 frames 16x16 RGB
30
+
31
+ samples = {
32
+ "text (utf-8)": (text, text),
33
+ "image (RGB u8)": (image.tobytes(), image),
34
+ "audio (PCM i16)": (audio.tobytes(), audio),
35
+ "video (6 frames)":(video.tobytes(), video),
36
+ }
37
+
38
+ print(f"{'modality':18} {'bytes':>8} {'feat shape':>14} reconstruction")
39
+ print("-" * 64)
40
# Embed every sample with the same call, rebuild it from the features, and report whether
# the reconstruction is byte-exact (and, for array samples, array-exact as well).
for name, (raw, original) in samples.items():
    feats, _ = hsl.embed(raw)                                   # ONE call, any modality -> [L, 21] (default layout)
    restored = restore_from_embedding(feats)                    # rebuild straight from the embedding
    exact = restored == raw
    # round-trip back into the modality's native array, bit-for-bit
    if isinstance(original, np.ndarray):
        rebuilt = np.frombuffer(restored, dtype=original.dtype).reshape(original.shape)
        exact = exact and np.array_equal(rebuilt, original)
    print(f"{name:18} {len(raw):>8} {str(tuple(feats.shape)):>14} {'EXACT ✓' if exact else 'MISMATCH ✗'}")
49
+
50
+ print("\nOne modality-agnostic encoder. Lossless by construction — embed, then restore the original exactly.")
@@ -0,0 +1,45 @@
1
+ """hsl.Embedding vs torch.nn.Embedding — *when to use which* (not a performance comparison).
2
+
3
+ nn.Embedding : token id -> learned vector. Needs a tokenizer + vocab + training. One per modality.
4
+ hsl.Embedding : raw bytes -> exact signal features. No tokenizer, no params, works across modalities.
5
+ They compose: stack nn layers ON TOP of HSL features.
6
+ """
7
+ import torch
8
+ import torch.nn as nn
9
+ import hsl_embedding as hsl
10
+
11
+ text = "강아지".encode("utf-8")
12
+ image_bytes = bytes([0, 0, 5, 250, 255, 250, 5, 0]) # a tiny 1-D "edge"
13
+ audio_bytes = bytes([128, 130, 126, 160, 96, 200, 56]) # a tiny "transient"
14
+
15
+ # --- nn.Embedding: you must first define a vocab and tokenize -------------------------
16
+ vocab_size = 256
17
+ nn_emb = nn.Embedding(vocab_size, 32) # 256*32 LEARNED params, random until trained
18
+ ids = torch.tensor(list(text)) # you had to choose a tokenization (here: bytes)
19
+ print("nn.Embedding:", tuple(nn_emb(ids).shape), "(learned, random until trained; needs a vocab)")
20
+
21
+ # --- hsl.Embedding: bytes straight in, meaningful from day one -------------------------
22
+ hsl_emb = hsl.Embedding() # 0 params, deterministic
23
+ for name, b in [("text", text), ("image", image_bytes), ("audio", audio_bytes)]:
24
+ feats = hsl_emb(b)
25
+ print(f"hsl.Embedding({name:5}): {tuple(feats.shape)} - same call, any modality")
26
+
27
+ # --- what HSL gives that a single learned vector blurs together -----------------------
28
+ names = hsl.feat_names() # default 21-D channel names
29
+ feats = hsl_emb(audio_bytes)
30
+ print("\naudio 'transient' - interpretable channels at byte 3:")
31
+ for ch in ("dxor0", "d2xor0", "boundary", "fft_high_ratio", "phase_sin"):
32
+ print(f" {ch:14} = {feats[3, names.index(ch)]:+.3f}")
33
+
34
+ # --- composing: HSL features -> your learned head -------------------------------------
35
+ head = nn.Sequential(nn.Linear(hsl_emb.out_dim, 64), nn.GELU(), nn.Linear(64, 16))
36
+ out = head(hsl_emb(text)) # learn meaning on top of exact signal
37
+ print("\nHSL -> learned head:", tuple(out.shape))
38
+
39
+ print("""
40
+ Rule of thumb
41
+ -------------
42
+ nn.Embedding -> fixed vocab, lots of data, you want learned semantics.
43
+ hsl.Embedding -> tokenizer-free, cross-modal, structure/change-aware input, exact & invertible.
44
+ Best together -> HSL for the input substrate, nn layers for the meaning.
45
+ """)
@@ -0,0 +1,190 @@
1
+ """HSL — Holistic Signal Language: a non-learned, byte-level signal embedding (codec + encoder in one).
2
+
3
+ Everything is information — a fluctuation between 0 and 1. HSL turns raw bytes into a compact,
4
+ *change-rate-based* feature signal that any modality (text, image, audio, video, sensor) shares,
5
+ with no tokenizer and no learned parameters. The representation is grounded in a lossless codec,
6
+ so `decode(encode(x)) == x` — the substrate is byte-exact by construction.
7
+
8
+ The 21-D per-byte feature (FEAT_DIM) — the pure change-rate substrate:
9
+ dxor0..7 (8) Δ change-rate — XOR-delta from origin 0 (losslessly encodes the bytes)
10
+ d2xor0..7 (8) Δ² change-rate-of-change-rate — 2nd XOR-delta
11
+ boundary (1) byte-boundary evidence (|Δ| + 0.5|Δ²| + 0.25·HF)
12
+ fft_low/high (2) per-byte spectral amplitude bands
13
+ phase_cos/sin (2) exact complex phasor z = e^{iθ}, θ = 2π·byte/256
14
+
15
+ The raw 8 bits are NOT included by default: Δ-from-origin-0 already encodes the bytes losslessly,
16
+ so the bits are redundant. Pass include_bits=True for the legacy 29-D (raw bits prepended).
17
+
18
+ import hsl_embedding as hsl
19
+ feats, phase = hsl.embed(b"hello") # [L, 21], [L]
20
+ emb = hsl.Embedding(); feats = emb(b"hello")
21
+ assert hsl.decode(hsl.encode(b"hello")) == b"hello"
22
+
23
+ Author: Jinhyun Woo (ggunio5782@gmail.com). MIT-licensed; no learned weights included.
24
+ """
25
+ from __future__ import annotations
26
+ import math
27
+ from dataclasses import dataclass
28
+ from typing import Iterable
29
+
30
+ import numpy as np
31
+ import torch
32
+ import torch.nn as nn
33
+
34
+ __all__ = ["FEAT_DIM", "FEAT_DIM_FULL", "FEAT_NAMES", "FEAT_NAMES_FULL", "feat_names",
35
+ "ORIGIN_BIT", "CLOSURE_BIT", "HSLFrame", "encode", "decode", "embed", "Embedding"]
36
+
37
+ ORIGIN_BIT = 0 # the "0": origin enabling lossless reconstruction
38
+ CLOSURE_BIT = 1 # the "1": learned closure / end-of-content endpoint
39
+ FEAT_DIM = 21 # default: the pure change-rate substrate (Δ, Δ², boundary, Fourier, phase)
40
+ FEAT_DIM_FULL = 29 # include_bits=True: also prepend the 8 raw bits (redundant with Δ; legacy compat)
41
+ _BITS = [f"bit{i}" for i in range(8)]
42
+ _REST = ([f"dxor{i}" for i in range(8)] # Δ
43
+ + [f"d2xor{i}" for i in range(8)] # Δ²
44
+ + ["boundary", "fft_low_ratio", "fft_high_ratio", "phase_cos", "phase_sin"])
45
+ FEAT_NAMES = list(_REST) # 21 (default — change-rate substrate)
46
+ FEAT_NAMES_FULL = _BITS + _REST # 29 (with raw bits)
47
+
48
+
49
def feat_names(include_bits: bool = False):
    """Return a new list of per-dimension channel names.

    With ``include_bits`` true the eight raw-bit names come first, followed by the
    Δ, Δ², boundary, Fourier and phase names; otherwise only the latter group is returned.
    """
    if include_bits:
        return [*_BITS, *_REST]
    return [*_REST]
51
+
52
+
53
+ # ───────────────────────────── codec (numpy, lossless) ─────────────────────────────
54
@dataclass(frozen=True)
class HSLFrame:
    """Immutable record built by `encode` and consumed by `decode`.

    `decode` reads only `delta` and `payload_len_bytes`; the remaining arrays are
    carried alongside them on the frame.
    """
    payload_len_bytes: int           # number of input bytes (len(data) at encode time)
    bits: np.ndarray                 # the input unpacked to 0/1 values, 8 per byte, MSB first
    signal: np.ndarray               # bits + closure (the 0 → … → 1 journey)
    delta: np.ndarray                # Δ (XOR-delta from origin 0)
    delta2: np.ndarray               # Δ² (XOR-delta applied to `delta`)
    byte_boundary_score: np.ndarray  # per-byte boundary score derived from `signal`
62
+
63
+
64
def _bytes_to_bits(data: bytes) -> np.ndarray:
    """Unpack `data` into a flat uint8 array of 0/1 values, most significant bit first.

    Empty input yields an empty uint8 array of shape (0,).
    """
    if data:
        raw = np.frombuffer(data, dtype=np.uint8)
        unpacked = np.unpackbits(raw, bitorder="big")
        return unpacked.astype(np.uint8)
    return np.zeros((0,), dtype=np.uint8)
68
+
69
+
70
def _bits_to_bytes(bits: np.ndarray, n: int) -> bytes:
    """Pack the first ``8 * n`` entries of `bits` (MSB first) into `n` bytes.

    Raises:
        ValueError: if `bits` holds fewer than ``8 * n`` entries.
    """
    wanted = 8 * n
    head = np.asarray(bits[:wanted], dtype=np.uint8)
    have = head.size
    if have != wanted:
        raise ValueError(f"not enough bits: have {have}, need {wanted}")
    if not have:
        return b""
    packed = np.packbits(head, bitorder="big")
    return packed[:n].tobytes()
76
+
77
+
78
def _xor_delta(bits: np.ndarray, origin: int = ORIGIN_BIT) -> np.ndarray:
    """XOR every element with its predecessor; the first element is XORed with `origin`.

    Returns a uint8 array the same shape as the input (empty in, empty out).
    """
    current = np.asarray(bits, dtype=np.uint8)
    shifted = np.empty_like(current)
    if current.size > 0:
        # shifted is `current` moved one slot to the right, seeded with the origin value
        shifted[0] = origin
        shifted[1:] = current[:-1]
    return (current ^ shifted).astype(np.uint8)
85
+
86
+
87
def _integrate(delta: Iterable[int], origin: int = ORIGIN_BIT) -> np.ndarray:
    """Cumulative XOR of `delta` seeded with `origin` — the inverse of `_xor_delta`.

    Args:
        delta: a flat sequence (or 1-D array) of XOR-delta values.
        origin: the value the running XOR starts from.

    Returns:
        A uint8 array where ``out[i] = origin ^ delta[0] ^ ... ^ delta[i]``; empty input
        gives an empty array of shape (0,).
    """
    # Generic iterables (generators etc.) are materialised once; arrays are used as-is.
    values = delta if isinstance(delta, np.ndarray) else list(delta)
    steps = np.asarray(values, dtype=np.uint8)
    if steps.size == 0:
        return np.zeros((0,), dtype=np.uint8)
    # The scan runs inside numpy instead of a Python loop with one np.uint8 scalar per bit;
    # the origin is XORed into every prefix, so it can be applied once to the whole result.
    return np.bitwise_xor.accumulate(steps) ^ np.uint8(origin)
93
+
94
+
95
def _hf_energy(values: np.ndarray, radius: int = 4) -> np.ndarray:
    """Absolute deviation of each sample from its centred moving average.

    The average uses a box window of ``2 * radius + 1`` samples; positions beyond either
    end of the input contribute zero.

    Args:
        values: 1-D sequence of samples (converted to float32).
        radius: half-width of the averaging window.

    Returns:
        A float32 array with the same length as `values` (empty in, empty out).
    """
    values = np.asarray(values, dtype=np.float32)
    if values.size == 0:
        return values
    width = radius * 2 + 1
    kernel = np.ones((width,), dtype=np.float32) / float(width)
    # mode="same" returns max(len(values), len(kernel)) samples, so an input shorter than the
    # window came back with the wrong length (silently broadcast for size 1, an error for
    # sizes 2..2*radius). Slicing the full convolution always yields exactly len(values)
    # centred samples and matches mode="same" whenever the input is at least window-sized.
    smooth = np.convolve(values, kernel, mode="full")[radius:radius + values.size]
    return np.abs(values - smooth).astype(np.float32)
101
+
102
+
103
def _byte_boundary(signal: np.ndarray) -> np.ndarray:
    """Score each byte boundary by the mean transition energy around it.

    Energy per bit is Δ + 0.5·Δ² of ``signal``. For each byte start (every
    8th bit) the score is the average energy over the window from 4 bits
    before to 4 bits after the start, clipped to the signal. No UTF-8
    decoding is involved. The last element of ``signal`` is not counted
    when determining the number of bytes.

    Returns a float32 array with one score per byte (empty when the signal
    holds no complete byte).
    """
    byte_count = (signal.size - 1) // 8 if signal.size > 1 else 0
    if byte_count == 0:
        return np.zeros((0,), dtype=np.float32)

    first = _xor_delta(signal)
    second = _xor_delta(first)
    energy = first.astype(np.float32) + 0.5 * second.astype(np.float32)

    # Prefix sums turn every window sum into a single subtraction.
    prefix = np.concatenate([[0.0], np.cumsum(energy)])
    byte_starts = np.arange(byte_count) * 8
    left = np.maximum(0, byte_starts - 4)
    right = np.minimum(energy.size, byte_starts + 5)
    window_sum = prefix[right] - prefix[left]
    window_len = np.maximum(right - left, 1)
    return (window_sum / window_len).astype(np.float32)
116
+
117
+
118
def encode(data: bytes) -> HSLFrame:
    """bytes → HSLFrame (lossless; carries the bitstream, Δ, Δ²).

    The payload bits are followed by one closure bit to form the signal;
    Δ is the XOR-delta of the signal and Δ² the XOR-delta of Δ.
    """
    payload_bits = _bytes_to_bits(data)
    closure = np.asarray([CLOSURE_BIT], dtype=np.uint8)
    journey = np.concatenate([payload_bits, closure])
    first = _xor_delta(journey)
    second = _xor_delta(first)
    return HSLFrame(
        payload_len_bytes=len(data),
        bits=payload_bits,
        signal=journey,
        delta=first,
        delta2=second,
        byte_boundary_score=_byte_boundary(journey),
    )
124
+
125
+
126
def decode(frame: HSLFrame) -> bytes:
    """HSLFrame → bytes (integrate Δ from origin 0, check closure, drop it).

    Raises:
        ValueError: if the integrated signal is too short for the declared
            payload length, or the bit following the payload is not the
            closure bit.
    """
    recovered = _integrate(frame.delta, origin=ORIGIN_BIT)
    payload_bits = frame.payload_len_bytes * 8  # the closure bit sits at this index
    if recovered.size <= payload_bits:
        raise ValueError("closure / length check failed")
    if int(recovered[payload_bits]) != CLOSURE_BIT:
        raise ValueError("closure / length check failed")
    return _bits_to_bytes(recovered[:payload_bits], frame.payload_len_bytes)
133
+
134
+
135
+ # ───────────────────────────── embedding (torch) ─────────────────────────────
136
def embed(data: bytes, include_bits: bool = False):
    """bytes → (feats [L, 21|29], phase [L]). Deterministic, non-learned, pure change-rate.

    include_bits=False (default, 21-D): the change-rate substrate — Δ, Δ², boundary, Fourier, phase.
        The raw byte bits are dropped because Δ-from-origin-0 already encodes them losslessly.
    include_bits=True (29-D): also prepend the 8 raw bits — for the original trained HoLo model.

    An empty payload is embedded as a single zero byte, so L is at least 1.
    """
    if len(data) == 0:
        data = b"\x00"
    frame = encode(data)
    n_bytes = frame.payload_len_bytes
    n_bits = n_bytes * 8

    def per_byte(flat, width):
        # One row per byte, as a float32 tensor.
        return torch.from_numpy(flat.reshape(n_bytes, width).astype(np.float32))

    bits = per_byte(frame.bits[:n_bits], 8)
    dxor = per_byte(frame.delta[:n_bits], 8)     # Δ
    d2xor = per_byte(frame.delta2[:n_bits], 8)   # Δ²
    boundary = per_byte(frame.byte_boundary_score, 1)

    # Per-byte 8-bit magnitude spectrum, split into a low band (first three
    # bins) and a high band (remaining bins), each as a share of the total.
    spectrum = torch.fft.rfft(bits, dim=1).abs()
    low = spectrum[:, :3].sum(1, keepdim=True)
    high = spectrum[:, 3:].sum(1, keepdim=True)
    fourier = torch.cat([low, high], dim=1) / (low + high + 1e-6)

    # Exact phase θ: the byte value mapped onto the full circle.
    byte_values = torch.from_numpy(np.frombuffer(data, dtype=np.uint8).astype(np.float32))
    angle = byte_values / 256.0 * (2.0 * math.pi)
    phasor = torch.stack([torch.cos(angle), torch.sin(angle)], dim=1)

    columns = [dxor, d2xor, boundary, fourier, phasor]
    if include_bits:
        columns.insert(0, bits)
    return torch.cat(columns, dim=1), angle  # [n_bytes, 29] or [n_bytes, 21]
162
+
163
+
164
class Embedding(nn.Module):
    """HSL byte → signal embedding as an nn.Module (no parameters), usable like nn.Embedding.

        self.hsl = hsl.Embedding()
        feats = self.hsl(b"...")                    # [L, 21]  (include_bits=True -> [L, 29])
        feats, phase = self.hsl(b"...", return_phase=True)
    """

    def __init__(self, include_bits: bool = False):
        super().__init__()
        self.include_bits = include_bits
        # Width of the feature vector produced per byte.
        self.out_dim = FEAT_DIM_FULL if include_bits else FEAT_DIM

    def forward(self, data: bytes, return_phase: bool = False):
        """Embed one byte string; also return the phase when ``return_phase`` is true."""
        feats, phase = embed(data, self.include_bits)
        if return_phase:
            return feats, phase
        return feats

    def pack(self, byte_list: list[bytes], max_len: int):
        """list[bytes] → feats[B,L,out_dim], phase[B,L], mask[B,L] (pad/truncate to max_len).

        Rows shorter than ``max_len`` are zero-padded; the mask holds 1.0 at
        every position filled from an embedding and 0.0 at padding.
        """
        batch = len(byte_list)
        feats = torch.zeros(batch, max_len, self.out_dim)
        phase = torch.zeros(batch, max_len)
        mask = torch.zeros(batch, max_len)
        for row, payload in enumerate(byte_list):
            row_feats, row_phase = embed(payload, self.include_bits)
            keep = min(row_feats.shape[0], max_len)
            feats[row, :keep] = row_feats[:keep]
            phase[row, :keep] = row_phase[:keep]
            mask[row, :keep] = 1.0
        return feats, phase, mask
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "hsl-embedding"
7
+ version = "0.1.0"
8
+ description = "HSL (Holistic Signal Language): a non-learned, byte-level signal encoder for PyTorch — change-rate features, no tokenizer, losslessly invertible."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Jinhyun Woo", email = "ggunio5782@gmail.com" }]
13
+ keywords = ["embedding", "byte-native", "multimodal", "tokenizer-free", "signal", "change-rate", "pytorch"]
14
+ dependencies = ["numpy>=1.21", "torch>=1.12"]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Science/Research",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
21
+ ]
22
+
23
+ [project.urls]
24
+ Homepage = "https://github.com/Woojiggun/holo-hsl"
25
+ Paper = "https://doi.org/10.5281/zenodo.20581805"
26
+ Demo = "https://holo-demo-p5txmh4dda-as.a.run.app"
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = ["hsl_embedding"]
@@ -0,0 +1,62 @@
1
+ """HSL invariants — run with `pytest`."""
2
+ import torch
3
+ import hsl_embedding as hsl
4
+
5
+
6
# Payloads shared by the tests: empty, single bytes, mixed-script UTF-8,
# every byte value once, and a repetitive ASCII run.
SAMPLES = [
    b"",
    b"\x00",
    b"A",
    "강아지 dog 🐕 0101".encode(),
    bytes(range(256)),
    b"hello world " * 30,
]
7
+
8
+
9
def test_lossless_roundtrip():
    """decode(encode(x)) must return x for every sample.

    The empty payload is included: it encodes to a closure-only signal and
    decodes back to ``b""``, so that edge case is exercised, not skipped.
    """
    for payload in SAMPLES:
        assert hsl.decode(hsl.encode(payload)) == payload
14
+
15
+
16
def test_shapes_and_dims():
    """A 5-byte payload yields one float32 feature row and one phase value per byte."""
    features, angles = hsl.embed(b"hello")
    expected_rows = 5
    assert features.shape == (expected_rows, hsl.FEAT_DIM)
    assert angles.shape == (expected_rows,)
    assert features.dtype == torch.float32
21
+
22
+
23
def test_default_is_lean_21d():
    """The default embedding drops the raw bits, leaving 21 channels."""
    features = hsl.embed(b"hello")[0]
    width = features.shape[1]
    assert width == hsl.FEAT_DIM == 21
26
+
27
+
28
def test_lean_is_full_minus_bits():
    """The lean embedding equals the full one with its first 8 columns removed."""
    for sample in SAMPLES:
        payload = sample or b"\x00"
        with_bits = hsl.embed(payload, include_bits=True)[0]
        without_bits = hsl.embed(payload, include_bits=False)[0]
        assert with_bits.shape[1] == hsl.FEAT_DIM_FULL == 29
        assert without_bits.shape[1] == hsl.FEAT_DIM == 21
        # The dropped 8 columns are the raw bits.
        assert torch.allclose(without_bits, with_bits[:, 8:], atol=1e-6)
35
+
36
+
37
def test_delta_is_lossless_core():
    """Integrating Δ from the origin reproduces the signal, so the bits channel is redundant."""
    frame = hsl.encode(b"redundancy?")
    rebuilt = hsl._integrate(frame.delta, origin=hsl.ORIGIN_BIT)
    matches = rebuilt == frame.signal
    assert matches.all()
42
+
43
+
44
def test_module_and_pack():
    """The nn.Module wrapper reports its width and pads batches to ``max_len``."""
    module = hsl.Embedding()  # default = 21-D
    assert module.out_dim == 21
    single = module(b"test")
    assert single.shape == (4, 21)
    full_module = hsl.Embedding(include_bits=True)
    assert full_module.out_dim == 29
    batch = [b"a", b"abcdef"]
    feats, phase, mask = module.pack(batch, max_len=8)
    assert feats.shape == (2, 8, 21)
    valid_per_row = mask.sum(1).tolist()
    assert valid_per_row == [1.0, 6.0]
52
+
53
+
54
def test_empty_input_is_safe():
    """Empty input is treated as a single zero byte and never crashes."""
    features = hsl.embed(b"")[0]
    row_count = features.shape[0]
    assert row_count >= 1
57
+
58
+
59
def test_feat_names():
    """feat_names reports 29 names with bits, 21 without, change-rate first in the lean set."""
    full_names = hsl.feat_names(True)
    lean_names = hsl.feat_names(False)
    assert len(full_names) == 29
    assert len(lean_names) == 21
    # Change-rate is the first channel of the lean substrate.
    assert lean_names[0] == "dxor0"