PyPI - hsl-embedding - Versions diffs - 0.1.0__py3-none-any.whl - Mend

hsl-embedding 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

hsl_embedding/__init__.py +190 -0
hsl_embedding-0.1.0.dist-info/METADATA +162 -0
hsl_embedding-0.1.0.dist-info/RECORD +5 -0
hsl_embedding-0.1.0.dist-info/WHEEL +4 -0
hsl_embedding-0.1.0.dist-info/licenses/LICENSE +21 -0

hsl_embedding/__init__.py ADDED Viewed

@@ -0,0 +1,190 @@
+"""HSL — Holistic Signal Language: a non-learned, byte-level signal embedding (codec + encoder in one).
+Everything is information — a fluctuation between 0 and 1. HSL turns raw bytes into a compact,
+*change-rate-based* feature signal that any modality (text, image, audio, video, sensor) shares,
+with no tokenizer and no learned parameters. The representation is grounded in a lossless codec,
+so `decode(encode(x)) == x` — the substrate is byte-exact by construction.
+The 21-D per-byte feature (FEAT_DIM) — the pure change-rate substrate:
+    dxor0..7   (8)  Δ  change-rate  — XOR-delta from origin 0  (losslessly encodes the bytes)
+    d2xor0..7  (8)  Δ² change-rate-of-change-rate — 2nd XOR-delta
+    boundary   (1)  byte-boundary evidence (mean of |Δ| + 0.5|Δ²| over a ±4-bit window at each byte start)
+    fft_low/high (2) per-byte spectral amplitude bands
+    phase_cos/sin (2) exact complex phasor  z = e^{iθ}, θ = 2π·byte/256
+The raw 8 bits are NOT included by default: Δ-from-origin-0 already encodes the bytes losslessly,
+so the bits are redundant. Pass include_bits=True for the legacy 29-D (raw bits prepended).
+    import hsl_embedding as hsl
+    feats, phase = hsl.embed(b"hello")     # [L, 21], [L]
+    emb = hsl.Embedding();  feats = emb(b"hello")
+    assert hsl.decode(hsl.encode(b"hello")) == b"hello"
+Author: Jinhyun Woo (ggunio5782@gmail.com). MIT-licensed; no learned weights included.
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+from typing import Iterable
+import numpy as np
+import torch
+import torch.nn as nn
+# Names exported by `from hsl_embedding import *`.
+__all__ = ["FEAT_DIM", "FEAT_DIM_FULL", "FEAT_NAMES", "FEAT_NAMES_FULL", "feat_names",
+           "ORIGIN_BIT", "CLOSURE_BIT", "HSLFrame", "encode", "decode", "embed", "Embedding"]
+ORIGIN_BIT = 0          # the "0": value assumed before the first bit when taking the XOR-delta
+CLOSURE_BIT = 1         # the "1": bit appended after the payload by encode() and checked by decode()
+FEAT_DIM = 21           # default: the pure change-rate substrate (Δ, Δ², boundary, Fourier, phase)
+FEAT_DIM_FULL = 29      # include_bits=True: also prepend the 8 raw bits (redundant with Δ; legacy compat)
+# Channel names, in the column order produced by embed().
+_BITS = [f"bit{i}" for i in range(8)]
+_REST = ([f"dxor{i}" for i in range(8)]      # Δ
+         + [f"d2xor{i}" for i in range(8)]   # Δ²
+         + ["boundary", "fft_low_ratio", "fft_high_ratio", "phase_cos", "phase_sin"])
+FEAT_NAMES = list(_REST)                      # 21 (default — change-rate substrate)
+FEAT_NAMES_FULL = _BITS + _REST               # 29 (with raw bits)
+def feat_names(include_bits: bool = False) -> list[str]:
+    """Return the feature-channel names as a new list.
+    With include_bits=True the 8 raw-bit names come first, followed by the 21 default names;
+    otherwise only the 21 default names are returned.
+    """
+    return (_BITS + _REST) if include_bits else list(_REST)
+# ───────────────────────────── codec (numpy, lossless) ─────────────────────────────
+@dataclass(frozen=True)
+class HSLFrame:
+    """Immutable record built by encode() and consumed by decode().
+    The field comments describe the values encode() stores; the dataclass itself does not validate them.
+    NOTE(review): the dataclass-generated __eq__/__hash__ operate on the ndarray fields, so comparing
+    or hashing two frames may raise — confirm whether eq=False was intended.
+    """
+    payload_len_bytes: int     # number of payload bytes that were encoded
+    bits: np.ndarray           # payload bits, 8 per byte
+    signal: np.ndarray         # bits + closure (the 0 → … → 1 journey)
+    delta: np.ndarray          # Δ  (XOR-delta from origin 0)
+    delta2: np.ndarray         # Δ²
+    byte_boundary_score: np.ndarray  # one score per payload byte
+def _bytes_to_bits(data: bytes) -> np.ndarray:
+    """Unpack bytes into a flat uint8 array of 0/1 values, 8 per byte, using bitorder="big".
+    Empty input returns an empty uint8 array.
+    """
+    if not data:
+        return np.zeros((0,), dtype=np.uint8)
+    return np.unpackbits(np.frombuffer(data, dtype=np.uint8), bitorder="big").astype(np.uint8)
+def _bits_to_bytes(bits: np.ndarray, n: int) -> bytes:
+    """Pack the first n*8 entries of `bits` back into n bytes (bitorder="big").
+    Raises:
+        ValueError: if fewer than n*8 bits are available.
+    """
+    need = n * 8
+    b = np.asarray(bits[:need], dtype=np.uint8)
+    if b.size != need:
+        raise ValueError(f"not enough bits: have {b.size}, need {need}")
+    # packbits is skipped for the empty case, which returns b"" directly
+    return np.packbits(b, bitorder="big")[:n].tobytes() if b.size else b""
+def _xor_delta(bits: np.ndarray, origin: int = ORIGIN_BIT) -> np.ndarray:
+    """Return the XOR of each element with its predecessor, as uint8.
+    The predecessor of the first element is `origin`. Empty input returns an empty array.
+    """
+    bits = np.asarray(bits, dtype=np.uint8)
+    prev = np.empty_like(bits)
+    if bits.size:
+        prev[0] = origin          # seed: the value "before" the first bit
+        prev[1:] = bits[:-1]      # every other position sees the preceding bit
+    return np.bitwise_xor(bits, prev).astype(np.uint8)
+def _integrate(delta: Iterable[int], origin: int = ORIGIN_BIT) -> np.ndarray:
+    out, prev = [], np.uint8(origin)
+    for v in delta:
+        prev = np.uint8(v) ^ prev
+        out.append(prev)
+    return np.asarray(out, dtype=np.uint8)
+def _hf_energy(values: np.ndarray, radius: int = 4) -> np.ndarray:
+    """Return |values - moving average of values| as float32.
+    The moving average is a uniform kernel of 2*radius+1 samples applied with
+    np.convolve(mode="same"). Empty input is returned unchanged (as float32).
+    NOTE(review): no caller of this helper is visible in this file.
+    """
+    values = np.asarray(values, dtype=np.float32)
+    if values.size == 0:
+        return values
+    k = np.ones((radius * 2 + 1,), dtype=np.float32) / float(radius * 2 + 1)
+    return np.abs(values - np.convolve(values, k, mode="same")).astype(np.float32)
+def _byte_boundary(signal: np.ndarray) -> np.ndarray:
+    """vectorized per-byte boundary score from transition energy (no UTF-8 decoding)."""
+    if signal.size <= 1:
+        return np.zeros((0,), dtype=np.float32)
+    nbytes = (signal.size - 1) // 8
+    if nbytes == 0:
+        return np.zeros((0,), dtype=np.float32)
+    energy = _xor_delta(signal).astype(np.float32) + 0.5 * _xor_delta(_xor_delta(signal)).astype(np.float32)
+    starts = np.arange(nbytes) * 8
+    lo = np.maximum(0, starts - 4)
+    hi = np.minimum(energy.size, starts + 5)
+    csum = np.concatenate([[0.0], np.cumsum(energy)])
+    return ((csum[hi] - csum[lo]) / np.maximum(hi - lo, 1)).astype(np.float32)
+def encode(data: bytes) -> HSLFrame:
+    """bytes → HSLFrame (lossless; carries the bitstream, Δ, Δ²).
+    The signal is the payload bits followed by CLOSURE_BIT; Δ is its XOR-delta, Δ² the XOR-delta
+    of Δ, and the boundary score is computed from the signal.
+    """
+    bits = _bytes_to_bits(data)
+    signal = np.concatenate([bits, np.asarray([CLOSURE_BIT], dtype=np.uint8)])
+    delta = _xor_delta(signal)
+    return HSLFrame(len(data), bits, signal, delta, _xor_delta(delta), _byte_boundary(signal))
+def decode(frame: HSLFrame) -> bytes:
+    """HSLFrame → bytes (integrate Δ from origin 0, check closure, drop it).
+    Only frame.delta and frame.payload_len_bytes are read.
+    Raises:
+        ValueError: if the integrated signal is shorter than payload_len_bytes*8 + 1 bits, or the
+            bit following the payload is not CLOSURE_BIT.
+    """
+    signal = _integrate(frame.delta, origin=ORIGIN_BIT)
+    need = frame.payload_len_bytes * 8 + 1      # payload bits plus the closure bit
+    if signal.size < need or int(signal[need - 1]) != CLOSURE_BIT:
+        raise ValueError("closure / length check failed")
+    return _bits_to_bytes(signal[: need - 1], frame.payload_len_bytes)
+# ───────────────────────────── embedding (torch) ─────────────────────────────
+def embed(data: bytes, include_bits: bool = False):
+    """bytes → (feats [L, 21|29], phase [L]). Deterministic, non-learned, pure change-rate.
+    include_bits=False (default, 21-D): the change-rate substrate — Δ, Δ², boundary, Fourier, phase.
+    The raw byte bits are dropped because Δ-from-origin-0 already encodes them losslessly.
+    include_bits=True (29-D): also prepend the 8 raw bits — for the original trained HoLo model.
+    Empty input is replaced by a single zero byte, so the result always has at least one row.
+    Returns:
+        (feats, phase): feats is a float32 tensor with one row per byte; phase is the per-byte
+        angle 2π·byte/256 in radians.
+    """
+    if len(data) == 0:
+        data = b"\x00"            # avoid zero-row tensors: embed one zero byte instead
+    fr = encode(data)
+    bc = fr.payload_len_bytes
+    # Reshape the flat bit-level arrays to one row of 8 values per byte (the closure bit is cut off).
+    bits = torch.from_numpy(fr.bits[: bc * 8].reshape(bc, 8).astype(np.float32))
+    dxor = torch.from_numpy(fr.delta[: bc * 8].reshape(bc, 8).astype(np.float32))     # Δ
+    d2xor = torch.from_numpy(fr.delta2[: bc * 8].reshape(bc, 8).astype(np.float32))   # Δ²
+    boundary = torch.from_numpy(fr.byte_boundary_score.reshape(bc, 1).astype(np.float32))
+    spec = torch.fft.rfft(bits, dim=1).abs()                       # per-byte 8-bit spectrum
+    # Split the spectrum into the first three bins (low) and the remaining bins (high).
+    low, high = spec[:, :3].sum(1, keepdim=True), spec[:, 3:].sum(1, keepdim=True)
+    # Normalize to band ratios; the 1e-6 guards against division by zero for an all-zero byte.
+    fourier = torch.cat([low, high], dim=1) / (low + high + 1e-6)
+    a = torch.from_numpy(np.frombuffer(data, dtype=np.uint8).astype(np.float32))
+    angle = a / 256.0 * (2.0 * math.pi)                            # exact phase θ
+    phasor = torch.stack([torch.cos(angle), torch.sin(angle)], dim=1)
+    parts = [bits, dxor, d2xor, boundary, fourier, phasor] if include_bits else [dxor, d2xor, boundary, fourier, phasor]
+    return torch.cat(parts, dim=1), angle      # [bc, 29] or [bc, 21]
+class Embedding(nn.Module):
+    """HSL byte → signal embedding as an nn.Module (no parameters), usable like nn.Embedding.
+        self.hsl = hsl.Embedding()
+        feats = self.hsl(b"...")            # [L, 21]  (include_bits=True -> [L, 29])
+        feats, phase = self.hsl(b"...", return_phase=True)
+    """
+    def __init__(self, include_bits: bool = False):
+        """Store the include_bits choice and the matching feature width (out_dim: 29 or 21)."""
+        super().__init__()
+        self.include_bits = include_bits
+        self.out_dim = FEAT_DIM_FULL if include_bits else FEAT_DIM
+    def forward(self, data: bytes, return_phase: bool = False):
+        """Embed one byte string; return feats, or (feats, phase) when return_phase is True."""
+        feats, phase = embed(data, self.include_bits)
+        return (feats, phase) if return_phase else feats
+    def pack(self, byte_list: list[bytes], max_len: int):
+        """list[bytes] → feats[B,L,out_dim], phase[B,L], mask[B,L] (pad/truncate to max_len).
+        Positions beyond an item's length stay zero in all three tensors; mask is 1.0 on filled
+        positions. An empty item still fills one position, because embed() substitutes a zero byte.
+        """
+        B = len(byte_list)
+        feats = torch.zeros(B, max_len, self.out_dim)
+        phase = torch.zeros(B, max_len)
+        mask = torch.zeros(B, max_len)
+        for i, data in enumerate(byte_list):
+            f, p = embed(data, self.include_bits)
+            n = min(f.shape[0], max_len)        # truncate items longer than max_len
+            feats[i, :n], phase[i, :n], mask[i, :n] = f[:n], p[:n], 1.0
+        return feats, phase, mask

hsl_embedding-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,162 @@
+Metadata-Version: 2.4
+Name: hsl-embedding
+Version: 0.1.0
+Summary: HSL (Holistic Signal Language): a non-learned, byte-level signal encoder for PyTorch — change-rate features, no tokenizer, losslessly invertible.
+Project-URL: Homepage, https://github.com/Woojiggun/holo-hsl
+Project-URL: Paper, https://doi.org/10.5281/zenodo.20581805
+Project-URL: Demo, https://holo-demo-p5txmh4dda-as.a.run.app
+Author-email: Jinhyun Woo <ggunio5782@gmail.com>
+License: MIT
+License-File: LICENSE
+Keywords: byte-native,change-rate,embedding,multimodal,pytorch,signal,tokenizer-free
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9
+Requires-Dist: numpy>=1.21
+Requires-Dist: torch>=1.12
+Description-Content-Type: text/markdown
+# HSL — Holistic Signal Language
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.20581805.svg)](https://doi.org/10.5281/zenodo.20581805)
+[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
+**A non-learned, byte-level signal encoder for PyTorch.** Instead of splitting text into tokens, it reads
+raw bytes *holistically as signal*: change-rate (Δ, XOR-delta), 2nd-order change (Δ²), boundary,
+Fourier bands, and exact complex phase — 21 dimensions per byte by default (29 with the optional raw
+bits), losslessly invertible. One modality-agnostic input layer for text, image, audio, video — any byte stream.
+> Everything is information — a fluctuation between 0 and 1. HSL doesn't ask *what a token means*; it
+> measures *how the signal changes*, with exact formulas, so the same representation works under every modality.
+```python
+import hsl_embedding as hsl
+feats, phase = hsl.embed(b"hello")          # -> Tensor [L, 21], Tensor [L]
+emb = hsl.Embedding()                        # an nn.Module, no parameters (like nn.Embedding)
+feats = emb("강아지".encode())               # -> [L, 21]
+assert hsl.decode(hsl.encode(b"hello")) == b"hello"   # lossless, by construction
+```
+## Install
+```bash
+pip install hsl-embedding      # distribution name; import as `import hsl_embedding as hsl`
+# deps: numpy, torch
+```
+## Why not just `nn.Embedding`?
+They solve **different problems** — this is *not* a performance claim, it's a "when to use which".
+| | `torch.nn.Embedding` | `hsl.Embedding` |
+|---|---|---|
+| what it is | a **learned lookup table** (trainable params) | an **exact formula** (zero params, deterministic) |
+| input | a token id (`int`) | raw `bytes` |
+| needs | a tokenizer + fixed vocab + training data | nothing — works on any bytes, day one |
+| dimensions | opaque, learned | **named & interpretable** (Δ / Δ² / boundary / Fourier / phase) |
+| modality | one tokenizer per modality (text ≠ image ≠ audio) | **one substrate for all** (byte-native) |
+| invertible | no | **yes** (`decode(encode(x)) == x`) |
+| new scripts / formats | breaks / out-of-vocab | just bytes — never breaks |
+**They compose.** HSL is an *input substrate*, not a replacement for learned representations: `nn.Embedding`
+learns *what tokens mean*; HSL gives *exact structural signal* for free. Stack learned layers **on top** of
+HSL features.
+**Reach for HSL when** you want: tokenizer-free input · one model across modalities · structure/change-aware
+features · exact reconstruction · small-data or from-scratch training · interpretable input channels.
+## What each channel captures (and where it's good)
+HSL is built from **exact formulas**, each chosen to carry information a plain learned embedding tends to
+throw away. The default is **21-D** — the pure change-rate substrate, one row per channel:
+| channel (dims) | exact formula | captures | especially good for |
+|---|---|---|---|
+| **Δ** `dxor` 0–7 (8) | `XOR(bitₜ, bitₜ₋₁)` from origin 0 | **change / transitions** — *where the signal flips* | edges, topic/region shifts, the modality-shared "rate of change". *Measured: shift-detection AUC **0.725** vs content **0.698**.* |
+| **Δ²** `d2xor` 0–7 (8) | `XOR(Δₜ, Δₜ₋₁)` | **acceleration of change** (2nd order) — *편미분 경계* | sharp **boundaries / corners / onsets**; where the rate-of-change itself jumps (segment cuts, audio attacks, image corners) |
+| **boundary** (1) | mean of `\|Δ\| + 0.5\|Δ²\|` over a ±4-bit window at each byte start | **transition-energy peaks** | **tokenizer-free segmentation** — natural byte/word/chunk cuts without decoding |
+| **Fourier** low/high (2) | per-byte 8-bit rFFT amplitude bands | **frequency / texture / periodicity** | smooth vs busy, periodic vs random — audio timbre, image texture, repetitive vs novel content |
+| **phase** cos/sin (2) | exact phasor `z = e^{iθ}, θ = 2π·byte/256` | **cyclic relation / angle** — exact `cos(θᵢ−θⱼ)` | **affect / mood** and relative/positional structure. *Measured: phase-variation tracks the audio affect-line **0.912**, better than loudness alone.* |
+The point: a single learned vector blurs all of this together. HSL keeps **change (Δ), curvature (Δ²),
+spectrum (Fourier), and phase** as separate, exact, interpretable channels — and adds them only where a
+modality needs them.
+*Legacy 29-D:* `include_bits=True` prepends the 8 raw byte bits. They're **redundant** (Δ-from-origin-0
+already encodes the bytes losslessly), included only to match the original trained HoLo model.
+## Lossless by construction
+The features are grounded in a lossless codec, so the substrate is byte-exact:
+```python
+frame = hsl.encode(b"any bytes \x00\xff")
+hsl.decode(frame) == b"any bytes \x00\xff"     # True
+```
+Δ-from-origin-0 *is* the codec's XOR-delta, so it already encodes the bytes losslessly — which is why the
+raw `bits` channel is redundant and can be dropped.
+## 21-D (default) vs 29-D (legacy)
+```python
+hsl.embed(data)                      # 21-D  (default; pure change-rate, no redundant bits)
+hsl.embed(data, include_bits=True)   # 29-D  (also prepend the 8 raw bits — original HoLo model)
+hsl.Embedding(include_bits=True).out_dim   # 29
+```
+## Batch
+```python
+emb = hsl.Embedding()
+feats, phase, mask = emb.pack([b"a", b"abcdef"], max_len=8)   # [B, L, D], [B, L], [B, L]
+```
+## Examples
+```bash
+python examples/quickstart.py        # bytes in, features out; named channels
+python examples/roundtrip_all.py     # text / image / audio / video -> embed -> EXACT reconstruction
+python examples/vs_nn_embedding.py   # nn.Embedding vs hsl.Embedding — when to use which
+python examples/benchmark_vs_nn.py   # honest capability + speed comparison
+```
+`roundtrip_all.py` — one modality-agnostic encoder, lossless by construction:
+```
+modality              bytes     feat shape   reconstruction
+----------------------------------------------------------------
+text  (utf-8)            98       (98, 21)   EXACT ✓
+image (RGB u8)         3072     (3072, 21)   EXACT ✓
+audio (PCM i16)        8000     (8000, 21)   EXACT ✓
+video (6 frames)       4608     (4608, 21)   EXACT ✓
+```
+## Scope (honest)
+HSL is a **non-learned input substrate** — a possibility-proof from an independent, single-GPU project, not a
+benchmark-beating system. It gives exact structural signal; the *meaning* still comes from a model you stack on
+top. See the paper and live demo:
+- 📄 Paper: [A Feasibility Study of Change-Rate-Based Multimodal Unification](https://doi.org/10.5281/zenodo.20581805) (Zenodo)
+- 🌐 Live demo: https://holo-demo-p5txmh4dda-as.a.run.app
+- 💻 HoLo project: https://github.com/Woojiggun/holo-hsl
+## License & citation
+**MIT License — © 2026 Jinhyun Woo (ggunio5782@gmail.com).**
+Free to use, modify, and **distribute, including for commercial use** — the only condition is that the
+copyright notice and attribution to **Jinhyun Woo** are kept. See [LICENSE](LICENSE).
+```bibtex
+@software{woo_hsl_2026,
+  author = {Jinhyun Woo},
+  title  = {HSL: a byte-native, modality-agnostic signal embedding},
+  year   = {2026},
+  doi    = {10.5281/zenodo.20581805},
+  url    = {https://github.com/Woojiggun/holo-hsl}
+}
+```

hsl_embedding-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,5 @@
+hsl_embedding/__init__.py,sha256=iZ6lUOGuAKHpJyFrbb8hIyzvjkPwcVrTYCiSRCyY2_w,9096
+hsl_embedding-0.1.0.dist-info/METADATA,sha256=6VbfCkXpZpZnsAutXsOEOLYBI3CjBOfgYQe_w9dLo38,8133
+hsl_embedding-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+hsl_embedding-0.1.0.dist-info/licenses/LICENSE,sha256=9lzK2PkfouwlysrhAYqYUVNv0BOPu0EzNyw_tA1R8mY,1068
+hsl_embedding-0.1.0.dist-info/RECORD,,

hsl_embedding-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

hsl_embedding-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Jinhyun Woo
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.