hypernix 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hypernix/__init__.py +68 -0
- hypernix/__main__.py +4 -0
- hypernix/arch.py +174 -0
- hypernix/cli.py +762 -0
- hypernix/convert.py +277 -0
- hypernix/doctor.py +113 -0
- hypernix/download.py +254 -0
- hypernix/fetcher.py +259 -0
- hypernix/generate.py +136 -0
- hypernix/nano_nano.py +213 -0
- hypernix/old_oven.py +641 -0
- hypernix/py.typed +0 -0
- hypernix/quantize.py +285 -0
- hypernix/train.py +555 -0
- hypernix/upload.py +61 -0
- hypernix-0.32.0.dist-info/METADATA +353 -0
- hypernix-0.32.0.dist-info/RECORD +21 -0
- hypernix-0.32.0.dist-info/WHEEL +5 -0
- hypernix-0.32.0.dist-info/entry_points.txt +3 -0
- hypernix-0.32.0.dist-info/licenses/LICENSE +17 -0
- hypernix-0.32.0.dist-info/top_level.txt +1 -0
hypernix/__init__.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""HyperNix: convert ray0rf1re/hyper-nix.1 PyTorch weights to GGUF."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from . import old_oven
|
|
5
|
+
from .convert import convert_to_gguf
|
|
6
|
+
from .download import (
|
|
7
|
+
KNOWN_MODELS,
|
|
8
|
+
ModelInfo,
|
|
9
|
+
download_model,
|
|
10
|
+
resolve_model_info,
|
|
11
|
+
resolve_repo_id,
|
|
12
|
+
verify_snapshot,
|
|
13
|
+
)
|
|
14
|
+
from .fetcher import fetch_llama_quantize
|
|
15
|
+
from .generate import generate_text
|
|
16
|
+
from .old_oven import (
|
|
17
|
+
ARCH_PRESETS,
|
|
18
|
+
CodeOven,
|
|
19
|
+
bake_code,
|
|
20
|
+
fill_middle,
|
|
21
|
+
load_pt,
|
|
22
|
+
new_oven,
|
|
23
|
+
preheat,
|
|
24
|
+
)
|
|
25
|
+
from .quantize import QUANT_TYPES, quantize_gguf
|
|
26
|
+
from .train import (
|
|
27
|
+
HyperNixConfig,
|
|
28
|
+
HyperNixModel,
|
|
29
|
+
expand_checkpoint,
|
|
30
|
+
init_from_scratch,
|
|
31
|
+
load_snapshot,
|
|
32
|
+
save_snapshot,
|
|
33
|
+
train,
|
|
34
|
+
)
|
|
35
|
+
from .upload import upload_gguf
|
|
36
|
+
|
|
37
|
+
# Public API of the package, re-exported from the submodules imported above.
# The entries are kept in the original order: upper-case constants and classes
# first, then functions and the ``old_oven`` submodule.
__all__ = [
    # constants and classes
    "ARCH_PRESETS",
    "CodeOven",
    "HyperNixConfig",
    "HyperNixModel",
    "KNOWN_MODELS",
    "ModelInfo",
    "QUANT_TYPES",
    # functions and submodules
    "bake_code",
    "convert_to_gguf",
    "download_model",
    "expand_checkpoint",
    "fetch_llama_quantize",
    "fill_middle",
    "generate_text",
    "init_from_scratch",
    "load_pt",
    "load_snapshot",
    "new_oven",
    "old_oven",
    "preheat",
    "quantize_gguf",
    "resolve_model_info",
    "resolve_repo_id",
    "save_snapshot",
    "train",
    "upload_gguf",
    "verify_snapshot",
]

# Package version string and the repository id used as the package default.
__version__ = "0.32.0"
DEFAULT_REPO_ID = "ray0rf1re/hyper-nix.1"
|
hypernix/__main__.py
ADDED
hypernix/arch.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Architecture-agnostic tensor name mapping for custom HyperNix-style models.
|
|
2
|
+
|
|
3
|
+
The HyperNix family is described upstream as a "custom architecture" causal LM
|
|
4
|
+
without a fixed ``transformers`` class. We therefore avoid hard-coding layer
|
|
5
|
+
counts, hidden sizes, or attention-head counts: every parameter is introspected
|
|
6
|
+
from the state dict and remapped onto llama.cpp's canonical GGUF tensor names
|
|
7
|
+
when a recognizable pattern is found.
|
|
8
|
+
|
|
9
|
+
Tensors that do not match a known pattern are still emitted under their
|
|
10
|
+
original name so downstream tooling can round-trip arbitrarily shaped models.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from collections.abc import Iterable
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
|
|
18
|
+
# Canonical GGUF tensor names (see gguf.constants.MODEL_TENSOR).
# These three are model-wide tensors and carry no block index.
TOK_EMBD = "token_embd.weight"
OUTPUT_NORM = "output_norm.weight"
OUTPUT = "output.weight"

# Per-block templates.
# Each template keeps a literal ``{i}`` placeholder that is filled in later
# with ``str.format(i=<block index>)``.
BLK = "blk.{i}."
ATTN_NORM = f"{BLK}attn_norm.weight"
ATTN_Q = f"{BLK}attn_q.weight"
ATTN_K = f"{BLK}attn_k.weight"
ATTN_V = f"{BLK}attn_v.weight"
ATTN_QKV = f"{BLK}attn_qkv.weight"
ATTN_OUT = f"{BLK}attn_output.weight"
FFN_NORM = f"{BLK}ffn_norm.weight"
FFN_GATE = f"{BLK}ffn_gate.weight"
FFN_UP = f"{BLK}ffn_up.weight"
FFN_DOWN = f"{BLK}ffn_down.weight"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Regex patterns for common naming conventions we've encountered across PyTorch
# reference implementations (HF style, nanoGPT style, llama-style, gpt-neox).
# Every pattern captures the block index in the named group ``i`` and consumes
# the trailing dot. Patterns are tried in list order, so the more specific
# prefixes come first.
_LAYER_PREFIXES = [
    stem + r"\.(?P<i>\d+)\."
    for stem in (
        r"model\.layers",
        r"transformer\.h",
        "layers",
        "blocks",
        "block",
        "h",
    )
]
|
|
47
|
+
|
|
48
|
+
# Tail regex (without layer prefix) -> canonical template.
# The module-name alternations shared by several rules are factored out; the
# resulting pattern strings are identical to writing them out in full.
_ATTN_SCOPE = r"(?:self_attn|attention|attn)"
_MLP_SCOPE = r"(?:mlp|feed_forward|ffn)"

_PER_BLOCK_RULES: list[tuple[str, str]] = [
    # Pre-attention norms.
    (r"input_layernorm\.weight$", ATTN_NORM),
    (r"attention_norm\.weight$", ATTN_NORM),
    (r"ln_1\.weight$", ATTN_NORM),
    (r"norm1\.weight$", ATTN_NORM),
    (r"attn_norm\.weight$", ATTN_NORM),
    # Pre-FFN norms.
    (r"post_attention_layernorm\.weight$", FFN_NORM),
    (r"ffn_norm\.weight$", FFN_NORM),
    (r"ln_2\.weight$", FFN_NORM),
    (r"norm2\.weight$", FFN_NORM),
    # Attention projections stored as separate q/k/v matrices.
    (_ATTN_SCOPE + r"\.q_proj\.weight$", ATTN_Q),
    (_ATTN_SCOPE + r"\.k_proj\.weight$", ATTN_K),
    (_ATTN_SCOPE + r"\.v_proj\.weight$", ATTN_V),
    (_ATTN_SCOPE + r"\.wq\.weight$", ATTN_Q),
    (_ATTN_SCOPE + r"\.wk\.weight$", ATTN_K),
    (_ATTN_SCOPE + r"\.wv\.weight$", ATTN_V),
    # Fused qkv projection.
    (_ATTN_SCOPE + r"\.(?:qkv_proj|Wqkv|qkv|c_attn)\.weight$", ATTN_QKV),
    # Attention output projection.
    (_ATTN_SCOPE + r"\.(?:o_proj|out_proj|wo|c_proj|dense)\.weight$", ATTN_OUT),
    # MLP / feed-forward projections.
    (_MLP_SCOPE + r"\.(?:gate_proj|w1)\.weight$", FFN_GATE),
    (_MLP_SCOPE + r"\.(?:up_proj|w3|c_fc|fc_in|fc1)\.weight$", FFN_UP),
    (_MLP_SCOPE + r"\.(?:down_proj|w2|c_proj|fc_out|fc2)\.weight$", FFN_DOWN),
]
|
|
76
|
+
|
|
77
|
+
# Top-level (non-per-block) rules.
# Each entry is (regex matched against the full parameter name, canonical GGUF
# name). The embedding and final-norm rules accept an optional ``model.`` or
# ``transformer.`` container prefix: ``_LAYER_PREFIXES`` already recognises
# ``transformer.h.<i>.`` blocks, so the sibling ``transformer.wte`` /
# ``transformer.ln_f`` tensors of the same checkpoint must map as well instead
# of falling through unmapped.
_TOP_LEVEL_RULES: list[tuple[str, str]] = [
    (
        r"^(?:model\.|transformer\.)?"
        r"(?:tok_embeddings|embed_tokens|wte|embeddings?\.word_embeddings)\.weight$",
        TOK_EMBD,
    ),
    (
        r"^(?:model\.|transformer\.)?"
        r"(?:norm|ln_f|final_layernorm|output_norm)\.weight$",
        OUTPUT_NORM,
    ),
    (r"^(?:lm_head|output|embed_out)\.weight$", OUTPUT),
]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
class ArchInfo:
    """Model dimensions derived from a state dict.

    Every field has a default, so a bare ``ArchInfo()`` is a valid
    "nothing known yet" value that callers fill in afterwards.
    """

    # Numeric dimensions; each defaults to 0.
    n_layers: int = 0
    n_embd: int = 0
    n_head: int = 0
    n_head_kv: int = 0
    n_ff: int = 0
    vocab_size: int = 0
    # A fresh list is created per instance via default_factory, so instances
    # never share the same list object.
    layer_indices: list[int] = field(default_factory=list)
    tied_embeddings: bool = False
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _match_layer_index(name: str) -> tuple[int, str] | None:
    """Split a per-block parameter name into ``(block index, tail)``.

    The patterns in ``_LAYER_PREFIXES`` are tried in order, each anchored at
    the start of ``name``. The first one that matches wins. The tail is
    whatever follows the matched prefix. Returns ``None`` when no prefix
    matches.
    """
    attempts = (re.match(prefix, name) for prefix in _LAYER_PREFIXES)
    found = next((m for m in attempts if m), None)
    if found is None:
        return None
    return int(found.group("i")), name[found.end() :]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def map_tensor_name(name: str) -> str | None:
    """Translate a PyTorch parameter name into its canonical GGUF name.

    Model-wide tensors are checked first against ``_TOP_LEVEL_RULES``.
    Otherwise the name must start with a recognised layer prefix, and the
    remainder must fully match one of ``_PER_BLOCK_RULES``; the matching
    template is then formatted with the block index.

    Returns ``None`` when nothing applies, in which case the caller may still
    emit the tensor under its original name.
    """
    top_level = next(
        (canonical for rx, canonical in _TOP_LEVEL_RULES if re.match(rx, name)),
        None,
    )
    if top_level is not None:
        return top_level

    located = _match_layer_index(name)
    if located is None:
        return None

    block_index, remainder = located
    for rx, template in _PER_BLOCK_RULES:
        if re.fullmatch(rx, remainder) is not None:
            return template.format(i=block_index)
    return None
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def infer_arch(
    state_dict: dict[str, object],
    hint_n_head: int | None = None,
) -> ArchInfo:
    """Inspect tensor shapes to infer basic architectural dimensions.

    Works for any layer count / hidden size. Entries without a ``shape``
    attribute are ignored.

    Args:
        state_dict: Mapping of parameter name to a tensor-like object.
        hint_n_head: Attention head count to use verbatim when positive.
            Otherwise the head count is guessed as ``n_embd // 64``, clamped
            to >= 1, or left at 0 when the hidden size is unknown.

    Returns:
        An ``ArchInfo`` with ``n_layers``, ``layer_indices``, ``n_embd``,
        ``vocab_size``, ``n_ff``, ``n_head`` and ``n_head_kv`` filled in.
        ``n_head_kv`` always mirrors ``n_head``, and ``tied_embeddings`` is
        left at its default because this function does not detect it.
    """
    # Substrings that mark a 2-D "embed...weight" tensor as a position table
    # rather than the token embedding. Such a table has shape
    # (max_positions, hidden), so reading it as (vocab, hidden) would report
    # the context length as the vocabulary size.
    positional_markers = ("position", "pos_emb", "embed_pos")
    # Substrings identifying the FFN input-side projections, whose first
    # dimension is taken as the feed-forward width.
    ffn_markers = ("gate_proj", "up_proj", ".w1.", ".w3.", "fc1", "fc_in")

    info = ArchInfo()
    layer_ids: set[int] = set()
    ffn_dim: int | None = None
    hidden: int | None = None
    vocab: int | None = None

    for name, tensor in state_dict.items():
        if not hasattr(tensor, "shape"):
            continue
        shape = tuple(tensor.shape)

        hit = _match_layer_index(name)
        if hit is not None:
            layer_ids.add(hit[0])

        # Every dimension probe below only applies to 2-D weights.
        if len(shape) != 2:
            continue

        lower = name.lower()
        if "embed" in lower and "weight" in lower:
            if not any(marker in lower for marker in positional_markers):
                vocab, hidden = shape[0], shape[1]
        elif lower.endswith("lm_head.weight"):
            # The output head only fills in what the embedding did not provide.
            vocab = vocab or shape[0]
            hidden = hidden or shape[1]

        if any(marker in lower for marker in ffn_markers):
            ffn_dim = max(ffn_dim or 0, shape[0])

    info.layer_indices = sorted(layer_ids)
    info.n_layers = (max(layer_ids) + 1) if layer_ids else 0
    info.n_embd = int(hidden or 0)
    info.vocab_size = int(vocab or 0)
    # Fall back to the conventional 4x expansion when no FFN weight was seen.
    info.n_ff = int(ffn_dim or (4 * info.n_embd if info.n_embd else 0))

    if hint_n_head is not None and hint_n_head > 0:
        info.n_head = hint_n_head
    elif info.n_embd:
        info.n_head = max(1, info.n_embd // 64)
    info.n_head_kv = info.n_head
    return info
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def iter_state_dict_names(state_dict_keys: Iterable[str]) -> list[str]:
    """Return the given keys as a new list in ascending sorted order.

    Sorting makes the output order deterministic regardless of the order in
    which the keys were supplied.
    """
    ordered = list(state_dict_keys)
    ordered.sort()
    return ordered
|