hypernix 0.31.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hypernix/__init__.py ADDED
@@ -0,0 +1,68 @@
1
+ """HyperNix: convert ray0rf1re/hyper-nix.1 PyTorch weights to GGUF."""
2
+ from __future__ import annotations
3
+
4
+ from . import old_oven
5
+ from .convert import convert_to_gguf
6
+ from .download import (
7
+ KNOWN_MODELS,
8
+ ModelInfo,
9
+ download_model,
10
+ resolve_model_info,
11
+ resolve_repo_id,
12
+ verify_snapshot,
13
+ )
14
+ from .fetcher import fetch_llama_quantize
15
+ from .generate import generate_text
16
+ from .old_oven import (
17
+ ARCH_PRESETS,
18
+ CodeOven,
19
+ bake_code,
20
+ fill_middle,
21
+ load_pt,
22
+ new_oven,
23
+ preheat,
24
+ )
25
+ from .quantize import QUANT_TYPES, quantize_gguf
26
+ from .train import (
27
+ HyperNixConfig,
28
+ HyperNixModel,
29
+ expand_checkpoint,
30
+ init_from_scratch,
31
+ load_snapshot,
32
+ save_snapshot,
33
+ train,
34
+ )
35
+ from .upload import upload_gguf
36
+
37
# Release identifier of this package build.
__version__ = "0.31.1"

# Repository identifier that the package docstring names as the conversion
# source; presumably the default download target -- confirm against callers.
DEFAULT_REPO_ID = "ray0rf1re/hyper-nix.1"

# Names re-exported by ``from hypernix import *``.  Every entry is bound by
# one of the imports at the top of this module.
__all__ = [
    "ARCH_PRESETS",
    "CodeOven",
    "HyperNixConfig",
    "HyperNixModel",
    "KNOWN_MODELS",
    "ModelInfo",
    "QUANT_TYPES",
    "bake_code",
    "convert_to_gguf",
    "download_model",
    "expand_checkpoint",
    "fetch_llama_quantize",
    "fill_middle",
    "generate_text",
    "init_from_scratch",
    "load_pt",
    "load_snapshot",
    "new_oven",
    "old_oven",
    "preheat",
    "quantize_gguf",
    "resolve_model_info",
    "resolve_repo_id",
    "save_snapshot",
    "train",
    "upload_gguf",
    "verify_snapshot",
]
hypernix/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .cli import main
2
+
3
# Entry point for ``python -m hypernix``: run the CLI and use whatever
# ``main`` returns as the argument to ``SystemExit``.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
hypernix/arch.py ADDED
@@ -0,0 +1,174 @@
1
+ """Architecture-agnostic tensor name mapping for custom HyperNix-style models.
2
+
3
+ The HyperNix family is described upstream as a "custom architecture" causal LM
4
+ without a fixed ``transformers`` class. We therefore avoid hard-coding layer
5
+ counts, hidden sizes, or attention-head counts: every parameter is introspected
6
+ from the state dict and remapped onto llama.cpp's canonical GGUF tensor names
7
+ when a recognizable pattern is found.
8
+
9
+ Tensors that do not match a known pattern are still emitted under their
10
+ original name so downstream tooling can round-trip arbitrarily shaped models.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from collections.abc import Iterable
16
+ from dataclasses import dataclass, field
17
+
18
+ # Canonical GGUF tensor names (see gguf.constants.MODEL_TENSOR).
19
TOK_EMBD = "token_embd.weight"
OUTPUT_NORM = "output_norm.weight"
OUTPUT = "output.weight"

# Per-block templates.  The literal ``{i}`` placeholder is kept in every
# template and is filled in later with the block index via ``str.format``.
BLK = "blk.{i}."

# Attention sub-block.
ATTN_NORM = f"{BLK}attn_norm.weight"
ATTN_Q = f"{BLK}attn_q.weight"
ATTN_K = f"{BLK}attn_k.weight"
ATTN_V = f"{BLK}attn_v.weight"
ATTN_QKV = f"{BLK}attn_qkv.weight"
ATTN_OUT = f"{BLK}attn_output.weight"

# Feed-forward sub-block.
FFN_NORM = f"{BLK}ffn_norm.weight"
FFN_GATE = f"{BLK}ffn_gate.weight"
FFN_UP = f"{BLK}ffn_up.weight"
FFN_DOWN = f"{BLK}ffn_down.weight"
35
+
36
+
37
+ # Regex patterns for common naming conventions we've encountered across PyTorch
38
+ # reference implementations (HF style, nanoGPT style, llama-style, gpt-neox).
39
# Each pattern is anchored at the start of a parameter name and captures the
# block index in the named group ``i``.  The list is scanned in order and the
# first pattern that matches is used, so the order below is significant.
_LAYER_PREFIXES: list[str] = [
    r"model\.layers\.(?P<i>\d+)\.",
    r"transformer\.h\.(?P<i>\d+)\.",
    r"layers\.(?P<i>\d+)\.",
    r"blocks\.(?P<i>\d+)\.",
    r"block\.(?P<i>\d+)\.",
    r"h\.(?P<i>\d+)\.",
]
47
+
48
+ # Tail regex (without layer prefix) -> canonical template.
49
# Each entry pairs a regex for the part of a parameter name that follows the
# layer prefix with the GGUF template it maps to.  Entries are tried in order
# with ``re.fullmatch`` and the first hit wins.
_PER_BLOCK_RULES: list[tuple[str, str]] = [
    # --- normalisation weights: pre-attention ---
    (r"input_layernorm\.weight$", ATTN_NORM),
    (r"attention_norm\.weight$", ATTN_NORM),
    (r"ln_1\.weight$", ATTN_NORM),
    (r"norm1\.weight$", ATTN_NORM),
    (r"attn_norm\.weight$", ATTN_NORM),
    # --- normalisation weights: pre-feed-forward ---
    (r"post_attention_layernorm\.weight$", FFN_NORM),
    (r"ffn_norm\.weight$", FFN_NORM),
    (r"ln_2\.weight$", FFN_NORM),
    (r"norm2\.weight$", FFN_NORM),
    # --- attention projections stored as separate q / k / v matrices ---
    (r"(?:self_attn|attention|attn)\.q_proj\.weight$", ATTN_Q),
    (r"(?:self_attn|attention|attn)\.k_proj\.weight$", ATTN_K),
    (r"(?:self_attn|attention|attn)\.v_proj\.weight$", ATTN_V),
    (r"(?:self_attn|attention|attn)\.wq\.weight$", ATTN_Q),
    (r"(?:self_attn|attention|attn)\.wk\.weight$", ATTN_K),
    (r"(?:self_attn|attention|attn)\.wv\.weight$", ATTN_V),
    # --- attention projection stored as one fused qkv matrix ---
    (
        r"(?:self_attn|attention|attn)\.(?:qkv_proj|Wqkv|qkv|c_attn)\.weight$",
        ATTN_QKV,
    ),
    # --- attention output projection ---
    (
        r"(?:self_attn|attention|attn)\.(?:o_proj|out_proj|wo|c_proj|dense)\.weight$",
        ATTN_OUT,
    ),
    # --- feed-forward (MLP) matrices ---
    (r"(?:mlp|feed_forward|ffn)\.(?:gate_proj|w1)\.weight$", FFN_GATE),
    (
        r"(?:mlp|feed_forward|ffn)\.(?:up_proj|w3|c_fc|fc_in|fc1)\.weight$",
        FFN_UP,
    ),
    (
        r"(?:mlp|feed_forward|ffn)\.(?:down_proj|w2|c_proj|fc_out|fc2)\.weight$",
        FFN_DOWN,
    ),
]
76
+
77
+ # Top-level (non-per-block) rules.
78
# Each entry pairs a regex for a complete parameter name with the canonical
# GGUF name it maps to.  These are checked with ``re.match`` before any
# per-block rule is consulted.
_TOP_LEVEL_RULES: list[tuple[str, str]] = [
    # token embedding table
    (
        r"^(?:model\.)?(?:tok_embeddings|embed_tokens|wte|embeddings?\.word_embeddings)\.weight$",
        TOK_EMBD,
    ),
    # final normalisation applied after the last block
    (
        r"^(?:model\.)?(?:norm|ln_f|final_layernorm|output_norm)\.weight$",
        OUTPUT_NORM,
    ),
    # output (LM head) projection
    (r"^(?:lm_head|output|embed_out)\.weight$", OUTPUT),
]
83
+
84
+
85
@dataclass
class ArchInfo:
    """Architectural dimensions recovered from a model's state dict.

    Every numeric field defaults to ``0``, meaning "not determined"; a
    default-constructed instance therefore describes an empty model.
    """

    # Number of transformer blocks.
    n_layers: int = 0
    # Hidden (embedding) width.
    n_embd: int = 0
    # Number of attention heads.
    n_head: int = 0
    # Number of key/value heads.
    n_head_kv: int = 0
    # Feed-forward inner width.
    n_ff: int = 0
    # Number of rows in the token embedding / output matrix.
    vocab_size: int = 0
    # Block indices that were actually observed; a fresh list per instance.
    layer_indices: list[int] = field(default_factory=list)
    # Whether input and output embeddings share weights.
    tied_embeddings: bool = False
97
+
98
+
99
def _match_layer_index(name: str) -> tuple[int, str] | None:
    """Split a leading layer prefix off a parameter name.

    The patterns in ``_LAYER_PREFIXES`` are tried in order against the start
    of ``name``.

    Returns:
        ``(index, tail)`` for the first pattern that matches, where ``index``
        is the captured block number and ``tail`` is everything after the
        matched prefix; ``None`` when no pattern matches.
    """
    attempts = (re.match(prefix, name) for prefix in _LAYER_PREFIXES)
    found = next((m for m in attempts if m is not None), None)
    if found is None:
        return None
    remainder = name[found.end():]
    return int(found.group("i")), remainder
105
+
106
+
107
def map_tensor_name(name: str) -> str | None:
    """Translate a PyTorch parameter name into its GGUF tensor name.

    Top-level rules are consulted first.  If none applies, the name must
    start with a recognised layer prefix, and the remainder is then matched
    in full against the per-block rules; the block index is substituted into
    the matching template.

    Returns:
        The GGUF tensor name, or ``None`` when no rule applies (the caller
        may still emit the tensor under its original name).
    """
    top_level = next(
        (
            canonical
            for pattern, canonical in _TOP_LEVEL_RULES
            if re.match(pattern, name)
        ),
        None,
    )
    if top_level is not None:
        return top_level

    located = _match_layer_index(name)
    if located is None:
        return None

    block_index, remainder = located
    for pattern, template in _PER_BLOCK_RULES:
        if re.fullmatch(pattern, remainder) is not None:
            return template.format(i=block_index)
    return None
124
+
125
+
126
def infer_arch(
    state_dict: dict[str, object],
    hint_n_head: int | None = None,
) -> ArchInfo:
    """Inspect tensor shapes to infer basic architectural dimensions.

    Works for any layer count / hidden size.  Entries without a ``shape``
    attribute are ignored.

    Args:
        state_dict: Mapping from parameter name to a tensor-like object.
        hint_n_head: Attention-head count to use as-is when it is a positive
            integer; ``None`` or a non-positive value triggers the guess
            described below.

    Returns:
        An ``ArchInfo`` populated as follows:

        * ``layer_indices`` / ``n_layers`` -- the sorted block indices found
          in parameter names, and the highest index plus one (0 if none).
        * ``vocab_size`` / ``n_embd`` -- the two dimensions of the token
          embedding matrix, falling back to a ``lm_head.weight`` matrix.
        * ``n_ff`` -- the largest first dimension among 2-D gate/up-style
          feed-forward matrices, or ``4 * n_embd`` if none is found.
        * ``n_head`` -- ``hint_n_head`` when usable, otherwise
          ``n_embd // 64`` clamped to at least 1 (left at 0 when ``n_embd``
          is unknown).
        * ``n_head_kv`` -- always a copy of ``n_head``.

        ``tied_embeddings`` is not assigned here and keeps its default.
    """
    # 2-D tables whose names contain "embed" but which are not the token
    # embedding.  Letting them through would overwrite the vocabulary size
    # with the context length or the number of token types.
    non_token_markers = ("position", "pos_emb", "embed_pos", "token_type")
    # Substrings identifying feed-forward matrices whose first dimension is
    # taken as the inner feed-forward width.
    ffn_markers = ("gate_proj", "up_proj", ".w1.", ".w3.", "fc1", "fc_in")

    info = ArchInfo()
    layer_ids: set[int] = set()
    ffn_dim = 0
    hidden: int | None = None
    vocab: int | None = None

    for name, tensor in state_dict.items():
        if not hasattr(tensor, "shape"):
            continue
        shape = tuple(tensor.shape)

        hit = _match_layer_index(name)
        if hit is not None:
            layer_ids.add(hit[0])

        # Every remaining heuristic only looks at matrices.
        if len(shape) != 2:
            continue

        lower = name.lower()
        # ``wte`` is accepted as well so that this agrees with the token
        # embedding names recognised by the top-level mapping rules.
        embedding_like = ("embed" in lower and "weight" in lower) or (
            lower == "wte.weight" or lower.endswith(".wte.weight")
        )
        if embedding_like and not any(m in lower for m in non_token_markers):
            vocab, hidden = shape[0], shape[1]
        elif lower.endswith("lm_head.weight"):
            # Only a fallback: never override a token embedding seen earlier.
            vocab = vocab or shape[0]
            hidden = hidden or shape[1]

        if any(marker in lower for marker in ffn_markers):
            ffn_dim = max(ffn_dim, shape[0])

    info.layer_indices = sorted(layer_ids)
    info.n_layers = (max(layer_ids) + 1) if layer_ids else 0
    info.n_embd = int(hidden or 0)
    info.vocab_size = int(vocab or 0)
    info.n_ff = int(ffn_dim or (4 * info.n_embd if info.n_embd else 0))

    if hint_n_head is not None and hint_n_head > 0:
        info.n_head = hint_n_head
    elif info.n_embd:
        # No usable hint: assume 64-wide heads, but never fewer than one head.
        info.n_head = max(1, info.n_embd // 64)
    info.n_head_kv = info.n_head
    return info
170
+
171
+
172
def iter_state_dict_names(state_dict_keys: Iterable[str]) -> list[str]:
    """Collect the given keys into a new list in ascending sorted order.

    Sorting makes the output order independent of the order in which the
    keys were supplied, so repeated runs produce the same sequence.
    """
    ordered = list(state_dict_keys)
    ordered.sort()
    return ordered