amalia 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amalia-0.2.0.dist-info/METADATA +31 -0
- amalia-0.2.0.dist-info/RECORD +6 -0
- amalia-0.2.0.dist-info/WHEEL +4 -0
- amalia_core/__init__.py +4 -0
- amalia_core/architecture.py +134 -0
- amalia_core/config.py +19 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: amalia
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Plain-PyTorch implementation of the AMALIA decoder-only transformer architecture
|
|
5
|
+
Requires-Python: >=3.14
|
|
6
|
+
Requires-Dist: numpy>=2.5.0
|
|
7
|
+
Requires-Dist: torch>=2.12.1
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
|
|
10
|
+
# AMALIA
|
|
11
|
+
|
|
12
|
+
AMALIA is a decoder-only transformer architecture inherited from EuroLLM-9B, implemented in plain PyTorch (no external attention kernels).
|
|
13
|
+
|
|
14
|
+
## Usage in Google Colab
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
!pip install amalia
|
|
18
|
+
|
|
19
|
+
import torch
|
|
20
|
+
from amalia_core import AmaliaConfig, AmaliaForCausalLM
|
|
21
|
+
|
|
22
|
+
# Initialize the model with random weights (the default config has ~9B parameters, about 37 GB in float32)
|
|
23
|
+
config = AmaliaConfig()
|
|
24
|
+
model = AmaliaForCausalLM(config)
|
|
25
|
+
|
|
26
|
+
# Run a forward pass on random token ids
|
|
27
|
+
input_ids = torch.randint(0, config.vocab_size, (1, 16))
|
|
28
|
+
logits = model(input_ids)
|
|
29
|
+
|
|
30
|
+
print(logits.shape) # torch.Size([1, 16, 128000])
|
|
31
|
+
```
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
amalia_core/__init__.py,sha256=un23WcEPxoRbJguj7UV_ID0C29H_aPw42-V7XKXXpug,126
|
|
2
|
+
amalia_core/architecture.py,sha256=G_bELz6Tz3_wjDgU0qfR2WZReJE46_RYpC4VKCk74LQ,5434
|
|
3
|
+
amalia_core/config.py,sha256=5m8KXw7WMoLt0LHJS0dxx3cYyiCZG-CiPnvEKhqJNiA,513
|
|
4
|
+
amalia-0.2.0.dist-info/METADATA,sha256=1aGOKUuyo7jCVSegurjfTiihPO-vIuBZIgPW57cR3OQ,766
|
|
5
|
+
amalia-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
6
|
+
amalia-0.2.0.dist-info/RECORD,,
|
amalia_core/architecture.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import torch.nn as nn
|
|
3
|
+
import torch.nn.functional as F
|
|
4
|
+
|
|
5
|
+
from .config import AmaliaConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RMSNorm(nn.Module):
    """Root-mean-square normalisation with a learnable per-channel gain.

    The input is divided by the root mean square of its last dimension and
    then multiplied element-wise by ``weight`` (initialised to ones).

    Args:
        hidden_size: Length of the last dimension; also the size of ``weight``.
        eps: Constant added to the mean square before the reciprocal square
            root so an all-zero input does not divide by zero.
    """

    # Dtypes whose range/precision is too small to square activations safely.
    _LOW_PRECISION_DTYPES = (torch.float16, torch.bfloat16)

    def __init__(self, hidden_size: int, eps: float):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` over its last dimension and apply the gain.

        Half-precision inputs are upcast to float32 while the statistics are
        computed: squaring a float16 value above ~255 overflows to ``inf``,
        which would make the reciprocal square root zero and silently wipe
        out the activations. The normalised tensor is cast back to the input
        dtype before the gain is applied, so the result dtype is the same as
        it would be without the upcast.

        Args:
            x: Tensor whose last dimension has ``hidden_size`` elements.

        Returns:
            The normalised tensor multiplied by ``weight``.
        """
        input_dtype = x.dtype
        if input_dtype in self._LOW_PRECISION_DTYPES:
            x = x.to(torch.float32)
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        x = x * torch.rsqrt(mean_square + self.eps)
        return self.weight * x.to(input_dtype)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RotaryEmbedding(nn.Module):
    """Produces the cosine/sine tables used for rotary position embeddings.

    Args:
        head_dim: Per-head feature size; one inverse frequency is created for
            every second index in ``range(0, head_dim, 2)``.
        theta: Base of the geometric frequency progression.
    """

    def __init__(self, head_dim: int, theta: float):
        super().__init__()
        exponents = torch.arange(0, head_dim, 2).float() / head_dim
        # Non-persistent: the buffer follows the module but is left out of
        # the state dict.
        self.register_buffer("inv_freq", 1.0 / (theta ** exponents), persistent=False)

    def forward(self, seq_len: int, device, dtype):
        """Build the tables for positions ``0 .. seq_len - 1``.

        Args:
            seq_len: Number of positions to cover.
            device: Device on which the tables are created.
            dtype: Dtype the returned tables are cast to.

        Returns:
            A ``(cos, sin)`` pair. Each has one row per position, and each row
            holds the per-frequency angles twice, back to back.
        """
        inv_freq = self.inv_freq.to(device)
        steps = torch.arange(seq_len, device=device, dtype=torch.float32)
        angles = torch.outer(steps, inv_freq)
        table = torch.cat([angles, angles], dim=-1)
        cos_table = table.cos().to(dtype)
        sin_table = table.sin().to(dtype)
        return cos_table, sin_table
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Swap the two halves of the last dimension, negating the second half.

    For a last dimension ``[a, b]`` (two chunks) the result is ``[-b, a]``.
    """
    front, back = torch.chunk(x, 2, dim=-1)
    return torch.cat([back.neg(), front], dim=-1)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    """Rotate query and key tensors with the given cosine/sine tables.

    Two leading singleton axes are added to ``cos`` and ``sin`` so they
    broadcast against ``q`` and ``k``. Each tensor ``t`` is then mapped to
    ``t * cos + rotate_half(t) * sin``.

    Returns:
        The rotated ``(q, k)`` pair.
    """
    cos_b = cos[None, None, :, :]
    sin_b = sin[None, None, :, :]

    def _rotate(t: torch.Tensor) -> torch.Tensor:
        return (t * cos_b) + (rotate_half(t) * sin_b)

    return _rotate(q), _rotate(k)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every key/value head ``n_rep`` times along the head axis.

    Args:
        x: Tensor of shape ``(batch, n_kv_heads, seq_len, head_dim)``.
        n_rep: Number of copies of each head; ``1`` returns ``x`` itself.

    Returns:
        Tensor of shape ``(batch, n_kv_heads * n_rep, seq_len, head_dim)``
        in which the copies of a given head are adjacent.
    """
    bsz, kv_heads, length, dim = x.shape
    if n_rep == 1:
        return x
    # Add a repeat axis right after the head axis, broadcast it, then fold
    # it back into the head axis.
    widened = x.unsqueeze(2).expand(bsz, kv_heads, n_rep, length, dim)
    return widened.reshape(bsz, kv_heads * n_rep, length, dim)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class GroupedQueryAttention(nn.Module):
    """Causal self-attention in which groups of query heads share a key/value head.

    Head counts and the per-head size are read from ``config``. All four
    projections are bias-free linear layers.
    """

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        hidden = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        # Number of query heads served by each key/value head.
        self.num_kv_groups = self.num_heads // self.num_kv_heads
        self.head_dim = config.head_dim

        q_width = self.num_heads * self.head_dim
        kv_width = self.num_kv_heads * self.head_dim
        # Layers are created in q, k, v, o order so that seeded weight
        # initialisation draws random numbers in the same sequence.
        self.q_proj = nn.Linear(hidden, q_width, bias=False)
        self.k_proj = nn.Linear(hidden, kv_width, bias=False)
        self.v_proj = nn.Linear(hidden, kv_width, bias=False)
        self.o_proj = nn.Linear(q_width, hidden, bias=False)

    def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        """Attend over ``x`` with a causal mask.

        Args:
            x: Three-dimensional input, unpacked as ``(batch, seq_len, hidden)``.
            cos: Rotary cosine table passed to ``apply_rotary_pos_emb``.
            sin: Rotary sine table passed to ``apply_rotary_pos_emb``.

        Returns:
            The output of ``o_proj`` applied to the merged attention heads.
        """
        batch, seq_len, _ = x.shape

        def split_heads(projected: torch.Tensor, n_heads: int) -> torch.Tensor:
            # (batch, seq_len, n_heads * head_dim) -> (batch, n_heads, seq_len, head_dim)
            return projected.view(batch, seq_len, n_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_proj(x), self.num_heads)
        k = split_heads(self.k_proj(x), self.num_kv_heads)
        v = split_heads(self.v_proj(x), self.num_kv_heads)

        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        # Expand keys/values so every query head has a matching key/value head.
        k = repeat_kv(k, self.num_kv_groups)
        v = repeat_kv(v, self.num_kv_groups)

        context = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        merged = context.transpose(1, 2).reshape(batch, seq_len, self.num_heads * self.head_dim)
        return self.o_proj(merged)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class SwiGLU(nn.Module):
    """Gated feed-forward block: ``down_proj(silu(gate_proj(x)) * up_proj(x))``.

    All three projections are bias-free linear layers sized from
    ``config.hidden_size`` and ``config.intermediate_size``.
    """

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        hidden = config.hidden_size
        inner = config.intermediate_size
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated feed-forward transformation to ``x``."""
        gate = F.silu(self.gate_proj(x))
        value = self.up_proj(x)
        return self.down_proj(gate * value)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class DecoderLayer(nn.Module):
    """One transformer block: pre-norm attention and pre-norm MLP, each with a residual."""

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        width = config.hidden_size
        eps = config.rms_norm_eps
        self.input_norm = RMSNorm(width, eps)
        self.attn = GroupedQueryAttention(config)
        self.post_attn_norm = RMSNorm(width, eps)
        self.mlp = SwiGLU(config)

    def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        """Run the attention sub-layer and then the MLP sub-layer.

        Each sub-layer sees a normalised copy of its input, and its output is
        added back onto the un-normalised input.
        """
        attn_out = self.attn(self.input_norm(x), cos, sin)
        hidden = x + attn_out
        mlp_out = self.mlp(self.post_attn_norm(hidden))
        return hidden + mlp_out
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class AmaliaModel(nn.Module):
    """Decoder stack: token embedding, ``num_hidden_layers`` decoder layers, final norm."""

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.rotary_emb = RotaryEmbedding(config.head_dim, config.rope_theta)
        self.layers = nn.ModuleList()
        for _ in range(config.num_hidden_layers):
            self.layers.append(DecoderLayer(config))
        self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Map token ids to final normalised hidden states.

        Args:
            input_ids: Integer token ids; the size of dimension 1 is used as
                the sequence length for the rotary tables.

        Returns:
            Hidden states after the last decoder layer and the final norm.
        """
        hidden = self.embed_tokens(input_ids)
        seq_len = input_ids.shape[1]
        # The rotary tables are built once and shared by every layer.
        cos, sin = self.rotary_emb(seq_len, hidden.device, hidden.dtype)
        for block in self.layers:
            hidden = block(hidden, cos, sin)
        return self.norm(hidden)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class AmaliaForCausalLM(nn.Module):
    """Decoder stack with a bias-free linear head producing vocabulary logits.

    When ``config.tie_word_embeddings`` is true, the head shares its weight
    matrix with the token embedding; otherwise the two are independent
    parameters.
    """

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        self.model = AmaliaModel(config)
        # The head is always constructed first so that seeded initialisation
        # draws the same random numbers whether or not the weights are tied.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Honour the config flag. Both matrices have shape
        # (vocab_size, hidden_size), so the embedding parameter can be reused
        # directly as the head weight.
        if getattr(config, "tie_word_embeddings", False):
            self.lm_head.weight = self.model.embed_tokens.weight

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return the logits that ``lm_head`` produces from the decoder's hidden states.

        Args:
            input_ids: Integer token ids forwarded to the decoder stack.

        Returns:
            Logits whose last dimension has ``config.vocab_size`` entries.
        """
        hidden_states = self.model(input_ids)
        return self.lm_head(hidden_states)
|
amalia_core/config.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
@dataclass
class AmaliaConfig:
    """Hyper-parameters of the AMALIA decoder-only transformer.

    Raises:
        ValueError: If ``num_key_value_heads`` is not positive or does not
            evenly divide ``num_attention_heads``.
    """

    # Number of token ids: rows of the embedding table / width of the logits.
    vocab_size: int = 128_000
    # Width of the residual stream.
    hidden_size: int = 4096
    # Inner width of the feed-forward block.
    intermediate_size: int = 12_288
    # Number of stacked decoder layers.
    num_hidden_layers: int = 42
    # Number of query heads.
    num_attention_heads: int = 32
    # Number of key/value heads shared by groups of query heads.
    num_key_value_heads: int = 8
    # NOTE(review): not read by the model code in this package diff;
    # presumably the intended maximum context length — confirm.
    max_position_embeddings: int = 32_768
    # Base of the rotary-embedding frequency progression.
    rope_theta: float = 1_000_000.0
    # Epsilon added inside the RMS normalisation.
    rms_norm_eps: float = 1e-5
    # Whether the output projection should share the token-embedding matrix.
    tie_word_embeddings: bool = False

    def __post_init__(self) -> None:
        """Reject head counts that cannot be grouped evenly.

        Grouped-query attention assigns ``num_attention_heads //
        num_key_value_heads`` query heads to each key/value head. Checking
        the ratio here gives a clear error at construction time instead of a
        division-by-zero or tensor-shape failure later in the model.
        """
        if self.num_key_value_heads <= 0:
            raise ValueError(
                f"num_key_value_heads must be positive, got {self.num_key_value_heads}"
            )
        if self.num_attention_heads % self.num_key_value_heads != 0:
            raise ValueError(
                f"num_attention_heads ({self.num_attention_heads}) must be divisible by "
                f"num_key_value_heads ({self.num_key_value_heads})"
            )

    @property
    def head_dim(self) -> int:
        """Per-head feature size: ``hidden_size // num_attention_heads``."""
        return self.hidden_size // self.num_attention_heads
|