amalia 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: amalia
3
+ Version: 0.2.0
4
+ Summary: AMALIA: a decoder-only transformer following the EuroLLM-9B architecture, implemented in plain PyTorch
5
+ Requires-Python: >=3.14
6
+ Requires-Dist: numpy>=2.5.0
7
+ Requires-Dist: torch>=2.12.1
8
+ Description-Content-Type: text/markdown
9
+
10
+ # AMALIA
11
+
12
+ AMALIA is a decoder-only transformer architecture inherited from EuroLLM-9B, implemented in plain PyTorch (no external attention kernels).
13
+
14
+ ## Usage in Google Colab
15
+
16
+ ```python
17
+ !pip install amalia
18
+
19
+ import torch
20
+ from amalia_core import AmaliaConfig, AmaliaForCausalLM
21
+
22
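+ # Initialize the model with random weights (the default config has ~9B parameters, about 37 GB in float32; pass smaller sizes to AmaliaConfig if memory is limited)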
23
+ config = AmaliaConfig()
24
+ model = AmaliaForCausalLM(config)
25
+
26
+ # Run a forward pass on random token ids
27
+ input_ids = torch.randint(0, config.vocab_size, (1, 16))
28
+ logits = model(input_ids)
29
+
30
+ print(logits.shape) # torch.Size([1, 16, 128000])
31
+ ```
@@ -0,0 +1,6 @@
1
+ amalia_core/__init__.py,sha256=un23WcEPxoRbJguj7UV_ID0C29H_aPw42-V7XKXXpug,126
2
+ amalia_core/architecture.py,sha256=G_bELz6Tz3_wjDgU0qfR2WZReJE46_RYpC4VKCk74LQ,5434
3
+ amalia_core/config.py,sha256=5m8KXw7WMoLt0LHJS0dxx3cYyiCZG-CiPnvEKhqJNiA,513
4
+ amalia-0.2.0.dist-info/METADATA,sha256=1aGOKUuyo7jCVSegurjfTiihPO-vIuBZIgPW57cR3OQ,766
5
+ amalia-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
6
+ amalia-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,4 @@
1
+ from .config import AmaliaConfig
2
+ from .architecture import AmaliaForCausalLM
3
+
4
+ __all__ = ["AmaliaConfig", "AmaliaForCausalLM"]
@@ -0,0 +1,134 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from .config import AmaliaConfig
6
+
7
+
8
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learnable per-feature scale.

    The last dimension of the input is divided by its root mean square and
    the result is multiplied element-wise by ``weight`` (initialized to ones).

    Args:
        hidden_size: Size of the last (feature) dimension.
        eps: Constant added to the mean square before the reciprocal square
            root is taken.
    """

    def __init__(self, hidden_size: int, eps: float):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` normalized over its last dimension and scaled by ``weight``."""
        input_dtype = x.dtype
        # Squaring float16/bfloat16 activations can overflow (float16 tops out
        # at 65504) or lose precision, which zeroes or distorts the output.
        # Do the reduction in float32 for those dtypes only, so float32 and
        # float64 inputs are computed exactly as before.
        if input_dtype in (torch.float16, torch.bfloat16):
            x = x.to(torch.float32)
        variance = x.pow(2).mean(dim=-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.eps)
        # Cast back before scaling so the result dtype is still decided by the
        # usual promotion between ``weight`` and the original input dtype.
        return self.weight * x.to(input_dtype)
18
+
19
+
20
class RotaryEmbedding(nn.Module):
    """Builds the cosine and sine tables used for rotary position embeddings.

    Args:
        head_dim: Per-head feature size; one inverse frequency is created for
            every second index in ``range(0, head_dim, 2)``.
        theta: Base of the geometric frequency progression.
    """

    def __init__(self, head_dim: int, theta: float):
        super().__init__()
        exponents = torch.arange(0, head_dim, 2).float() / head_dim
        inv_freq = 1.0 / (theta ** exponents)
        # persistent=False keeps the buffer out of the module's state_dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, seq_len: int, device, dtype):
        """Return ``(cos, sin)`` tables for positions ``0 .. seq_len - 1``.

        Angles are computed in float32 and only the final tables are cast to
        ``dtype``. Each table has one row per position, with the per-frequency
        angles repeated twice along the last dimension.
        """
        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
        angles = torch.outer(positions, self.inv_freq.to(device))
        table = torch.cat((angles, angles), dim=-1)
        cos_table = table.cos().to(dtype)
        sin_table = table.sin().to(dtype)
        return cos_table, sin_table
31
+
32
+
33
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Swap the two halves of the last dimension, negating the second half.

    With ``x`` split along its last axis into ``[a, b]``, the result is
    ``[-b, a]``.
    """
    front, back = torch.chunk(x, 2, dim=-1)
    return torch.cat([back.neg(), front], dim=-1)
36
+
37
+
38
def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    """Apply rotary position embeddings to the query and key tensors.

    ``cos`` and ``sin`` are 2-D tables; two leading singleton axes are added
    so they broadcast against ``q`` and ``k``. Each tensor ``t`` is mapped to
    ``t * cos + rotate_half(t) * sin``.

    Returns:
        The rotated ``(q, k)`` pair.
    """
    cos_b = cos[None, None, :, :]
    sin_b = sin[None, None, :, :]

    def _rotate(t: torch.Tensor) -> torch.Tensor:
        return (t * cos_b) + (rotate_half(t) * sin_b)

    return _rotate(q), _rotate(k)
44
+
45
+
46
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every key/value head ``n_rep`` times along the head axis.

    ``x`` has shape ``(batch, n_kv_heads, seq_len, head_dim)``. The result has
    shape ``(batch, n_kv_heads * n_rep, seq_len, head_dim)``, with the copies
    of each head adjacent to one another. When ``n_rep == 1`` the input tensor
    itself is returned.
    """
    bsz, kv_heads, length, dim = x.shape
    if n_rep != 1:
        expanded = x.unsqueeze(2).expand(bsz, kv_heads, n_rep, length, dim)
        return expanded.reshape(bsz, kv_heads * n_rep, length, dim)
    return x
52
+
53
+
54
class GroupedQueryAttention(nn.Module):
    """Causal self-attention in which several query heads share one K/V head.

    Queries use ``num_attention_heads`` heads while keys and values use
    ``num_key_value_heads`` heads; each key/value head is repeated
    ``num_heads // num_kv_heads`` times before attention. All projections are
    bias-free.

    Args:
        config: Provides ``hidden_size``, ``num_attention_heads``,
            ``num_key_value_heads`` and ``head_dim``.

    Raises:
        ValueError: If ``num_key_value_heads`` is not positive or does not
            evenly divide ``num_attention_heads``.
    """

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        # Floor division would silently hide a bad ratio and only fail later
        # inside attention with a head-count mismatch, so reject it up front.
        if self.num_kv_heads <= 0 or self.num_heads % self.num_kv_heads != 0:
            raise ValueError(
                f"num_attention_heads ({self.num_heads}) must be a positive "
                f"multiple of num_key_value_heads ({self.num_kv_heads})"
            )
        self.num_kv_groups = self.num_heads // self.num_kv_heads
        self.head_dim = config.head_dim

        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)

    def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        """Run causal attention over ``x``.

        Args:
            x: Input of shape ``(batch, seq_len, hidden_size)``.
            cos: Rotary cosine table passed to ``apply_rotary_pos_emb``.
            sin: Rotary sine table passed to ``apply_rotary_pos_emb``.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden_size)``.
        """
        batch, seq_len, _ = x.shape

        # Project, then move the head axis ahead of the sequence axis:
        # (batch, seq, heads * dim) -> (batch, heads, seq, dim).
        q = self.q_proj(x).view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)

        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        # Expand K/V so that their head count matches the query head count.
        k = repeat_kv(k, self.num_kv_groups)
        v = repeat_kv(v, self.num_kv_groups)

        attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # Merge the heads back: (batch, heads, seq, dim) -> (batch, seq, heads * dim).
        attn_out = attn_out.transpose(1, 2).reshape(batch, seq_len, self.num_heads * self.head_dim)
        return self.o_proj(attn_out)
83
+
84
+
85
class SwiGLU(nn.Module):
    """Gated feed-forward block: ``down_proj(silu(gate_proj(x)) * up_proj(x))``.

    Args:
        config: Provides ``hidden_size`` (input/output width) and
            ``intermediate_size`` (inner width). All projections are bias-free.
    """

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        width, inner = config.hidden_size, config.intermediate_size
        self.gate_proj = nn.Linear(width, inner, bias=False)
        self.up_proj = nn.Linear(width, inner, bias=False)
        self.down_proj = nn.Linear(inner, width, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated MLP to ``x``; the last dimension keeps its size."""
        gate = F.silu(self.gate_proj(x))
        value = self.up_proj(x)
        return self.down_proj(gate * value)
94
+
95
+
96
class DecoderLayer(nn.Module):
    """One transformer block: normalized attention and MLP, each with a residual.

    Both sub-layers normalize their input with ``RMSNorm`` first and add their
    output back onto the running hidden state.
    """

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        width, eps = config.hidden_size, config.rms_norm_eps
        self.input_norm = RMSNorm(width, eps)
        self.attn = GroupedQueryAttention(config)
        self.post_attn_norm = RMSNorm(width, eps)
        self.mlp = SwiGLU(config)

    def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        """Return ``x`` after the attention and MLP residual updates.

        ``cos`` and ``sin`` are forwarded unchanged to the attention module.
        """
        attn_out = self.attn(self.input_norm(x), cos, sin)
        x = x + attn_out
        mlp_out = self.mlp(self.post_attn_norm(x))
        return x + mlp_out
108
+
109
+
110
class AmaliaModel(nn.Module):
    """Token embedding, a stack of decoder layers and a final ``RMSNorm``.

    Args:
        config: Provides ``vocab_size``, ``hidden_size``, ``head_dim``,
            ``rope_theta``, ``num_hidden_layers`` and ``rms_norm_eps``.
    """

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.rotary_emb = RotaryEmbedding(config.head_dim, config.rope_theta)
        blocks = [DecoderLayer(config) for _ in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(blocks)
        self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return the final normalized hidden states for ``input_ids``.

        The sequence length is read from ``input_ids.shape[1]``. The rotary
        tables are built once and shared by every layer.
        """
        hidden = self.embed_tokens(input_ids)
        seq_len = input_ids.shape[1]
        cos, sin = self.rotary_emb(seq_len, hidden.device, hidden.dtype)
        for block in self.layers:
            hidden = block(hidden, cos, sin)
        return self.norm(hidden)
124
+
125
+
126
class AmaliaForCausalLM(nn.Module):
    """``AmaliaModel`` followed by a bias-free linear projection to the vocabulary.

    Args:
        config: Model hyperparameters. When ``config.tie_word_embeddings`` is
            true, the output projection shares its weight with the token
            embedding; otherwise the two matrices are independent.
    """

    def __init__(self, config: AmaliaConfig):
        super().__init__()
        self.model = AmaliaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Honor the config flag, which was previously declared but never read.
        # Both weights have shape (vocab_size, hidden_size), so the parameter
        # can be shared directly. getattr keeps config objects that lack the
        # field working as before.
        if getattr(config, "tie_word_embeddings", False):
            self.lm_head.weight = self.model.embed_tokens.weight

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return logits with ``vocab_size`` entries per position of ``input_ids``."""
        hidden_states = self.model(input_ids)
        return self.lm_head(hidden_states)
amalia_core/config.py ADDED
@@ -0,0 +1,19 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class AmaliaConfig:
    """Hyperparameters for the AMALIA decoder-only transformer.

    Raises:
        ValueError: On construction, if a head count is not positive, if
            ``hidden_size`` is not a multiple of ``num_attention_heads``, if
            ``num_attention_heads`` is not a multiple of
            ``num_key_value_heads``, or if the resulting ``head_dim`` is odd.
    """

    # Number of token ids.
    vocab_size: int = 128_000
    # Width of the hidden state.
    hidden_size: int = 4096
    # Inner width of the feed-forward block.
    intermediate_size: int = 12_288
    # Number of stacked decoder layers.
    num_hidden_layers: int = 42
    # Number of query heads.
    num_attention_heads: int = 32
    # Number of key/value heads.
    num_key_value_heads: int = 8
    # Nominal maximum sequence length; nothing in this dataclass enforces it.
    max_position_embeddings: int = 32_768
    # Base of the rotary-embedding frequency progression.
    rope_theta: float = 1_000_000.0
    # Epsilon used by the RMS normalization layers.
    rms_norm_eps: float = 1e-5
    # Whether the output projection should share the embedding matrix.
    tie_word_embeddings: bool = False

    def __post_init__(self) -> None:
        # Fail at construction time instead of letting floor division hide a
        # bad ratio that only surfaces as a shape error deep in the model.
        if self.num_attention_heads <= 0 or self.num_key_value_heads <= 0:
            raise ValueError(
                "num_attention_heads and num_key_value_heads must be positive, "
                f"got {self.num_attention_heads} and {self.num_key_value_heads}"
            )
        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                f"hidden_size ({self.hidden_size}) must be divisible by "
                f"num_attention_heads ({self.num_attention_heads})"
            )
        if self.num_attention_heads % self.num_key_value_heads != 0:
            raise ValueError(
                f"num_attention_heads ({self.num_attention_heads}) must be divisible by "
                f"num_key_value_heads ({self.num_key_value_heads})"
            )
        # Rotary embeddings split each head into two equal halves.
        if self.head_dim % 2 != 0:
            raise ValueError(f"head_dim ({self.head_dim}) must be even")

    @property
    def head_dim(self) -> int:
        """Per-head feature size: ``hidden_size // num_attention_heads``."""
        return self.hidden_size // self.num_attention_heads