toy 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/Makefile +211 -5
- data/README.md +1 -1
- data/lib/toy/compute.rb +9 -0
- data/lib/toy/compute_cuda.rb +8 -0
- data/lib/toy/compute_metal.rb +17 -0
- data/lib/toy/core/cli/new.rb +8 -0
- data/lib/toy/ffi/tinynn.rb +19 -0
- data/lib/toy/ffi/tinynn_cuda.rb +7 -0
- data/lib/toy/ffi/tinynn_metal.rb +5 -0
- data/lib/toy/llm/archs/layer_spec.rb +39 -0
- data/lib/toy/llm/archs/llama_arch.rb +62 -1
- data/lib/toy/llm/archs/llama_arch_cuda.rb +62 -1
- data/lib/toy/llm/archs/llama_arch_metal.rb +62 -1
- data/lib/toy/llm/blocks/gdn_block.rb +176 -0
- data/lib/toy/llm/engine/gpt2_kv_engine.rb +11 -0
- data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +11 -0
- data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +11 -0
- data/lib/toy/llm/engine/llama_kv_engine.rb +10 -2
- data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +10 -2
- data/lib/toy/llm/engine/llama_kv_engine_metal.rb +10 -2
- data/lib/toy/llm/engine/llama_seq_engine.rb +16 -1
- data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +16 -1
- data/lib/toy/llm/engine/llama_seq_engine_metal.rb +16 -1
- data/lib/toy/llm/primitives/depth_scale.rb +33 -0
- data/lib/toy/llm/primitives/diff_attention.rb +71 -0
- data/lib/toy/llm/primitives/gdn.rb +188 -0
- data/lib/toy/llm/primitives/scalable_softmax.rb +37 -0
- data/lib/toy/run/eval_metal.rb +12 -0
- data/lib/toy/run/infer_metal.rb +19 -0
- data/lib/toy/run/train_gpt2_metal.rb +7 -0
- data/lib/toy/run/train_hybrid.rb +232 -0
- data/lib/toy/run/train_metal.rb +10 -0
- data/lib/toy/version.rb +4 -3
- data/tinynn/tinynn_backend_cuda.c +22 -0
- data/tinynn/tinynn_ggml.c +231 -0
- metadata +9 -2
data/tinynn/tinynn_ggml.c
CHANGED
|
@@ -90,6 +90,68 @@ static tnn_engine *g_engine_cpu = NULL;
|
|
|
90
90
|
static tnn_engine *g_engine_cuda[TNN_MAX_CUDA_DEVICES] = { NULL };
|
|
91
91
|
static tnn_engine *g_engine_metal = NULL;
|
|
92
92
|
|
|
93
|
+
/* toy#90 — Metal residency-set teardown drain.
|
|
94
|
+
*
|
|
95
|
+
* The ggml-metal backend keeps a process-lifetime residency-set
|
|
96
|
+
* collection on its (singleton) device. Each live Metal buffer adds
|
|
97
|
+
* itself to that collection on alloc and removes itself on free
|
|
98
|
+
* (vendor/ggml/src/ggml-metal/ggml-metal-device.m:1491/1594). The
|
|
99
|
+
* device is freed ONLY by a C++ static destructor at process exit
|
|
100
|
+
* (ggml-metal-device.cpp:12-26), at which point ggml asserts the
|
|
101
|
+
* collection is empty (ggml-metal-device.m:618 — a deliberate "you
|
|
102
|
+
* leaked GPU resources" guard). toy's Metal runners (lib/toy/run/)
|
|
103
|
+
* and the consumer scaffold (Toy::Device, lib/toy/compute_metal.rb)
|
|
104
|
+
* historically rely on process exit to reclaim everything and never
|
|
105
|
+
* call tnn_session_free, so a session's weights_buf is still alive at
|
|
106
|
+
* exit → the residency set is non-empty → SIGABRT (exit 134), AFTER
|
|
107
|
+
* compute already produced correct output. The CLI subprocess paths
|
|
108
|
+
* mask this with GGML_METAL_NO_RESIDENCY=1, but a directly-run
|
|
109
|
+
* consumer binary gets no such env (toy#27 runs 3-4).
|
|
110
|
+
*
|
|
111
|
+
* Fix: track every live Metal session so tnn_shutdown_engines can free
|
|
112
|
+
* them (draining their weights_buf, hence the residency set) before the
|
|
113
|
+
* static destructor runs. The registry is METAL-ONLY by construction —
|
|
114
|
+
* tnn_metal_session_register is called only for sessions whose engine is g_engine_metal
|
|
115
|
+
* — so CPU and CUDA session/teardown semantics are byte-for-byte
|
|
116
|
+
* unchanged (their sessions are never registered, never drained here). */
|
|
117
|
+
/* Fixed-capacity table of live Metal sessions (toy#90).
 *
 * g_metal_sessions[0 .. g_metal_session_count-1] holds the tracked session
 * pointers with no holes; every slot at or past the count is NULL. The table
 * is read directly by the shutdown drain, so the names and layout of these
 * globals are part of the file-internal contract. */
#define TNN_MAX_METAL_SESSIONS 256
static void *g_metal_sessions[TNN_MAX_METAL_SESSIONS] = { NULL };
static int g_metal_session_count = 0;

/* Return the slot index that currently holds `sess`, or -1 if untracked. */
static int tnn_metal_session_find(const void *sess)
{
    for (int i = 0; i < g_metal_session_count; ++i) {
        if (g_metal_sessions[i] == sess) {
            return i;
        }
    }
    return -1;
}

/* Start tracking `sess`.
 *
 * Idempotent: NULL and already-tracked pointers are ignored. A duplicate
 * entry would survive the first unregister and be handed to the drain a
 * second time (double free), and a NULL entry would only waste a slot.
 *
 * When the table is full the session is NOT tracked; a warning is written to
 * stderr so the omission is never silent. */
static void tnn_metal_session_register(void *sess)
{
    if (!sess) {
        return;
    }
    /* Checked before the capacity test so re-registering a tracked session
     * on a full table does not emit a spurious warning. */
    if (tnn_metal_session_find(sess) >= 0) {
        return;
    }
    if (g_metal_session_count >= TNN_MAX_METAL_SESSIONS) {
        /* Fail loud (never silently drop): an undrained session leaks a
         * residency set and re-trips the device-free assert. */
        fprintf(stderr,
                "[tnn] WARNING: more than %d live Metal sessions; "
                "tnn_shutdown_engines cannot track this one and the "
                "ggml-metal device-free residency assert may fire at "
                "exit. Bump TNN_MAX_METAL_SESSIONS or free sessions "
                "explicitly with tnn_session_free.\n",
                TNN_MAX_METAL_SESSIONS);
        return;
    }
    g_metal_sessions[g_metal_session_count++] = sess;
}

/* Stop tracking `sess`. No-op when `sess` is not in the table, so it is safe
 * to call for sessions that were never registered and to call repeatedly.
 * Order of the remaining entries is not preserved. */
static void tnn_metal_session_unregister(void *sess)
{
    int slot = tnn_metal_session_find(sess);
    if (slot < 0) {
        return;
    }
    /* Compact by moving the tail entry into the hole, then clear the tail. */
    int last = g_metal_session_count - 1;
    g_metal_sessions[slot] = g_metal_sessions[last];
    g_metal_sessions[last] = NULL;
    g_metal_session_count = last;
}
|
|
150
|
+
|
|
151
|
+
/* Forward decl: tnn_session_free is defined further down; the drain in
|
|
152
|
+
* tnn_shutdown_engines needs it. */
|
|
153
|
+
void tnn_session_free(void *sess);
|
|
154
|
+
|
|
93
155
|
/* CUDA backend init with device selection. Weak stub returns NULL;
|
|
94
156
|
* strong override lives in tinynn_backend_cuda.c. */
|
|
95
157
|
__attribute__((weak))
|
|
@@ -199,6 +261,24 @@ static tnn_engine *tnn_engine_get_on(int backend_kind, int device)
|
|
|
199
261
|
* GPU between phases. */
|
|
200
262
|
void tnn_shutdown_engines(void)
|
|
201
263
|
{
|
|
264
|
+
/* toy#90 — drain any live Metal sessions FIRST. Each session_free
|
|
265
|
+
* frees s->weights_buf, whose ggml-metal buffer removes itself from
|
|
266
|
+
* the device's residency-set collection; only once every Metal
|
|
267
|
+
* buffer is freed is the device-free assert (ggml-metal-device.m:618)
|
|
268
|
+
* satisfied. tnn_session_free unregisters as it goes, so we always
|
|
269
|
+
* drain index 0 until the list is empty (no iterator invalidation).
|
|
270
|
+
* Metal-only: CPU/CUDA sessions are never registered. */
|
|
271
|
+
while (g_metal_session_count > 0) {
|
|
272
|
+
void *sess = g_metal_sessions[0];
|
|
273
|
+
if (!sess) { /* defensive: drop a stale NULL slot */
|
|
274
|
+
g_metal_sessions[0] = g_metal_sessions[g_metal_session_count - 1];
|
|
275
|
+
g_metal_sessions[g_metal_session_count - 1] = NULL;
|
|
276
|
+
--g_metal_session_count;
|
|
277
|
+
continue;
|
|
278
|
+
}
|
|
279
|
+
tnn_session_free(sess);
|
|
280
|
+
}
|
|
281
|
+
|
|
202
282
|
/* CPU + Metal: single slot each. */
|
|
203
283
|
tnn_engine **scalar_slots[] = { &g_engine_cpu, &g_engine_metal };
|
|
204
284
|
for (int i = 0; i < 2; ++i) {
|
|
@@ -387,6 +467,13 @@ void *tnn_session_new_on(int backend_kind, int device)
|
|
|
387
467
|
s->weights_map_base = NULL;
|
|
388
468
|
s->weights_map_size = 0;
|
|
389
469
|
s->last_graph = 0;
|
|
470
|
+
/* toy#90 — register Metal sessions so tnn_shutdown_engines can drain
|
|
471
|
+
* their residency-set-carrying buffers before the ggml-metal static
|
|
472
|
+
* destructor runs. Gated to the Metal engine: CPU/CUDA sessions are
|
|
473
|
+
* never tracked, keeping their lifecycle unchanged. */
|
|
474
|
+
if (e == g_engine_metal) {
|
|
475
|
+
tnn_metal_session_register((void *)s);
|
|
476
|
+
}
|
|
390
477
|
return (void *)s;
|
|
391
478
|
}
|
|
392
479
|
|
|
@@ -394,6 +481,9 @@ void tnn_session_free(void *sess)
|
|
|
394
481
|
{
|
|
395
482
|
if (!sess) return;
|
|
396
483
|
tnn_session *s = (tnn_session *)sess;
|
|
484
|
+
/* toy#90 — drop from the Metal drain registry (no-op for CPU/CUDA
|
|
485
|
+
* sessions, which were never registered). Idempotent. */
|
|
486
|
+
tnn_metal_session_unregister(sess);
|
|
397
487
|
if (s->weights_buf) ggml_backend_buffer_free(s->weights_buf);
|
|
398
488
|
if (s->weights_buf_mmap) ggml_backend_buffer_free(s->weights_buf_mmap);
|
|
399
489
|
if (s->ctx) ggml_free(s->ctx);
|
|
@@ -815,6 +905,38 @@ void *tnn_ssm_scan(void *sess, void *state, void *x, void *dt,
|
|
|
815
905
|
(struct ggml_tensor *)ids);
|
|
816
906
|
}
|
|
817
907
|
|
|
908
|
+
/* Gated DeltaNet recurrence core (Dragon / Qwen3-Next family). The q/k/v/g/beta
|
|
909
|
+
* projections + the short causal conv are built by the Ruby GDN primitive; this
|
|
910
|
+
* is the fused recurrence op only. Shapes (all F32): v=[S_v,H,T,B];
|
|
911
|
+
* q,k contiguous-rows; g=[1|S_v,H,T,B]; beta ne0==1; state=[S_v*S_v*H,K,B,1].
|
|
912
|
+
* out=[S_v*H, T*B + K*S_v*B, 1, 1] (token outputs then trailing state snapshots).
|
|
913
|
+
* Forward only in ggml — training backward is a separate hand-written kernel. */
|
|
914
|
+
void *tnn_gated_delta_net(void *sess, void *q, void *k, void *v,
|
|
915
|
+
void *g, void *beta, void *state)
|
|
916
|
+
{
|
|
917
|
+
if (!sess || !q || !k || !v || !g || !beta || !state) return NULL;
|
|
918
|
+
tnn_session *s = (tnn_session *)sess;
|
|
919
|
+
return (void *)ggml_gated_delta_net(s->ctx,
|
|
920
|
+
(struct ggml_tensor *)q,
|
|
921
|
+
(struct ggml_tensor *)k,
|
|
922
|
+
(struct ggml_tensor *)v,
|
|
923
|
+
(struct ggml_tensor *)g,
|
|
924
|
+
(struct ggml_tensor *)beta,
|
|
925
|
+
(struct ggml_tensor *)state);
|
|
926
|
+
}
|
|
927
|
+
|
|
928
|
+
/* 1-D convolution (kernel a, data b) with stride/pad/dilation — the short
|
|
929
|
+
* causal conv inside a GDN block (also generally useful). */
|
|
930
|
+
void *tnn_conv_1d(void *sess, void *a, void *b, int s0, int p0, int d0)
|
|
931
|
+
{
|
|
932
|
+
if (!sess || !a || !b) return NULL;
|
|
933
|
+
tnn_session *s = (tnn_session *)sess;
|
|
934
|
+
return (void *)ggml_conv_1d(s->ctx,
|
|
935
|
+
(struct ggml_tensor *)a,
|
|
936
|
+
(struct ggml_tensor *)b,
|
|
937
|
+
s0, p0, d0);
|
|
938
|
+
}
|
|
939
|
+
|
|
818
940
|
void *tnn_gelu(void *sess, void *a)
|
|
819
941
|
{
|
|
820
942
|
if (!sess || !a) return NULL;
|
|
@@ -825,6 +947,101 @@ void *tnn_gelu(void *sess, void *a)
|
|
|
825
947
|
return (void *)ggml_gelu(s->ctx, (struct ggml_tensor *)a);
|
|
826
948
|
}
|
|
827
949
|
|
|
950
|
+
/* Unary/binary elementwise ops used to compose the GDN gate math, differential
|
|
951
|
+
* attention, and gated output norm (Dragon/GDN Phase 2). All thin ggml wraps. */
|
|
952
|
+
void *tnn_sigmoid(void *sess, void *a)
|
|
953
|
+
{
|
|
954
|
+
if (!sess || !a) return NULL;
|
|
955
|
+
tnn_session *s = (tnn_session *)sess;
|
|
956
|
+
return (void *)ggml_sigmoid(s->ctx, (struct ggml_tensor *)a);
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
void *tnn_exp(void *sess, void *a)
|
|
960
|
+
{
|
|
961
|
+
if (!sess || !a) return NULL;
|
|
962
|
+
tnn_session *s = (tnn_session *)sess;
|
|
963
|
+
return (void *)ggml_exp(s->ctx, (struct ggml_tensor *)a);
|
|
964
|
+
}
|
|
965
|
+
|
|
966
|
+
void *tnn_log(void *sess, void *a)
|
|
967
|
+
{
|
|
968
|
+
if (!sess || !a) return NULL;
|
|
969
|
+
tnn_session *s = (tnn_session *)sess;
|
|
970
|
+
return (void *)ggml_log(s->ctx, (struct ggml_tensor *)a);
|
|
971
|
+
}
|
|
972
|
+
|
|
973
|
+
void *tnn_neg(void *sess, void *a)
|
|
974
|
+
{
|
|
975
|
+
if (!sess || !a) return NULL;
|
|
976
|
+
tnn_session *s = (tnn_session *)sess;
|
|
977
|
+
return (void *)ggml_neg(s->ctx, (struct ggml_tensor *)a);
|
|
978
|
+
}
|
|
979
|
+
|
|
980
|
+
void *tnn_sub(void *sess, void *a, void *b)
|
|
981
|
+
{
|
|
982
|
+
if (!sess || !a || !b) return NULL;
|
|
983
|
+
tnn_session *s = (tnn_session *)sess;
|
|
984
|
+
return (void *)ggml_sub(s->ctx, (struct ggml_tensor *)a, (struct ggml_tensor *)b);
|
|
985
|
+
}
|
|
986
|
+
|
|
987
|
+
void *tnn_sqrt(void *sess, void *a)
|
|
988
|
+
{
|
|
989
|
+
/* elementwise sqrt; has ggml backward (GGML_OP_SQRT). Used to compose a
|
|
990
|
+
* backward-friendly L2 norm (L2_NORM itself has no ggml backward). */
|
|
991
|
+
if (!sess || !a) return NULL;
|
|
992
|
+
tnn_session *s = (tnn_session *)sess;
|
|
993
|
+
return (void *)ggml_sqrt(s->ctx, (struct ggml_tensor *)a);
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
void *tnn_repeat(void *sess, void *a, void *b)
|
|
997
|
+
{
|
|
998
|
+
/* ggml_repeat: broadcast `a` up to the shape of `b`. Has ggml backward
|
|
999
|
+
* (GGML_OP_REPEAT → repeat_back, which SUMS the grad back down to a's
|
|
1000
|
+
* shape). Used to materialise a broadcast operand explicitly so a later
|
|
1001
|
+
* same-shape op (e.g. DIV, whose backward does NOT reduce a broadcast
|
|
1002
|
+
* src1) sees matching shapes and the grad reduction happens through the
|
|
1003
|
+
* well-formed REPEAT backward instead. */
|
|
1004
|
+
if (!sess || !a || !b) return NULL;
|
|
1005
|
+
tnn_session *s = (tnn_session *)sess;
|
|
1006
|
+
return (void *)ggml_repeat(s->ctx, (struct ggml_tensor *)a,
|
|
1007
|
+
(struct ggml_tensor *)b);
|
|
1008
|
+
}
|
|
1009
|
+
|
|
1010
|
+
void *tnn_div(void *sess, void *a, void *b)
|
|
1011
|
+
{
|
|
1012
|
+
/* elementwise a/b with ggml broadcast (b repeats into a); has ggml backward
|
|
1013
|
+
* (GGML_OP_DIV). The divisor in the composed L2 is [1,H,T] broadcasting over
|
|
1014
|
+
* the [S_v,H,T] numerator. */
|
|
1015
|
+
if (!sess || !a || !b) return NULL;
|
|
1016
|
+
tnn_session *s = (tnn_session *)sess;
|
|
1017
|
+
return (void *)ggml_div(s->ctx, (struct ggml_tensor *)a, (struct ggml_tensor *)b);
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
/* L2-normalise rows along ne0 (q/k normalisation for the delta rule). */
|
|
1021
|
+
void *tnn_l2_norm(void *sess, void *a, double eps)
|
|
1022
|
+
{
|
|
1023
|
+
if (!sess || !a) return NULL;
|
|
1024
|
+
tnn_session *s = (tnn_session *)sess;
|
|
1025
|
+
return (void *)ggml_l2_norm(s->ctx, (struct ggml_tensor *)a, (float)eps);
|
|
1026
|
+
}
|
|
1027
|
+
|
|
1028
|
+
/* softplus(x) = log(1 + exp(x)) — the GDN log-decay gate. */
|
|
1029
|
+
void *tnn_softplus(void *sess, void *a)
|
|
1030
|
+
{
|
|
1031
|
+
if (!sess || !a) return NULL;
|
|
1032
|
+
tnn_session *s = (tnn_session *)sess;
|
|
1033
|
+
return (void *)ggml_softplus(s->ctx, (struct ggml_tensor *)a);
|
|
1034
|
+
}
|
|
1035
|
+
|
|
1036
|
+
/* scale + bias: s*x + b (compile-time scalars). Used for SSMax (s*log n + b)
|
|
1037
|
+
* and depth scaling. */
|
|
1038
|
+
void *tnn_scale_bias(void *sess, void *a, double s, double b)
|
|
1039
|
+
{
|
|
1040
|
+
if (!sess || !a) return NULL;
|
|
1041
|
+
tnn_session *sx = (tnn_session *)sess;
|
|
1042
|
+
return (void *)ggml_scale_bias(sx->ctx, (struct ggml_tensor *)a, (float)s, (float)b);
|
|
1043
|
+
}
|
|
1044
|
+
|
|
828
1045
|
void *tnn_rms_norm(void *sess, void *x, void *gamma_row, double eps)
|
|
829
1046
|
{
|
|
830
1047
|
if (!sess || !x || !gamma_row) return NULL;
|
|
@@ -1316,6 +1533,20 @@ void *tnn_input_1d_i32(void *sess, int n)
|
|
|
1316
1533
|
return (void *)ggml_new_tensor_1d(s->ctx, GGML_TYPE_I32, (int64_t)n);
|
|
1317
1534
|
}
|
|
1318
1535
|
|
|
1536
|
+
/* Persistent i32 input in ctx_w (#1449): a graph INPUT read across the
|
|
1537
|
+
* forward->backward boundary must not live in the galloc compute arena, where
|
|
1538
|
+
* galloc (seeing it as dead after the forward gather) frees its offset and
|
|
1539
|
+
* reuses it for the loss output -> backward get_rows reads loss bits as a wild
|
|
1540
|
+
* index. ctx_w is galloc-external and survives reset_for_rebuild. Allocated
|
|
1541
|
+
* before tnn_finalize_weights; re-uploaded each step. */
|
|
1542
|
+
void *tnn_input_1d_i32_persistent(void *sess, int n)
|
|
1543
|
+
{
|
|
1544
|
+
if (!sess || n <= 0) return NULL;
|
|
1545
|
+
tnn_session *s = (tnn_session *)sess;
|
|
1546
|
+
if (s->weights_finalized) return NULL;
|
|
1547
|
+
return (void *)ggml_new_tensor_1d(s->ctx_w, GGML_TYPE_I32, (int64_t)n);
|
|
1548
|
+
}
|
|
1549
|
+
|
|
1319
1550
|
void tnn_gelu_back_scratch(void *sess, int n)
|
|
1320
1551
|
{
|
|
1321
1552
|
if (!sess || n <= 0) return;
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: toy
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.8.0
|
|
4
|
+
version: 0.9.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ori Pekelman
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-06-
|
|
11
|
+
date: 2026-06-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: |-
|
|
14
14
|
Toy is a pure-Ruby transformer LM that compiles to a native binary
|
|
@@ -74,9 +74,11 @@ files:
|
|
|
74
74
|
- lib/toy/io/toy_events.rb
|
|
75
75
|
- lib/toy/io/toy_image_loader.rb
|
|
76
76
|
- lib/toy/llm/adamw.rb
|
|
77
|
+
- lib/toy/llm/archs/layer_spec.rb
|
|
77
78
|
- lib/toy/llm/archs/llama_arch.rb
|
|
78
79
|
- lib/toy/llm/archs/llama_arch_cuda.rb
|
|
79
80
|
- lib/toy/llm/archs/llama_arch_metal.rb
|
|
81
|
+
- lib/toy/llm/blocks/gdn_block.rb
|
|
80
82
|
- lib/toy/llm/blocks/transformer_block.rb
|
|
81
83
|
- lib/toy/llm/blocks/transformer_block_cuda.rb
|
|
82
84
|
- lib/toy/llm/blocks/transformer_block_metal.rb
|
|
@@ -98,6 +100,9 @@ files:
|
|
|
98
100
|
- lib/toy/llm/engine/llama_seq_engine_metal.rb
|
|
99
101
|
- lib/toy/llm/engine/vit_tiny_engine.rb
|
|
100
102
|
- lib/toy/llm/labels.rb
|
|
103
|
+
- lib/toy/llm/primitives/depth_scale.rb
|
|
104
|
+
- lib/toy/llm/primitives/diff_attention.rb
|
|
105
|
+
- lib/toy/llm/primitives/gdn.rb
|
|
101
106
|
- lib/toy/llm/primitives/gqa.rb
|
|
102
107
|
- lib/toy/llm/primitives/gqa_cuda.rb
|
|
103
108
|
- lib/toy/llm/primitives/gqa_metal.rb
|
|
@@ -107,6 +112,7 @@ files:
|
|
|
107
112
|
- lib/toy/llm/primitives/rope.rb
|
|
108
113
|
- lib/toy/llm/primitives/rope_cuda.rb
|
|
109
114
|
- lib/toy/llm/primitives/rope_metal.rb
|
|
115
|
+
- lib/toy/llm/primitives/scalable_softmax.rb
|
|
110
116
|
- lib/toy/llm/primitives/swiglu.rb
|
|
111
117
|
- lib/toy/llm/primitives/swiglu_cuda.rb
|
|
112
118
|
- lib/toy/llm/primitives/swiglu_metal.rb
|
|
@@ -146,6 +152,7 @@ files:
|
|
|
146
152
|
- lib/toy/run/train_gpt2.rb
|
|
147
153
|
- lib/toy/run/train_gpt2_cuda.rb
|
|
148
154
|
- lib/toy/run/train_gpt2_metal.rb
|
|
155
|
+
- lib/toy/run/train_hybrid.rb
|
|
149
156
|
- lib/toy/run/train_lora.rb
|
|
150
157
|
- lib/toy/run/train_lora_cuda.rb
|
|
151
158
|
- lib/toy/run/train_metal.rb
|