tensor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6)
  1. checksums.yaml +7 -0
  2. data/AGENTS.md +38 -0
  3. data/extconf.rb +148 -0
  4. data/lib/tensor.rb +6 -0
  5. data/ruby_matrix.c +1367 -0
  6. metadata +46 -0
data/ruby_matrix.c ADDED
@@ -0,0 +1,1367 @@
#include <ruby.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#ifdef _OPENMP
#include <omp.h>
#endif

// Define data types
typedef enum {
    DTYPE_FLOAT64,
    DTYPE_FLOAT32,
    DTYPE_INT16,
    DTYPE_INT8
} DataType;

// Define a Matrix/Tensor struct (order-2 today, extensible to higher orders)
typedef struct {
    size_t rows;
    size_t cols;
    void *data;     // Pointer to data (numeric storage)
    DataType dtype; // Data type of the tensor
    size_t rank;    // Tensor rank (2 for matrices)
    size_t *dims;   // Dimensions array of length `rank` (optional today)
} Matrix;

// Compute total number of elements in a tensor
static size_t tensor_numel(const Matrix *tensor) {
    if (tensor->rank > 0 && tensor->dims) {
        size_t n = 1;
        for (size_t i = 0; i < tensor->rank; i++) {
            n *= tensor->dims[i];
        }
        return n;
    }
    return tensor->rows * tensor->cols;
}

// Compute row-major strides for each dimension
static void tensor_compute_strides(const Matrix *tensor, size_t *strides_out) {
    if (tensor->rank == 0 || !tensor->dims) {
        strides_out[0] = 1;
        return;
    }
    size_t r = tensor->rank;
    strides_out[r - 1] = 1;
    for (ssize_t i = (ssize_t)r - 2; i >= 0; i--) {
        strides_out[i] = strides_out[i + 1] * tensor->dims[i + 1];
    }
}
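
/* Worked example (illustrative, not from the source): for a row-major
 * tensor of shape [2, 3, 4] the strides come out as [12, 4, 1], so the
 * flat offset of element (i, j, k) is i*12 + j*4 + k*1. */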

// Read a scalar value at a given flat index
static VALUE tensor_value_at(const Matrix *tensor, size_t index) {
    if (tensor->dtype == DTYPE_FLOAT64) {
        return DBL2NUM(((double *)tensor->data)[index]);
    } else if (tensor->dtype == DTYPE_FLOAT32) {
        return DBL2NUM((double)((float *)tensor->data)[index]);
    } else if (tensor->dtype == DTYPE_INT16) {
        return DBL2NUM((double)((int16_t *)tensor->data)[index]);
    } else if (tensor->dtype == DTYPE_INT8) {
        return DBL2NUM((double)((int8_t *)tensor->data)[index]);
    } else {
        rb_raise(rb_eArgError, "Unsupported data type in tensor_value_at");
    }
}

// Write a scalar value at a given flat index
static void tensor_set_value_at(Matrix *tensor, size_t index, VALUE value) {
    if (tensor->dtype == DTYPE_FLOAT64) {
        ((double *)tensor->data)[index] = NUM2DBL(value);
    } else if (tensor->dtype == DTYPE_FLOAT32) {
        ((float *)tensor->data)[index] = (float)NUM2DBL(value);
    } else if (tensor->dtype == DTYPE_INT16) {
        ((int16_t *)tensor->data)[index] = (int16_t)NUM2INT(value);
    } else if (tensor->dtype == DTYPE_INT8) {
        ((int8_t *)tensor->data)[index] = (int8_t)NUM2INT(value);
    } else {
        rb_raise(rb_eArgError, "Unsupported data type in tensor_set_value_at");
    }
}

// Get number of available threads
static size_t get_num_threads(void) {
#ifdef _OPENMP
    return omp_get_max_threads(); // Default to max threads
#else
    return 16; // Fallback value (adjust as needed)
#endif
}

// Function to allocate a new Matrix
Matrix *matrix_new(size_t rows, size_t cols, DataType dtype) {
    if (rows == 0 || cols == 0) {
        rb_raise(rb_eArgError, "Rows and columns must be greater than zero");
    }

    Matrix *matrix = malloc(sizeof(Matrix));
    if (!matrix) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for matrix");
    }
    matrix->rows = rows;
    matrix->cols = cols;
    matrix->dtype = dtype;
    matrix->rank = 2;
    matrix->dims = NULL;

    if (dtype == DTYPE_FLOAT64) {
        matrix->data = calloc(rows * cols, sizeof(double));
    } else if (dtype == DTYPE_FLOAT32) {
        matrix->data = calloc(rows * cols, sizeof(float));
    } else if (dtype == DTYPE_INT16) {
        matrix->data = calloc(rows * cols, sizeof(int16_t));
    } else if (dtype == DTYPE_INT8) {
        matrix->data = calloc(rows * cols, sizeof(int8_t));
    } else {
        free(matrix);
        rb_raise(rb_eArgError, "Unsupported data type");
    }

    if (!matrix->data) {
        free(matrix);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for matrix data");
    }

    // Initialize dims for 2D tensor (matrix)
    matrix->dims = malloc(2 * sizeof(size_t));
    if (!matrix->dims) {
        free(matrix->data);
        free(matrix);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }
    matrix->dims[0] = rows;
    matrix->dims[1] = cols;

    return matrix;
}

// Function to free a Matrix
void matrix_free(Matrix *matrix) {
    if (matrix) {
        if (matrix->data) free(matrix->data);
        if (matrix->dims) free(matrix->dims);
        free(matrix);
    }
}

// Matrix multiplication with OpenMP
Matrix *matrix_multiply(const Matrix *a, const Matrix *b) {
    if (a->cols != b->rows || a->dtype != b->dtype) {
        rb_raise(rb_eArgError, "Incompatible matrix dimensions or data types");
    }

    if (a->dtype != DTYPE_FLOAT64 && a->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "matrix_multiply supports only float32/float64 matrices");
    }

    Matrix *result = matrix_new(a->rows, b->cols, a->dtype);
    size_t num_threads = get_num_threads();

    if (a->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) collapse(2) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < b->cols; j++) {
                double sum = 0.0;
                for (size_t k = 0; k < a->cols; k++) {
                    sum += ((double *)a->data)[i * a->cols + k] *
                           ((double *)b->data)[k * b->cols + j];
                }
                ((double *)result->data)[i * result->cols + j] = sum;
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) collapse(2) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < b->cols; j++) {
                float sum = 0.0f;
                for (size_t k = 0; k < a->cols; k++) {
                    sum += ((float *)a->data)[i * a->cols + k] *
                           ((float *)b->data)[k * b->cols + j];
                }
                ((float *)result->data)[i * result->cols + j] = sum;
            }
        }
    }
    return result;
}
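
/* Usage sketch from Ruby (illustrative; assumes the extension is built and
 * loaded, e.g. via `require "tensor"`):
 *
 *   a = Tensor.from_array([[1.0, 2.0], [3.0, 4.0]], "float64")
 *   b = Tensor.from_array([[5.0, 6.0], [7.0, 8.0]], "float64")
 *   a.matmul(b).to_a  # => [[19.0, 22.0], [43.0, 50.0]]
 */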

// ReLU activation with OpenMP
void matrix_relu(Matrix *matrix) {
    size_t size = matrix->rows * matrix->cols;
    size_t num_threads = get_num_threads();

    if (matrix->dtype != DTYPE_FLOAT64 && matrix->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "relu supports only float32/float64 matrices");
    }

    if (matrix->dtype == DTYPE_FLOAT64) {
        double *data = (double *)matrix->data;
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < size; i++) {
            data[i] = data[i] > 0 ? data[i] : 0;
        }
    } else { // DTYPE_FLOAT32
        float *data = (float *)matrix->data;
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < size; i++) {
            data[i] = data[i] > 0 ? data[i] : 0;
        }
    }
}

// ReLU gradient with OpenMP
Matrix *matrix_relu_grad(const Matrix *input) {
    if (input->dtype != DTYPE_FLOAT64 && input->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "relu_grad supports only float32/float64 matrices");
    }

    Matrix *output = matrix_new(input->rows, input->cols, input->dtype);
    size_t size = input->rows * input->cols;
    size_t num_threads = get_num_threads();

    if (input->dtype == DTYPE_FLOAT64) {
        double *in_data = (double *)input->data;
        double *out_data = (double *)output->data;
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < size; i++) {
            out_data[i] = in_data[i] > 0 ? 1.0 : 0.0;
        }
    } else { // DTYPE_FLOAT32
        float *in_data = (float *)input->data;
        float *out_data = (float *)output->data;
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < size; i++) {
            out_data[i] = in_data[i] > 0 ? 1.0f : 0.0f;
        }
    }
    return output;
}
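
/* Usage sketch from Ruby (illustrative): relu mutates the receiver in
 * place and returns self, while relu_grad allocates a new tensor of 0/1
 * indicators:
 *
 *   t = Tensor.from_array([[-1.0, 2.0], [0.0, -3.0]], "float32")
 *   t.relu.to_a       # => [[0.0, 2.0], [0.0, 0.0]]
 *   t.relu_grad.to_a  # => [[0.0, 1.0], [0.0, 0.0]]  (t is already rectified here)
 */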

// Recursive helper to convert tensor to nested Ruby arrays
static VALUE tensor_to_a_recursive(const Matrix *tensor, size_t depth, size_t base_index, const size_t *strides) {
    if (tensor->rank == 0 || !tensor->dims) {
        // Treat as a flat vector
        VALUE ary = rb_ary_new();
        size_t total = tensor_numel(tensor);
        for (size_t i = 0; i < total; i++) {
            rb_ary_push(ary, tensor_value_at(tensor, i));
        }
        return ary;
    }

    size_t dim = tensor->dims[depth];
    VALUE ary = rb_ary_new_capa((long)dim);
    if (depth == tensor->rank - 1) {
        // Last dimension: return scalars
        for (size_t i = 0; i < dim; i++) {
            size_t index = base_index + i * strides[depth];
            rb_ary_push(ary, tensor_value_at(tensor, index));
        }
    } else {
        // Nested arrays
        for (size_t i = 0; i < dim; i++) {
            size_t next_base = base_index + i * strides[depth];
            rb_ary_push(ary, tensor_to_a_recursive(tensor, depth + 1, next_base, strides));
        }
    }
    return ary;
}

// Convert tensor to nested Ruby arrays
VALUE matrix_to_a(const Matrix *matrix) {
    if (matrix->rank <= 1 || !matrix->dims) {
        // 0D/1D fallback: flat array
        VALUE ary = rb_ary_new();
        size_t total = tensor_numel(matrix);
        for (size_t i = 0; i < total; i++) {
            rb_ary_push(ary, tensor_value_at(matrix, i));
        }
        return ary;
    }

    size_t *strides = malloc(matrix->rank * sizeof(size_t));
    if (!strides) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor strides");
    }
    tensor_compute_strides(matrix, strides);
    VALUE result = tensor_to_a_recursive(matrix, 0, 0, strides);
    free(strides);
    return result;
}

// Ruby method bindings
VALUE rb_tensor_class = Qnil;

// Allocate a new Matrix (Ruby allocator function)
VALUE rb_matrix_allocate(VALUE klass) {
    Matrix *matrix = malloc(sizeof(Matrix));
    if (!matrix) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for matrix");
    }
    matrix->rows = 0;
    matrix->cols = 0;
    matrix->data = NULL;
    matrix->dtype = DTYPE_FLOAT64; // Default to FLOAT64
    matrix->rank = 0;
    matrix->dims = NULL;
    return Data_Wrap_Struct(klass, NULL, matrix_free, matrix);
}

// Ruby method to initialize a Matrix
VALUE rb_matrix_initialize(int argc, VALUE *argv, VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    if (matrix->data) {
        free(matrix->data);
        matrix->data = NULL;
    }
    if (matrix->dims) {
        free(matrix->dims);
        matrix->dims = NULL;
    }

    // Parse arguments
    VALUE rows, cols, kwargs;
    rb_scan_args(argc, argv, "2:", &rows, &cols, &kwargs);

    size_t r = NUM2SIZET(rows);
    size_t c = NUM2SIZET(cols);

    if (r == 0 || c == 0) {
        rb_raise(rb_eArgError, "Rows and columns must be greater than zero");
    }

    matrix->rows = r;
    matrix->cols = c;
    matrix->rank = 2;

    // Default dtype is float32
    matrix->dtype = DTYPE_FLOAT32;

    // Parse dtype from kwargs
    if (!NIL_P(kwargs)) {
        VALUE dtype_arg = rb_hash_aref(kwargs, ID2SYM(rb_intern("dtype")));
        if (!NIL_P(dtype_arg)) {
            const char *dtype_str = StringValueCStr(dtype_arg);
            if (strcmp(dtype_str, "float64") == 0) {
                matrix->dtype = DTYPE_FLOAT64;
            } else if (strcmp(dtype_str, "float32") == 0) {
                matrix->dtype = DTYPE_FLOAT32;
            } else if (strcmp(dtype_str, "int16") == 0) {
                matrix->dtype = DTYPE_INT16;
            } else if (strcmp(dtype_str, "int8") == 0) {
                matrix->dtype = DTYPE_INT8;
            } else {
                rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
            }
        }
    }

    // Allocate memory based on dtype
    if (matrix->dtype == DTYPE_FLOAT64) {
        matrix->data = calloc(r * c, sizeof(double));
    } else if (matrix->dtype == DTYPE_FLOAT32) {
        matrix->data = calloc(r * c, sizeof(float));
    } else if (matrix->dtype == DTYPE_INT16) {
        matrix->data = calloc(r * c, sizeof(int16_t));
    } else if (matrix->dtype == DTYPE_INT8) {
        matrix->data = calloc(r * c, sizeof(int8_t));
    } else {
        rb_raise(rb_eArgError, "Unsupported data type");
    }

    if (!matrix->data) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for matrix data");
    }

    // Initialize dims for 2D tensor (matrix)
    matrix->dims = malloc(2 * sizeof(size_t));
    if (!matrix->dims) {
        free(matrix->data);
        matrix->data = NULL;
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }
    matrix->dims[0] = r;
    matrix->dims[1] = c;

    return self;
}
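
/* Usage sketch from Ruby (illustrative): elements are zero-filled via
 * calloc; note the constructor defaults to float32, unlike from_arrays
 * further below:
 *
 *   m = Tensor.new(2, 3)                    # 2x3, float32, all zeros
 *   m = Tensor.new(2, 3, dtype: "float64")  # explicit dtype (string)
 *   m.shape  # => [2, 3]
 */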

Matrix *matrix_convert_dtype(const Matrix *input, DataType new_dtype) {
    Matrix *result = matrix_new(input->rows, input->cols, new_dtype);
    size_t size = input->rows * input->cols;

    // Same-dtype conversion is just a copy; without this, none of the
    // branches below would match and the result would stay zero-filled
    if (input->dtype == new_dtype) {
        size_t elem_size =
            (new_dtype == DTYPE_FLOAT64) ? sizeof(double)  :
            (new_dtype == DTYPE_FLOAT32) ? sizeof(float)   :
            (new_dtype == DTYPE_INT16)   ? sizeof(int16_t) : sizeof(int8_t);
        memcpy(result->data, input->data, size * elem_size);
        return result;
    }

    if (input->dtype == DTYPE_FLOAT64) {
        double *in_data = (double *)input->data;
        if (new_dtype == DTYPE_FLOAT32) {
            float *out_data = (float *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (float)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT16) {
            int16_t *out_data = (int16_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int16_t)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT8) {
            int8_t *out_data = (int8_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int8_t)in_data[i];
            }
        }
    } else if (input->dtype == DTYPE_FLOAT32) {
        float *in_data = (float *)input->data;
        if (new_dtype == DTYPE_FLOAT64) {
            double *out_data = (double *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (double)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT16) {
            int16_t *out_data = (int16_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int16_t)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT8) {
            int8_t *out_data = (int8_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int8_t)in_data[i];
            }
        }
    } else if (input->dtype == DTYPE_INT16) {
        int16_t *in_data = (int16_t *)input->data;
        if (new_dtype == DTYPE_FLOAT64) {
            double *out_data = (double *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (double)in_data[i];
            }
        } else if (new_dtype == DTYPE_FLOAT32) {
            float *out_data = (float *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (float)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT8) {
            int8_t *out_data = (int8_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int8_t)in_data[i];
            }
        }
    } else if (input->dtype == DTYPE_INT8) {
        int8_t *in_data = (int8_t *)input->data;
        if (new_dtype == DTYPE_FLOAT64) {
            double *out_data = (double *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (double)in_data[i];
            }
        } else if (new_dtype == DTYPE_FLOAT32) {
            float *out_data = (float *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (float)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT16) {
            int16_t *out_data = (int16_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int16_t)in_data[i];
            }
        }
    }

    return result;
}

// Ruby method for dtype conversion
VALUE rb_matrix_convert_dtype(VALUE self, VALUE dtype_arg) {
    Matrix *input;
    Data_Get_Struct(self, Matrix, input);

    const char *dtype_str = StringValueCStr(dtype_arg);
    DataType new_dtype;
    if (strcmp(dtype_str, "float64") == 0) {
        new_dtype = DTYPE_FLOAT64;
    } else if (strcmp(dtype_str, "float32") == 0) {
        new_dtype = DTYPE_FLOAT32;
    } else if (strcmp(dtype_str, "int16") == 0) {
        new_dtype = DTYPE_INT16;
    } else if (strcmp(dtype_str, "int8") == 0) {
        new_dtype = DTYPE_INT8;
    } else {
        rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
    }

    Matrix *result = matrix_convert_dtype(input, new_dtype);
    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}
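
/* Usage sketch from Ruby (illustrative): conversion to an integer dtype
 * truncates toward zero, matching the C casts above:
 *
 *   t = Tensor.from_array([[1.9, -2.7]], "float64")
 *   t.to_dtype("int8").to_a  # => [[1.0, -2.0]] (stored as int8, read back as Numeric)
 */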

// Ruby method to access a single element of the matrix
VALUE rb_matrix_get_element(int argc, VALUE *argv, VALUE self) {
    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);

    if (tensor->rank == 0 || !tensor->dims) {
        rb_raise(rb_eArgError, "Tensor has no dimensions");
    }

    if ((size_t)argc != tensor->rank) {
        rb_raise(rb_eArgError, "Expected %zu indices, got %d", tensor->rank, argc);
    }

    size_t *strides = malloc(tensor->rank * sizeof(size_t));
    if (!strides) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor strides");
    }
    tensor_compute_strides(tensor, strides);

    size_t index = 0;
    for (size_t d = 0; d < tensor->rank; d++) {
        long idx = NUM2LONG(argv[d]);
        long dim = (long)tensor->dims[d];
        if (idx < 0) {
            idx += dim; // negative indexing from the end
        }
        if (idx < 0 || idx >= dim) {
            free(strides);
            rb_raise(rb_eArgError, "Index out of bounds");
        }
        index += (size_t)idx * strides[d];
    }

    VALUE result = tensor_value_at(tensor, index);
    free(strides);
    return result;
}

// Ruby method to set a single element of the matrix
VALUE rb_matrix_set_element(int argc, VALUE *argv, VALUE self) {
    if (argc < 1) {
        rb_raise(rb_eArgError, "Value required for assignment");
    }

    VALUE value = argv[argc - 1];
    int index_count = argc - 1;

    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);

    if (tensor->rank == 0 || !tensor->dims) {
        rb_raise(rb_eArgError, "Tensor has no dimensions");
    }

    if ((size_t)index_count != tensor->rank) {
        rb_raise(rb_eArgError, "Expected %zu indices, got %d", tensor->rank, index_count);
    }

    size_t *strides = malloc(tensor->rank * sizeof(size_t));
    if (!strides) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor strides");
    }
    tensor_compute_strides(tensor, strides);

    size_t index = 0;
    for (size_t d = 0; d < tensor->rank; d++) {
        long idx = NUM2LONG(argv[d]);
        long dim = (long)tensor->dims[d];
        if (idx < 0) {
            idx += dim;
        }
        if (idx < 0 || idx >= dim) {
            free(strides);
            rb_raise(rb_eArgError, "Index out of bounds");
        }
        index += (size_t)idx * strides[d];
    }

    tensor_set_value_at(tensor, index, value);
    free(strides);
    return Qnil;
}
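
/* Usage sketch from Ruby (illustrative): one index per dimension, with
 * Ruby-style negative indexing from the end:
 *
 *   m = Tensor.new(2, 2, dtype: "float64")
 *   m[0, 1] = 5.0
 *   m[0, 1]    # => 5.0
 *   m[-1, -1]  # => 0.0 (last row, last column)
 */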

// Ruby method for matrix multiplication
VALUE rb_matrix_multiply(VALUE self, VALUE other) {
    Matrix *a, *b;
    Data_Get_Struct(self, Matrix, a);
    Data_Get_Struct(other, Matrix, b);

    Matrix *result = matrix_multiply(a, b);
    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for matrix subtraction
VALUE rb_matrix_subtract(VALUE self, VALUE other) {
    Matrix *a, *b;
    Data_Get_Struct(self, Matrix, a);
    Data_Get_Struct(other, Matrix, b);

    if (a->rows != b->rows || a->cols != b->cols || a->dtype != b->dtype) {
        rb_raise(rb_eArgError, "Matrices must have the same dimensions and data types for subtraction");
    }

    if (a->dtype != DTYPE_FLOAT64 && a->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "subtract supports only float32/float64 matrices");
    }

    Matrix *result = matrix_new(a->rows, a->cols, a->dtype);
    size_t num_threads = get_num_threads();

    if (a->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < a->cols; j++) {
                ((double *)result->data)[i * a->cols + j] =
                    ((double *)a->data)[i * a->cols + j] -
                    ((double *)b->data)[i * a->cols + j];
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < a->cols; j++) {
                ((float *)result->data)[i * a->cols + j] =
                    ((float *)a->data)[i * a->cols + j] -
                    ((float *)b->data)[i * a->cols + j];
            }
        }
    }

    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for ReLU activation
VALUE rb_matrix_relu(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    matrix_relu(matrix);
    return self;
}

// Ruby method for ReLU gradient
VALUE rb_matrix_relu_grad(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    Matrix *result = matrix_relu_grad(matrix);
    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for transpose
VALUE rb_matrix_transpose(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    if (matrix->dtype != DTYPE_FLOAT64 && matrix->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "transpose supports only float32/float64 matrices");
    }

    Matrix *result = matrix_new(matrix->cols, matrix->rows, matrix->dtype);
    size_t num_threads = get_num_threads();

    if (matrix->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < matrix->rows; i++) {
            for (size_t j = 0; j < matrix->cols; j++) {
                ((double *)result->data)[j * result->cols + i] =
                    ((double *)matrix->data)[i * matrix->cols + j];
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < matrix->rows; i++) {
            for (size_t j = 0; j < matrix->cols; j++) {
                ((float *)result->data)[j * result->cols + i] =
                    ((float *)matrix->data)[i * matrix->cols + j];
            }
        }
    }

    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for Hadamard product
VALUE rb_matrix_hadamard(VALUE self, VALUE other) {
    Matrix *a, *b;
    Data_Get_Struct(self, Matrix, a);
    Data_Get_Struct(other, Matrix, b);

    if (a->rows != b->rows || a->cols != b->cols || a->dtype != b->dtype) {
        rb_raise(rb_eArgError, "Matrices must have the same dimensions and data types for Hadamard product");
    }

    if (a->dtype != DTYPE_FLOAT64 && a->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "hadamard supports only float32/float64 matrices");
    }

    Matrix *result = matrix_new(a->rows, a->cols, a->dtype);
    size_t num_threads = get_num_threads();

    if (a->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < a->cols; j++) {
                ((double *)result->data)[i * a->cols + j] =
                    ((double *)a->data)[i * a->cols + j] *
                    ((double *)b->data)[i * a->cols + j];
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < a->cols; j++) {
                ((float *)result->data)[i * a->cols + j] =
                    ((float *)a->data)[i * a->cols + j] *
                    ((float *)b->data)[i * a->cols + j];
            }
        }
    }

    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for scalar multiplication
VALUE rb_matrix_scale(VALUE self, VALUE scalar) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    if (matrix->dtype != DTYPE_FLOAT64 && matrix->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "scale supports only float32/float64 matrices");
    }

    double s = NUM2DBL(scalar);
    Matrix *result = matrix_new(matrix->rows, matrix->cols, matrix->dtype);
    size_t num_threads = get_num_threads();

    if (matrix->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < matrix->rows; i++) {
            for (size_t j = 0; j < matrix->cols; j++) {
                ((double *)result->data)[i * matrix->cols + j] =
                    ((double *)matrix->data)[i * matrix->cols + j] * s;
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < matrix->rows; i++) {
            for (size_t j = 0; j < matrix->cols; j++) {
                ((float *)result->data)[i * matrix->cols + j] =
                    ((float *)matrix->data)[i * matrix->cols + j] * (float)s;
            }
        }
    }

    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}
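
/* Usage sketch from Ruby (illustrative): subtract and hadamard are
 * elementwise and require matching shape and dtype; scale takes a scalar:
 *
 *   a = Tensor.from_array([[4.0, 9.0]], "float64")
 *   b = Tensor.from_array([[1.0, 2.0]], "float64")
 *   a.subtract(b).to_a  # => [[3.0, 7.0]]
 *   a.hadamard(b).to_a  # => [[4.0, 18.0]]
 *   a.scale(0.5).to_a   # => [[2.0, 4.5]]
 *   a.transpose.shape   # => [2, 1]
 */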

// Ruby method to convert the tensor to nested Ruby arrays
VALUE rb_matrix_to_a(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);
    return matrix_to_a(matrix);
}

// Ruby methods for tensor metadata
VALUE rb_tensor_shape(VALUE self) {
    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);

    if (tensor->rank == 0 || !tensor->dims) {
        return rb_ary_new();
    }

    VALUE ary = rb_ary_new_capa((long)tensor->rank);
    for (size_t i = 0; i < tensor->rank; i++) {
        rb_ary_push(ary, SIZET2NUM(tensor->dims[i]));
    }
    return ary;
}

VALUE rb_tensor_rank(VALUE self) {
    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);
    return SIZET2NUM(tensor->rank);
}

VALUE rb_tensor_size(VALUE self) {
    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);
    return SIZET2NUM(tensor_numel(tensor));
}

// Ruby method to extract a specific row from the matrix
VALUE rb_matrix_row(VALUE self, VALUE row_index) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);
    size_t r = NUM2SIZET(row_index);

    if (r >= matrix->rows) {
        rb_raise(rb_eArgError, "Row index out of bounds");
    }

    VALUE row = rb_ary_new();
    for (size_t j = 0; j < matrix->cols; j++) {
        size_t index = r * matrix->cols + j;
        if (matrix->dtype == DTYPE_FLOAT64) {
            rb_ary_push(row, DBL2NUM(((double *)matrix->data)[index]));
        } else if (matrix->dtype == DTYPE_FLOAT32) {
            rb_ary_push(row, DBL2NUM((double)((float *)matrix->data)[index]));
        } else if (matrix->dtype == DTYPE_INT16) {
            rb_ary_push(row, DBL2NUM((double)((int16_t *)matrix->data)[index]));
        } else if (matrix->dtype == DTYPE_INT8) {
            rb_ary_push(row, DBL2NUM((double)((int8_t *)matrix->data)[index]));
        } else {
            rb_raise(rb_eArgError, "Unsupported data type in row");
        }
    }
    return row;
}

// Ruby method to get the number of rows
VALUE rb_matrix_row_count(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);
    return SIZET2NUM(matrix->rows);
}

// Ruby method to get the number of columns
VALUE rb_matrix_column_count(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);
    return SIZET2NUM(matrix->cols);
}

// Create a matrix from a 2D array (Ruby Array of Arrays)
Matrix *matrix_from_arrays(VALUE arrays, DataType dtype) {
    Check_Type(arrays, T_ARRAY);
    size_t rows = RARRAY_LEN(arrays);

    if (rows == 0) {
        rb_raise(rb_eArgError, "arrays must contain at least one row");
    }

    VALUE first_row = rb_ary_entry(arrays, 0);
    Check_Type(first_row, T_ARRAY);
    size_t cols = RARRAY_LEN(first_row);

    if (cols == 0) {
        rb_raise(rb_eArgError, "arrays must contain at least one column");
    }

    if (dtype != DTYPE_FLOAT64 && dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "from_arrays currently supports only float32/float64 dtypes");
    }

    // Allocate a new matrix with the specified data type
    Matrix *matrix = matrix_new(rows, cols, dtype);

    // Populate the matrix with data from the Ruby array
    for (size_t i = 0; i < rows; i++) {
        VALUE row = rb_ary_entry(arrays, i);
        Check_Type(row, T_ARRAY);
        if (RARRAY_LEN(row) != cols) {
            rb_raise(rb_eArgError, "all rows must have the same length");
        }
        for (size_t j = 0; j < cols; j++) {
            VALUE elem = rb_ary_entry(row, j);
            if (dtype == DTYPE_FLOAT64) {
                ((double *)matrix->data)[i * cols + j] = NUM2DBL(elem);
            } else { // DTYPE_FLOAT32
                ((float *)matrix->data)[i * cols + j] = (float)NUM2DBL(elem);
            }
        }
    }
    return matrix;
}

// Ruby class method to create a matrix from a 2D array
VALUE rb_matrix_from_arrays(int argc, VALUE *argv, VALUE klass) {
    VALUE arrays, dtype;
    rb_scan_args(argc, argv, "11", &arrays, &dtype); // 1 required argument, 1 optional

    // Unlike the string dtypes used elsewhere, this method takes a numeric
    // width: 32 selects float32; nil or any other value selects float64
    DataType dt = (NIL_P(dtype)) ? DTYPE_FLOAT64 : (NUM2INT(dtype) == 32) ? DTYPE_FLOAT32 : DTYPE_FLOAT64;

    // Create the matrix and wrap it in a Ruby object
    Matrix *matrix = matrix_from_arrays(arrays, dt);
    return Data_Wrap_Struct(klass, NULL, matrix_free, matrix);
}
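
/* Usage sketch from Ruby (illustrative):
 *
 *   m = Tensor.from_arrays([[1.0, 2.0], [3.0, 4.0]])       # float64
 *   m32 = Tensor.from_arrays([[1.0, 2.0], [3.0, 4.0]], 32) # float32
 *   m.row(0)  # => [1.0, 2.0]
 */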

// Helper to allocate a tensor with arbitrary shape
static Matrix *tensor_new_with_shape(size_t rank, const size_t *dims, DataType dtype) {
    if (rank == 0) {
        rb_raise(rb_eArgError, "Shape must have at least one dimension");
    }

    size_t total = 1;
    for (size_t i = 0; i < rank; i++) {
        if (dims[i] == 0) {
            rb_raise(rb_eArgError, "All tensor dimensions must be greater than zero");
        }
        total *= dims[i];
    }

    Matrix *tensor = malloc(sizeof(Matrix));
    if (!tensor) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor");
    }

    tensor->rank = rank;
    tensor->dims = malloc(rank * sizeof(size_t));
    if (!tensor->dims) {
        free(tensor);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }
    for (size_t i = 0; i < rank; i++) {
        tensor->dims[i] = dims[i];
    }

    // For compatibility with 2D code, set rows/cols so that rows * cols == total elements
    if (rank == 1) {
        tensor->rows = dims[0];
        tensor->cols = 1;
    } else {
        tensor->rows = dims[0];
        tensor->cols = total / tensor->rows;
    }

    tensor->dtype = dtype;

    if (dtype == DTYPE_FLOAT64) {
        tensor->data = calloc(total, sizeof(double));
    } else if (dtype == DTYPE_FLOAT32) {
        tensor->data = calloc(total, sizeof(float));
    } else if (dtype == DTYPE_INT16) {
        tensor->data = calloc(total, sizeof(int16_t));
    } else if (dtype == DTYPE_INT8) {
        tensor->data = calloc(total, sizeof(int8_t));
    } else {
        free(tensor->dims);
        free(tensor);
        rb_raise(rb_eArgError, "Unsupported data type");
    }

    if (!tensor->data) {
        free(tensor->dims);
        free(tensor);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor data");
    }

    return tensor;
}

// Infer tensor shape from a nested Ruby array
static void infer_shape(VALUE array, size_t *rank_out, size_t **dims_out) {
    VALUE current = array;
    size_t rank = 0;

    // First pass: follow first elements to determine rank and dimensions
    while (RB_TYPE_P(current, T_ARRAY)) {
        long len = RARRAY_LEN(current);
        if (len == 0) {
            rb_raise(rb_eArgError, "All tensor dimensions must be greater than zero");
        }
        rank++;
        current = rb_ary_entry(current, 0);
    }

    size_t *dims = malloc(rank * sizeof(size_t));
    if (!dims) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }

    current = array;
    for (size_t i = 0; i < rank; i++) {
        Check_Type(current, T_ARRAY);
        long len = RARRAY_LEN(current);
        if (len <= 0) {
            free(dims);
            rb_raise(rb_eArgError, "All tensor dimensions must be greater than zero");
        }
        dims[i] = (size_t)len;
        current = rb_ary_entry(current, 0);
    }

    *rank_out = rank;
    *dims_out = dims;
}

// Recursive helper to fill a tensor from a nested Ruby array
static void tensor_fill_from_array(VALUE value, Matrix *tensor, size_t depth, size_t base_index, const size_t *strides) {
    Check_Type(value, T_ARRAY);
    if ((size_t)RARRAY_LEN(value) != tensor->dims[depth]) {
        rb_raise(rb_eArgError, "All inner arrays must have the same length");
    }

    if (depth == tensor->rank - 1) {
        // Last dimension: store scalars
        for (size_t i = 0; i < tensor->dims[depth]; i++) {
            VALUE elem = rb_ary_entry(value, (long)i);
            size_t index = base_index + i * strides[depth];
            tensor_set_value_at(tensor, index, elem);
        }
    } else {
        // Recurse into nested arrays
        for (size_t i = 0; i < tensor->dims[depth]; i++) {
            VALUE sub = rb_ary_entry(value, (long)i);
            size_t next_base = base_index + i * strides[depth];
            tensor_fill_from_array(sub, tensor, depth + 1, next_base, strides);
        }
    }
}

// Ruby class method to create a tensor from a nested Ruby array (N-D)
VALUE rb_tensor_from_array(int argc, VALUE *argv, VALUE klass) {
    VALUE array, dtype_arg;
    rb_scan_args(argc, argv, "11", &array, &dtype_arg);

    Check_Type(array, T_ARRAY);

    const char *dtype_str = NIL_P(dtype_arg) ? "float32" : StringValueCStr(dtype_arg);
    DataType dtype;
    if (strcmp(dtype_str, "float64") == 0) {
        dtype = DTYPE_FLOAT64;
    } else if (strcmp(dtype_str, "float32") == 0) {
        dtype = DTYPE_FLOAT32;
    } else if (strcmp(dtype_str, "int16") == 0) {
        dtype = DTYPE_INT16;
    } else if (strcmp(dtype_str, "int8") == 0) {
        dtype = DTYPE_INT8;
    } else {
        rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
    }

    size_t rank;
    size_t *dims;
    infer_shape(array, &rank, &dims);

    Matrix *tensor = tensor_new_with_shape(rank, dims, dtype);
    size_t *strides = malloc(rank * sizeof(size_t));
    if (!strides) {
        free(dims);
        matrix_free(tensor);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor strides");
    }
    tensor_compute_strides(tensor, strides);

    tensor_fill_from_array(array, tensor, 0, 0, strides);

    free(dims);
    free(strides);

    return Data_Wrap_Struct(klass, NULL, matrix_free, tensor);
}
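
/* Usage sketch from Ruby (illustrative): shape is inferred from the
 * nesting, so a 2x2x2 nested array yields a rank-3 tensor:
 *
 *   t = Tensor.from_array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], "int16")
 *   t.rank      # => 3
 *   t.shape     # => [2, 2, 2]
 *   t.size      # => 8
 *   t[1, 0, 1]  # => 6.0
 */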

// Softmax computation for a matrix
Matrix *matrix_softmax(const Matrix *input) {
    if (input->dtype != DTYPE_FLOAT64 && input->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "softmax supports only float32/float64 matrices");
    }

    // Create a result matrix with the same dimensions as the input
    Matrix *result = matrix_new(input->rows, input->cols, input->dtype);

    if (input->dtype == DTYPE_FLOAT64) {
        for (size_t i = 0; i < input->rows; i++) {
            // Find the maximum value in the row for numerical stability
            double max_val = -INFINITY;
            for (size_t j = 0; j < input->cols; j++) {
                double val = ((double *)input->data)[i * input->cols + j];
                if (val > max_val) max_val = val;
            }

            // Compute the sum of exponentials
            double sum = 0.0;
            for (size_t j = 0; j < input->cols; j++) {
                double exp_val = exp(((double *)input->data)[i * input->cols + j] - max_val);
                sum += exp_val;
                ((double *)result->data)[i * input->cols + j] = exp_val;
            }

            // Normalize by dividing each element by the sum
            for (size_t j = 0; j < input->cols; j++) {
                ((double *)result->data)[i * input->cols + j] /= sum;
            }
        }
    } else { // DTYPE_FLOAT32
        for (size_t i = 0; i < input->rows; i++) {
            // Find the maximum value in the row for numerical stability
            float max_val = -INFINITY;
            for (size_t j = 0; j < input->cols; j++) {
                float val = ((float *)input->data)[i * input->cols + j];
                if (val > max_val) max_val = val;
            }

            // Compute the sum of exponentials
            float sum = 0.0f;
            for (size_t j = 0; j < input->cols; j++) {
                float exp_val = expf(((float *)input->data)[i * input->cols + j] - max_val);
                sum += exp_val;
                ((float *)result->data)[i * input->cols + j] = exp_val;
            }

            // Normalize by dividing each element by the sum
            for (size_t j = 0; j < input->cols; j++) {
                ((float *)result->data)[i * input->cols + j] /= sum;
            }
        }
    }

    return result;
}
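
/* Usage sketch from Ruby (illustrative): softmax is applied row by row,
 * so each row of the result sums to 1:
 *
 *   logits = Tensor.from_array([[1.0, 1.0, 1.0], [0.0, 0.0, 100.0]], "float64")
 *   probs = logits.softmax
 *   probs.row(0)  # => [0.333.., 0.333.., 0.333..]
 *   probs.row(1)  # => [~0.0, ~0.0, ~1.0]
 */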

// Cross-entropy loss and gradient computation
typedef struct {
    double loss;
    Matrix *gradient;
} LossGradient;

LossGradient *matrix_cross_entropy_loss(const Matrix *probs, const int *labels, size_t batch_size) {
    if (probs->dtype != DTYPE_FLOAT64 && probs->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "cross_entropy_loss supports only float32/float64 matrices");
    }

    // Validate labels up front: raising a Ruby exception from inside an
    // OpenMP worker thread is unsafe, so the parallel loops below may
    // assume every label is in range
    for (size_t i = 0; i < batch_size; i++) {
        if (labels[i] < 0 || (size_t)labels[i] >= probs->cols) {
            rb_raise(rb_eArgError, "Label index out of bounds");
        }
    }

    LossGradient *result = malloc(sizeof(LossGradient));
    if (!result) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for LossGradient");
    }

    result->gradient = matrix_new(probs->rows, probs->cols, probs->dtype);
    double loss = 0.0;

    if (probs->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) reduction(+:loss) num_threads(get_num_threads())
        for (size_t i = 0; i < batch_size; i++) {
            double prob = ((double *)probs->data)[i * probs->cols + labels[i]];
            loss -= log(fmax(prob, 1e-7)); // Clamp with a small epsilon to avoid log(0)

            for (size_t j = 0; j < probs->cols; j++) {
                double grad_val = ((double *)probs->data)[i * probs->cols + j];
                if ((int)j == labels[i]) grad_val -= 1.0;
                ((double *)result->gradient->data)[i * probs->cols + j] = grad_val / (double)batch_size;
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) reduction(+:loss) num_threads(get_num_threads())
        for (size_t i = 0; i < batch_size; i++) {
            float prob = ((float *)probs->data)[i * probs->cols + labels[i]];
            loss -= logf(fmaxf(prob, 1e-7f)); // Clamp with a small epsilon to avoid log(0)

            for (size_t j = 0; j < probs->cols; j++) {
                float grad_val = ((float *)probs->data)[i * probs->cols + j];
                if ((int)j == labels[i]) grad_val -= 1.0f;
                ((float *)result->gradient->data)[i * probs->cols + j] = grad_val / (float)batch_size;
            }
        }
    }

    result->loss = loss / (double)batch_size;
    return result;
}
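
/* The stored gradient is (p_ij - [j == y_i]) / N, i.e. the derivative of
 * the mean cross-entropy with respect to the logits when `probs` is a
 * softmax output, which is how the wrapper below is meant to be used. */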

VALUE rb_matrix_cross_entropy_loss(VALUE self, VALUE labels) {
    Matrix *probs;
    Data_Get_Struct(self, Matrix, probs);

    Check_Type(labels, T_ARRAY);
    size_t batch_size = RARRAY_LEN(labels);

    if (batch_size != probs->rows) {
        rb_raise(rb_eArgError, "labels size must match number of rows in probabilities");
    }
    int *c_labels = malloc(batch_size * sizeof(int));
    if (!c_labels) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for labels");
    }

    for (size_t i = 0; i < batch_size; i++) {
        c_labels[i] = NUM2INT(rb_ary_entry(labels, i));
    }

    LossGradient *result = matrix_cross_entropy_loss(probs, c_labels, batch_size);
    free(c_labels);

    // Return an array containing [loss, gradient]
    VALUE ret = rb_ary_new2(2);
    rb_ary_store(ret, 0, DBL2NUM(result->loss));
    rb_ary_store(ret, 1, Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result->gradient));
    free(result);

    return ret;
}
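
/* Usage sketch from Ruby (illustrative): labels are class indices, one per
 * row of the (softmaxed) probability matrix:
 *
 *   probs = logits.softmax
 *   loss, grad = probs.cross_entropy_loss([0, 2, 1])
 *   loss        # => Float (mean negative log-likelihood)
 *   grad.shape  # => same as probs.shape
 */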

// Ruby method for softmax
VALUE rb_matrix_softmax(VALUE self) {
    Matrix *input;
    Data_Get_Struct(self, Matrix, input);

    Matrix *result = matrix_softmax(input);
    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby class method to build a matrix using a block
VALUE rb_matrix_build(int argc, VALUE *argv, VALUE klass) {
    VALUE rows, cols, kwargs;
    rb_scan_args(argc, argv, "2:", &rows, &cols, &kwargs);

    size_t r = NUM2SIZET(rows);
    size_t c = NUM2SIZET(cols);

    if (r == 0 || c == 0) {
        rb_raise(rb_eArgError, "Rows and columns must be greater than zero");
    }

    if (!rb_block_given_p()) {
        rb_raise(rb_eArgError, "Block is required for building the matrix");
    }

    // Default dtype is FLOAT64
    DataType dtype = DTYPE_FLOAT64;

    // Parse dtype from kwargs if provided
    if (!NIL_P(kwargs)) {
        VALUE dtype_arg = rb_hash_aref(kwargs, ID2SYM(rb_intern("dtype")));
        if (!NIL_P(dtype_arg)) {
            const char *dtype_str = StringValueCStr(dtype_arg);
            if (strcmp(dtype_str, "float64") == 0) {
                dtype = DTYPE_FLOAT64;
            } else if (strcmp(dtype_str, "float32") == 0) {
                dtype = DTYPE_FLOAT32;
            } else if (strcmp(dtype_str, "int16") == 0) {
                dtype = DTYPE_INT16;
            } else if (strcmp(dtype_str, "int8") == 0) {
                dtype = DTYPE_INT8;
            } else {
                rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
            }
        }
    }

    // Create a new matrix with the specified data type
    Matrix *matrix = matrix_new(r, c, dtype);

    // Populate the matrix using the block
    for (size_t i = 0; i < r; i++) {
        for (size_t j = 0; j < c; j++) {
            VALUE args[2] = {SIZET2NUM(i), SIZET2NUM(j)};
            VALUE result = rb_yield_values2(2, args);

            if (NIL_P(result)) {
                matrix_free(matrix);
                rb_raise(rb_eArgError, "Block must return a numeric value");
            }

            if (matrix->dtype == DTYPE_FLOAT64) {
                ((double *)matrix->data)[i * c + j] = NUM2DBL(result);
            } else if (matrix->dtype == DTYPE_FLOAT32) {
                ((float *)matrix->data)[i * c + j] = (float)NUM2DBL(result);
            } else if (matrix->dtype == DTYPE_INT16) {
                ((int16_t *)matrix->data)[i * c + j] = (int16_t)NUM2INT(result);
            } else if (matrix->dtype == DTYPE_INT8) {
                ((int8_t *)matrix->data)[i * c + j] = (int8_t)NUM2INT(result);
            }
        }
    }

    return Data_Wrap_Struct(klass, NULL, matrix_free, matrix);
}
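
/* Usage sketch from Ruby (illustrative): the block receives (row, column)
 * and its return value becomes that element; dtype defaults to float64 here:
 *
 *   identity = Tensor.build(3, 3) { |i, j| i == j ? 1.0 : 0.0 }
 *   ramp = Tensor.build(2, 4, dtype: "int16") { |i, j| i * 4 + j }
 */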

// Ruby class method: Tensor.zeros(shape:, dtype:)
VALUE rb_tensor_zeros(int argc, VALUE *argv, VALUE klass) {
    VALUE kwargs;
    rb_scan_args(argc, argv, "0:", &kwargs);

    if (NIL_P(kwargs)) {
        rb_raise(rb_eArgError, "Keyword arguments are required");
    }

    VALUE shape_val = rb_hash_aref(kwargs, ID2SYM(rb_intern("shape")));
    if (NIL_P(shape_val)) {
        rb_raise(rb_eArgError, "Missing keyword: shape");
    }
    Check_Type(shape_val, T_ARRAY);

    VALUE dtype_val = rb_hash_aref(kwargs, ID2SYM(rb_intern("dtype")));
    const char *dtype_str = NIL_P(dtype_val) ? "float32" : StringValueCStr(dtype_val);

    DataType dtype;
    if (strcmp(dtype_str, "float64") == 0) {
        dtype = DTYPE_FLOAT64;
    } else if (strcmp(dtype_str, "float32") == 0) {
        dtype = DTYPE_FLOAT32;
    } else if (strcmp(dtype_str, "int16") == 0) {
        dtype = DTYPE_INT16;
    } else if (strcmp(dtype_str, "int8") == 0) {
        dtype = DTYPE_INT8;
    } else {
        rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
    }

    size_t rank = (size_t)RARRAY_LEN(shape_val);
    if (rank == 0) {
        rb_raise(rb_eArgError, "Shape must have at least one dimension");
    }

    size_t *dims = malloc(rank * sizeof(size_t));
    if (!dims) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }

    for (size_t i = 0; i < rank; i++) {
        VALUE dim_val = rb_ary_entry(shape_val, (long)i);
        size_t dim = NUM2SIZET(dim_val);
        if (dim == 0) {
            free(dims);
            rb_raise(rb_eArgError, "All tensor dimensions must be greater than zero");
        }
        dims[i] = dim;
    }

    Matrix *tensor = tensor_new_with_shape(rank, dims, dtype);
    free(dims);

    return Data_Wrap_Struct(klass, NULL, matrix_free, tensor);
}
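
/* Usage sketch from Ruby (illustrative): shape is a required keyword,
 * dtype defaults to float32:
 *
 *   t = Tensor.zeros(shape: [2, 3, 4], dtype: "float64")
 *   t.shape  # => [2, 3, 4]
 *   t.size   # => 24
 */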

void Init_tensor_ext(void) {
    // Define the Tensor class (and keep Matrix as an alias)
    rb_tensor_class = rb_define_class("Tensor", rb_cObject);
    rb_define_const(rb_cObject, "Matrix", rb_tensor_class);
    rb_define_alloc_func(rb_tensor_class, rb_matrix_allocate);

    // Class Methods
    rb_define_singleton_method(rb_tensor_class, "from_arrays", rb_matrix_from_arrays, -1);
    rb_define_singleton_method(rb_tensor_class, "build", rb_matrix_build, -1);
    rb_define_singleton_method(rb_tensor_class, "zeros", rb_tensor_zeros, -1);
    rb_define_singleton_method(rb_tensor_class, "from_array", rb_tensor_from_array, -1);

    // Instance Methods
    rb_define_method(rb_tensor_class, "initialize", rb_matrix_initialize, -1);
    rb_define_method(rb_tensor_class, "multiply", rb_matrix_multiply, 1);
    rb_define_method(rb_tensor_class, "subtract", rb_matrix_subtract, 1);
    rb_define_method(rb_tensor_class, "relu", rb_matrix_relu, 0);
    rb_define_method(rb_tensor_class, "relu_grad", rb_matrix_relu_grad, 0);
    rb_define_method(rb_tensor_class, "transpose", rb_matrix_transpose, 0);
    rb_define_method(rb_tensor_class, "hadamard", rb_matrix_hadamard, 1);
    rb_define_method(rb_tensor_class, "scale", rb_matrix_scale, 1);
    rb_define_method(rb_tensor_class, "to_a", rb_matrix_to_a, 0);
    rb_define_method(rb_tensor_class, "shape", rb_tensor_shape, 0);
    rb_define_method(rb_tensor_class, "rank", rb_tensor_rank, 0);
    rb_define_method(rb_tensor_class, "size", rb_tensor_size, 0);
    rb_define_method(rb_tensor_class, "row_count", rb_matrix_row_count, 0);
    rb_define_method(rb_tensor_class, "column_count", rb_matrix_column_count, 0);
    rb_define_method(rb_tensor_class, "row", rb_matrix_row, 1);
    rb_define_method(rb_tensor_class, "softmax", rb_matrix_softmax, 0);
    rb_define_method(rb_tensor_class, "cross_entropy_loss", rb_matrix_cross_entropy_loss, 1);
    rb_define_method(rb_tensor_class, "to_dtype", rb_matrix_convert_dtype, 1);
    rb_define_method(rb_tensor_class, "[]", rb_matrix_get_element, -1);   // Element getter
    rb_define_method(rb_tensor_class, "[]=", rb_matrix_set_element, -1); // Element setter

    // Alias Methods
    rb_define_alias(rb_tensor_class, "matmul", "multiply");
}
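
/* End-to-end sketch from Ruby (illustrative; the layer shapes and variable
 * names are made up for the example, not taken from the gem):
 *
 *   x = Tensor.from_array([[0.5, -0.2]], "float32")
 *   w = Tensor.zeros(shape: [2, 3])          # float32 by default
 *   h = x.matmul(w).relu                     # forward pass
 *   loss, grad = h.softmax.cross_entropy_loss([1])
 *   dw = x.transpose.matmul(grad)            # gradient for w
 *   w = w.subtract(dw.scale(0.1))            # SGD step
 */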