tensor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6)
  1. checksums.yaml +7 -0
  2. data/AGENTS.md +38 -0
  3. data/extconf.rb +148 -0
  4. data/lib/tensor.rb +6 -0
  5. data/ruby_matrix.c +1367 -0
  6. metadata +46 -0
data/ruby_matrix.c ADDED
@@ -0,0 +1,1367 @@
#include <ruby.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#ifdef _OPENMP
#include <omp.h>
#endif

// Define data types
typedef enum {
    DTYPE_FLOAT64,
    DTYPE_FLOAT32,
    DTYPE_INT16,
    DTYPE_INT8
} DataType;

// Define a Matrix/Tensor struct (order-2 today, extensible to higher orders)
typedef struct {
    size_t rows;
    size_t cols;
    void *data;     // Pointer to data (numeric storage)
    DataType dtype; // Data type of the tensor
    size_t rank;    // Tensor rank (2 for matrices)
    size_t *dims;   // Dimensions array of length `rank` (optional today)
} Matrix;

// Compute total number of elements in a tensor
static size_t tensor_numel(const Matrix *tensor) {
    if (tensor->rank > 0 && tensor->dims) {
        size_t n = 1;
        for (size_t i = 0; i < tensor->rank; i++) {
            n *= tensor->dims[i];
        }
        return n;
    }
    return tensor->rows * tensor->cols;
}

// Compute row-major strides for each dimension
static void tensor_compute_strides(const Matrix *tensor, size_t *strides_out) {
    if (tensor->rank == 0 || !tensor->dims) {
        strides_out[0] = 1;
        return;
    }
    size_t r = tensor->rank;
    strides_out[r - 1] = 1;
    for (ssize_t i = (ssize_t)r - 2; i >= 0; i--) {
        strides_out[i] = strides_out[i + 1] * tensor->dims[i + 1];
    }
}
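
/* Worked example (illustrative, not from the source): for a row-major
 * tensor of shape [2, 3, 4] the strides come out as [12, 4, 1], so the
 * flat offset of element (i, j, k) is i*12 + j*4 + k*1. */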

// Read a scalar value at a given flat index
static VALUE tensor_value_at(const Matrix *tensor, size_t index) {
    if (tensor->dtype == DTYPE_FLOAT64) {
        return DBL2NUM(((double *)tensor->data)[index]);
    } else if (tensor->dtype == DTYPE_FLOAT32) {
        return DBL2NUM((double)((float *)tensor->data)[index]);
    } else if (tensor->dtype == DTYPE_INT16) {
        return DBL2NUM((double)((int16_t *)tensor->data)[index]);
    } else if (tensor->dtype == DTYPE_INT8) {
        return DBL2NUM((double)((int8_t *)tensor->data)[index]);
    } else {
        rb_raise(rb_eArgError, "Unsupported data type in tensor_value_at");
    }
}

// Write a scalar value at a given flat index
static void tensor_set_value_at(Matrix *tensor, size_t index, VALUE value) {
    if (tensor->dtype == DTYPE_FLOAT64) {
        ((double *)tensor->data)[index] = NUM2DBL(value);
    } else if (tensor->dtype == DTYPE_FLOAT32) {
        ((float *)tensor->data)[index] = (float)NUM2DBL(value);
    } else if (tensor->dtype == DTYPE_INT16) {
        ((int16_t *)tensor->data)[index] = (int16_t)NUM2INT(value);
    } else if (tensor->dtype == DTYPE_INT8) {
        ((int8_t *)tensor->data)[index] = (int8_t)NUM2INT(value);
    } else {
        rb_raise(rb_eArgError, "Unsupported data type in tensor_set_value_at");
    }
}

// Get number of available threads
static size_t get_num_threads(void) {
#ifdef _OPENMP
    return omp_get_max_threads(); // Default to max threads
#else
    return 16; // Fallback value (adjust as needed)
#endif
}

// Function to allocate a new Matrix
Matrix *matrix_new(size_t rows, size_t cols, DataType dtype) {
    if (rows == 0 || cols == 0) {
        rb_raise(rb_eArgError, "Rows and columns must be greater than zero");
    }

    Matrix *matrix = malloc(sizeof(Matrix));
    if (!matrix) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for matrix");
    }
    matrix->rows = rows;
    matrix->cols = cols;
    matrix->dtype = dtype;
    matrix->rank = 2;
    matrix->dims = NULL;

    if (dtype == DTYPE_FLOAT64) {
        matrix->data = calloc(rows * cols, sizeof(double));
    } else if (dtype == DTYPE_FLOAT32) {
        matrix->data = calloc(rows * cols, sizeof(float));
    } else if (dtype == DTYPE_INT16) {
        matrix->data = calloc(rows * cols, sizeof(int16_t));
    } else if (dtype == DTYPE_INT8) {
        matrix->data = calloc(rows * cols, sizeof(int8_t));
    } else {
        free(matrix);
        rb_raise(rb_eArgError, "Unsupported data type");
    }

    if (!matrix->data) {
        free(matrix);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for matrix data");
    }

    // Initialize dims for 2D tensor (matrix)
    matrix->dims = malloc(2 * sizeof(size_t));
    if (!matrix->dims) {
        free(matrix->data);
        free(matrix);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }
    matrix->dims[0] = rows;
    matrix->dims[1] = cols;

    return matrix;
}

// Function to free a Matrix
void matrix_free(Matrix *matrix) {
    if (matrix) {
        if (matrix->data) free(matrix->data);
        if (matrix->dims) free(matrix->dims);
        free(matrix);
    }
}

// Matrix multiplication with OpenMP
Matrix *matrix_multiply(const Matrix *a, const Matrix *b) {
    if (a->cols != b->rows || a->dtype != b->dtype) {
        rb_raise(rb_eArgError, "Incompatible matrix dimensions or data types");
    }

    if (a->dtype != DTYPE_FLOAT64 && a->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "matrix_multiply supports only float32/float64 matrices");
    }

    Matrix *result = matrix_new(a->rows, b->cols, a->dtype);
    size_t num_threads = get_num_threads();

    if (a->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) collapse(2) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < b->cols; j++) {
                double sum = 0.0;
                for (size_t k = 0; k < a->cols; k++) {
                    sum += ((double *)a->data)[i * a->cols + k] *
                           ((double *)b->data)[k * b->cols + j];
                }
                ((double *)result->data)[i * result->cols + j] = sum;
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) collapse(2) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < b->cols; j++) {
                float sum = 0.0f;
                for (size_t k = 0; k < a->cols; k++) {
                    sum += ((float *)a->data)[i * a->cols + k] *
                           ((float *)b->data)[k * b->cols + j];
                }
                ((float *)result->data)[i * result->cols + j] = sum;
            }
        }
    }
    return result;
}
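
/* Usage sketch from Ruby (illustrative; assumes the extension is built and
 * loaded, e.g. via `require "tensor"`):
 *
 *   a = Tensor.from_array([[1.0, 2.0], [3.0, 4.0]], "float64")
 *   b = Tensor.from_array([[5.0, 6.0], [7.0, 8.0]], "float64")
 *   a.matmul(b).to_a  # => [[19.0, 22.0], [43.0, 50.0]]
 */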

// ReLU activation with OpenMP
void matrix_relu(Matrix *matrix) {
    size_t size = matrix->rows * matrix->cols;
    size_t num_threads = get_num_threads();

    if (matrix->dtype != DTYPE_FLOAT64 && matrix->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "relu supports only float32/float64 matrices");
    }

    if (matrix->dtype == DTYPE_FLOAT64) {
        double *data = (double *)matrix->data;
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < size; i++) {
            data[i] = data[i] > 0 ? data[i] : 0;
        }
    } else { // DTYPE_FLOAT32
        float *data = (float *)matrix->data;
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < size; i++) {
            data[i] = data[i] > 0 ? data[i] : 0;
        }
    }
}

// ReLU gradient with OpenMP
Matrix *matrix_relu_grad(const Matrix *input) {
    if (input->dtype != DTYPE_FLOAT64 && input->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "relu_grad supports only float32/float64 matrices");
    }

    Matrix *output = matrix_new(input->rows, input->cols, input->dtype);
    size_t size = input->rows * input->cols;
    size_t num_threads = get_num_threads();

    if (input->dtype == DTYPE_FLOAT64) {
        double *in_data = (double *)input->data;
        double *out_data = (double *)output->data;
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < size; i++) {
            out_data[i] = in_data[i] > 0 ? 1.0 : 0.0;
        }
    } else { // DTYPE_FLOAT32
        float *in_data = (float *)input->data;
        float *out_data = (float *)output->data;
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < size; i++) {
            out_data[i] = in_data[i] > 0 ? 1.0f : 0.0f;
        }
    }
    return output;
}
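
/* Usage sketch from Ruby (illustrative): relu mutates the receiver in
 * place and returns self, while relu_grad allocates a new tensor of 0/1
 * indicators:
 *
 *   t = Tensor.from_array([[-1.0, 2.0], [0.0, -3.0]], "float32")
 *   t.relu.to_a       # => [[0.0, 2.0], [0.0, 0.0]]
 *   t.relu_grad.to_a  # => [[0.0, 1.0], [0.0, 0.0]]  (t is already rectified here)
 */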

// Recursive helper to convert tensor to nested Ruby arrays
static VALUE tensor_to_a_recursive(const Matrix *tensor, size_t depth, size_t base_index, const size_t *strides) {
    if (tensor->rank == 0 || !tensor->dims) {
        // Treat as a flat vector
        VALUE ary = rb_ary_new();
        size_t total = tensor_numel(tensor);
        for (size_t i = 0; i < total; i++) {
            rb_ary_push(ary, tensor_value_at(tensor, i));
        }
        return ary;
    }

    size_t dim = tensor->dims[depth];
    VALUE ary = rb_ary_new_capa((long)dim);
    if (depth == tensor->rank - 1) {
        // Last dimension: return scalars
        for (size_t i = 0; i < dim; i++) {
            size_t index = base_index + i * strides[depth];
            rb_ary_push(ary, tensor_value_at(tensor, index));
        }
    } else {
        // Nested arrays
        for (size_t i = 0; i < dim; i++) {
            size_t next_base = base_index + i * strides[depth];
            rb_ary_push(ary, tensor_to_a_recursive(tensor, depth + 1, next_base, strides));
        }
    }
    return ary;
}

// Convert tensor to nested Ruby arrays
VALUE matrix_to_a(const Matrix *matrix) {
    if (matrix->rank <= 1 || !matrix->dims) {
        // 0D/1D fallback: flat array
        VALUE ary = rb_ary_new();
        size_t total = tensor_numel(matrix);
        for (size_t i = 0; i < total; i++) {
            rb_ary_push(ary, tensor_value_at(matrix, i));
        }
        return ary;
    }

    size_t *strides = malloc(matrix->rank * sizeof(size_t));
    if (!strides) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor strides");
    }
    tensor_compute_strides(matrix, strides);
    VALUE result = tensor_to_a_recursive(matrix, 0, 0, strides);
    free(strides);
    return result;
}

// Ruby method bindings
VALUE rb_tensor_class = Qnil;

// Allocate a new Matrix (Ruby allocator function)
VALUE rb_matrix_allocate(VALUE klass) {
    Matrix *matrix = malloc(sizeof(Matrix));
    if (!matrix) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for matrix");
    }
    matrix->rows = 0;
    matrix->cols = 0;
    matrix->data = NULL;
    matrix->dtype = DTYPE_FLOAT64; // Default to FLOAT64
    matrix->rank = 0;
    matrix->dims = NULL;
    return Data_Wrap_Struct(klass, NULL, matrix_free, matrix);
}

// Ruby method to initialize a Matrix
VALUE rb_matrix_initialize(int argc, VALUE *argv, VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    if (matrix->data) {
        free(matrix->data);
        matrix->data = NULL;
    }
    if (matrix->dims) {
        free(matrix->dims);
        matrix->dims = NULL;
    }

    // Parse arguments
    VALUE rows, cols, kwargs;
    rb_scan_args(argc, argv, "2:", &rows, &cols, &kwargs);

    size_t r = NUM2SIZET(rows);
    size_t c = NUM2SIZET(cols);

    if (r == 0 || c == 0) {
        rb_raise(rb_eArgError, "Rows and columns must be greater than zero");
    }

    matrix->rows = r;
    matrix->cols = c;
    matrix->rank = 2;

    // Default dtype is float32
    matrix->dtype = DTYPE_FLOAT32;

    // Parse dtype from kwargs
    if (!NIL_P(kwargs)) {
        VALUE dtype_arg = rb_hash_aref(kwargs, ID2SYM(rb_intern("dtype")));
        if (!NIL_P(dtype_arg)) {
            const char *dtype_str = StringValueCStr(dtype_arg);
            if (strcmp(dtype_str, "float64") == 0) {
                matrix->dtype = DTYPE_FLOAT64;
            } else if (strcmp(dtype_str, "float32") == 0) {
                matrix->dtype = DTYPE_FLOAT32;
            } else if (strcmp(dtype_str, "int16") == 0) {
                matrix->dtype = DTYPE_INT16;
            } else if (strcmp(dtype_str, "int8") == 0) {
                matrix->dtype = DTYPE_INT8;
            } else {
                rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
            }
        }
    }

    // Allocate memory based on dtype
    if (matrix->dtype == DTYPE_FLOAT64) {
        matrix->data = calloc(r * c, sizeof(double));
    } else if (matrix->dtype == DTYPE_FLOAT32) {
        matrix->data = calloc(r * c, sizeof(float));
    } else if (matrix->dtype == DTYPE_INT16) {
        matrix->data = calloc(r * c, sizeof(int16_t));
    } else if (matrix->dtype == DTYPE_INT8) {
        matrix->data = calloc(r * c, sizeof(int8_t));
    } else {
        rb_raise(rb_eArgError, "Unsupported data type");
    }

    if (!matrix->data) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for matrix data");
    }

    // Initialize dims for 2D tensor (matrix)
    matrix->dims = malloc(2 * sizeof(size_t));
    if (!matrix->dims) {
        free(matrix->data);
        matrix->data = NULL;
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }
    matrix->dims[0] = r;
    matrix->dims[1] = c;

    return self;
}
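
/* Usage sketch from Ruby (illustrative): elements are zero-filled via
 * calloc; note the constructor defaults to float32, unlike from_arrays
 * further below:
 *
 *   m = Tensor.new(2, 3)                    # 2x3, float32, all zeros
 *   m = Tensor.new(2, 3, dtype: "float64")  # explicit dtype (string)
 *   m.shape  # => [2, 3]
 */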

Matrix *matrix_convert_dtype(const Matrix *input, DataType new_dtype) {
    Matrix *result = matrix_new(input->rows, input->cols, new_dtype);
    size_t size = input->rows * input->cols;

    // Same-dtype conversion is just a copy; without this, none of the
    // branches below would match and the result would stay zero-filled
    if (input->dtype == new_dtype) {
        size_t elem_size =
            (new_dtype == DTYPE_FLOAT64) ? sizeof(double)  :
            (new_dtype == DTYPE_FLOAT32) ? sizeof(float)   :
            (new_dtype == DTYPE_INT16)   ? sizeof(int16_t) : sizeof(int8_t);
        memcpy(result->data, input->data, size * elem_size);
        return result;
    }

    if (input->dtype == DTYPE_FLOAT64) {
        double *in_data = (double *)input->data;
        if (new_dtype == DTYPE_FLOAT32) {
            float *out_data = (float *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (float)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT16) {
            int16_t *out_data = (int16_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int16_t)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT8) {
            int8_t *out_data = (int8_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int8_t)in_data[i];
            }
        }
    } else if (input->dtype == DTYPE_FLOAT32) {
        float *in_data = (float *)input->data;
        if (new_dtype == DTYPE_FLOAT64) {
            double *out_data = (double *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (double)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT16) {
            int16_t *out_data = (int16_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int16_t)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT8) {
            int8_t *out_data = (int8_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int8_t)in_data[i];
            }
        }
    } else if (input->dtype == DTYPE_INT16) {
        int16_t *in_data = (int16_t *)input->data;
        if (new_dtype == DTYPE_FLOAT64) {
            double *out_data = (double *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (double)in_data[i];
            }
        } else if (new_dtype == DTYPE_FLOAT32) {
            float *out_data = (float *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (float)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT8) {
            int8_t *out_data = (int8_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int8_t)in_data[i];
            }
        }
    } else if (input->dtype == DTYPE_INT8) {
        int8_t *in_data = (int8_t *)input->data;
        if (new_dtype == DTYPE_FLOAT64) {
            double *out_data = (double *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (double)in_data[i];
            }
        } else if (new_dtype == DTYPE_FLOAT32) {
            float *out_data = (float *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (float)in_data[i];
            }
        } else if (new_dtype == DTYPE_INT16) {
            int16_t *out_data = (int16_t *)result->data;
            for (size_t i = 0; i < size; i++) {
                out_data[i] = (int16_t)in_data[i];
            }
        }
    }

    return result;
}

// Ruby method for dtype conversion
VALUE rb_matrix_convert_dtype(VALUE self, VALUE dtype_arg) {
    Matrix *input;
    Data_Get_Struct(self, Matrix, input);

    const char *dtype_str = StringValueCStr(dtype_arg);
    DataType new_dtype;
    if (strcmp(dtype_str, "float64") == 0) {
        new_dtype = DTYPE_FLOAT64;
    } else if (strcmp(dtype_str, "float32") == 0) {
        new_dtype = DTYPE_FLOAT32;
    } else if (strcmp(dtype_str, "int16") == 0) {
        new_dtype = DTYPE_INT16;
    } else if (strcmp(dtype_str, "int8") == 0) {
        new_dtype = DTYPE_INT8;
    } else {
        rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
    }

    Matrix *result = matrix_convert_dtype(input, new_dtype);
    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}
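
/* Usage sketch from Ruby (illustrative): conversion to an integer dtype
 * truncates toward zero, matching the C casts above:
 *
 *   t = Tensor.from_array([[1.9, -2.7]], "float64")
 *   t.to_dtype("int8").to_a  # => [[1.0, -2.0]] (stored as int8, read back as Numeric)
 */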

// Ruby method to access a single element of the matrix
VALUE rb_matrix_get_element(int argc, VALUE *argv, VALUE self) {
    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);

    if (tensor->rank == 0 || !tensor->dims) {
        rb_raise(rb_eArgError, "Tensor has no dimensions");
    }

    if ((size_t)argc != tensor->rank) {
        rb_raise(rb_eArgError, "Expected %zu indices, got %d", tensor->rank, argc);
    }

    size_t *strides = malloc(tensor->rank * sizeof(size_t));
    if (!strides) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor strides");
    }
    tensor_compute_strides(tensor, strides);

    size_t index = 0;
    for (size_t d = 0; d < tensor->rank; d++) {
        long idx = NUM2LONG(argv[d]);
        long dim = (long)tensor->dims[d];
        if (idx < 0) {
            idx += dim; // negative indexing from the end
        }
        if (idx < 0 || idx >= dim) {
            free(strides);
            rb_raise(rb_eArgError, "Index out of bounds");
        }
        index += (size_t)idx * strides[d];
    }

    VALUE result = tensor_value_at(tensor, index);
    free(strides);
    return result;
}

// Ruby method to set a single element of the matrix
VALUE rb_matrix_set_element(int argc, VALUE *argv, VALUE self) {
    if (argc < 1) {
        rb_raise(rb_eArgError, "Value required for assignment");
    }

    VALUE value = argv[argc - 1];
    int index_count = argc - 1;

    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);

    if (tensor->rank == 0 || !tensor->dims) {
        rb_raise(rb_eArgError, "Tensor has no dimensions");
    }

    if ((size_t)index_count != tensor->rank) {
        rb_raise(rb_eArgError, "Expected %zu indices, got %d", tensor->rank, index_count);
    }

    size_t *strides = malloc(tensor->rank * sizeof(size_t));
    if (!strides) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor strides");
    }
    tensor_compute_strides(tensor, strides);

    size_t index = 0;
    for (size_t d = 0; d < tensor->rank; d++) {
        long idx = NUM2LONG(argv[d]);
        long dim = (long)tensor->dims[d];
        if (idx < 0) {
            idx += dim;
        }
        if (idx < 0 || idx >= dim) {
            free(strides);
            rb_raise(rb_eArgError, "Index out of bounds");
        }
        index += (size_t)idx * strides[d];
    }

    tensor_set_value_at(tensor, index, value);
    free(strides);
    return Qnil;
}
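
/* Usage sketch from Ruby (illustrative): one index per dimension, with
 * Ruby-style negative indexing from the end:
 *
 *   m = Tensor.new(2, 2, dtype: "float64")
 *   m[0, 1] = 5.0
 *   m[0, 1]    # => 5.0
 *   m[-1, -1]  # => 0.0 (last row, last column)
 */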

// Ruby method for matrix multiplication
VALUE rb_matrix_multiply(VALUE self, VALUE other) {
    Matrix *a, *b;
    Data_Get_Struct(self, Matrix, a);
    Data_Get_Struct(other, Matrix, b);

    Matrix *result = matrix_multiply(a, b);
    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for matrix subtraction
VALUE rb_matrix_subtract(VALUE self, VALUE other) {
    Matrix *a, *b;
    Data_Get_Struct(self, Matrix, a);
    Data_Get_Struct(other, Matrix, b);

    if (a->rows != b->rows || a->cols != b->cols || a->dtype != b->dtype) {
        rb_raise(rb_eArgError, "Matrices must have the same dimensions and data types for subtraction");
    }

    if (a->dtype != DTYPE_FLOAT64 && a->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "subtract supports only float32/float64 matrices");
    }

    Matrix *result = matrix_new(a->rows, a->cols, a->dtype);
    size_t num_threads = get_num_threads();

    if (a->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < a->cols; j++) {
                ((double *)result->data)[i * a->cols + j] =
                    ((double *)a->data)[i * a->cols + j] -
                    ((double *)b->data)[i * a->cols + j];
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < a->cols; j++) {
                ((float *)result->data)[i * a->cols + j] =
                    ((float *)a->data)[i * a->cols + j] -
                    ((float *)b->data)[i * a->cols + j];
            }
        }
    }

    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for ReLU activation
VALUE rb_matrix_relu(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    matrix_relu(matrix);
    return self;
}

// Ruby method for ReLU gradient
VALUE rb_matrix_relu_grad(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    Matrix *result = matrix_relu_grad(matrix);
    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for transpose
VALUE rb_matrix_transpose(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    if (matrix->dtype != DTYPE_FLOAT64 && matrix->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "transpose supports only float32/float64 matrices");
    }

    Matrix *result = matrix_new(matrix->cols, matrix->rows, matrix->dtype);
    size_t num_threads = get_num_threads();

    if (matrix->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < matrix->rows; i++) {
            for (size_t j = 0; j < matrix->cols; j++) {
                ((double *)result->data)[j * result->cols + i] =
                    ((double *)matrix->data)[i * matrix->cols + j];
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < matrix->rows; i++) {
            for (size_t j = 0; j < matrix->cols; j++) {
                ((float *)result->data)[j * result->cols + i] =
                    ((float *)matrix->data)[i * matrix->cols + j];
            }
        }
    }

    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for Hadamard product
VALUE rb_matrix_hadamard(VALUE self, VALUE other) {
    Matrix *a, *b;
    Data_Get_Struct(self, Matrix, a);
    Data_Get_Struct(other, Matrix, b);

    if (a->rows != b->rows || a->cols != b->cols || a->dtype != b->dtype) {
        rb_raise(rb_eArgError, "Matrices must have the same dimensions and data types for Hadamard product");
    }

    if (a->dtype != DTYPE_FLOAT64 && a->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "hadamard supports only float32/float64 matrices");
    }

    Matrix *result = matrix_new(a->rows, a->cols, a->dtype);
    size_t num_threads = get_num_threads();

    if (a->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < a->cols; j++) {
                ((double *)result->data)[i * a->cols + j] =
                    ((double *)a->data)[i * a->cols + j] *
                    ((double *)b->data)[i * a->cols + j];
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < a->rows; i++) {
            for (size_t j = 0; j < a->cols; j++) {
                ((float *)result->data)[i * a->cols + j] =
                    ((float *)a->data)[i * a->cols + j] *
                    ((float *)b->data)[i * a->cols + j];
            }
        }
    }

    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby method for scalar multiplication
VALUE rb_matrix_scale(VALUE self, VALUE scalar) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);

    if (matrix->dtype != DTYPE_FLOAT64 && matrix->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "scale supports only float32/float64 matrices");
    }

    double s = NUM2DBL(scalar);
    Matrix *result = matrix_new(matrix->rows, matrix->cols, matrix->dtype);
    size_t num_threads = get_num_threads();

    if (matrix->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < matrix->rows; i++) {
            for (size_t j = 0; j < matrix->cols; j++) {
                ((double *)result->data)[i * matrix->cols + j] =
                    ((double *)matrix->data)[i * matrix->cols + j] * s;
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) num_threads(num_threads)
        for (size_t i = 0; i < matrix->rows; i++) {
            for (size_t j = 0; j < matrix->cols; j++) {
                ((float *)result->data)[i * matrix->cols + j] =
                    ((float *)matrix->data)[i * matrix->cols + j] * (float)s;
            }
        }
    }

    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}
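
/* Usage sketch from Ruby (illustrative): subtract and hadamard are
 * elementwise and require matching shape and dtype; scale takes a scalar:
 *
 *   a = Tensor.from_array([[4.0, 9.0]], "float64")
 *   b = Tensor.from_array([[1.0, 2.0]], "float64")
 *   a.subtract(b).to_a  # => [[3.0, 7.0]]
 *   a.hadamard(b).to_a  # => [[4.0, 18.0]]
 *   a.scale(0.5).to_a   # => [[2.0, 4.5]]
 *   a.transpose.shape   # => [2, 1]
 */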

// Ruby method to convert the tensor to nested Ruby arrays
VALUE rb_matrix_to_a(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);
    return matrix_to_a(matrix);
}

// Ruby methods for tensor metadata
VALUE rb_tensor_shape(VALUE self) {
    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);

    if (tensor->rank == 0 || !tensor->dims) {
        return rb_ary_new();
    }

    VALUE ary = rb_ary_new_capa((long)tensor->rank);
    for (size_t i = 0; i < tensor->rank; i++) {
        rb_ary_push(ary, SIZET2NUM(tensor->dims[i]));
    }
    return ary;
}

VALUE rb_tensor_rank(VALUE self) {
    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);
    return SIZET2NUM(tensor->rank);
}

VALUE rb_tensor_size(VALUE self) {
    Matrix *tensor;
    Data_Get_Struct(self, Matrix, tensor);
    return SIZET2NUM(tensor_numel(tensor));
}

// Ruby method to extract a specific row from the matrix
VALUE rb_matrix_row(VALUE self, VALUE row_index) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);
    size_t r = NUM2SIZET(row_index);

    if (r >= matrix->rows) {
        rb_raise(rb_eArgError, "Row index out of bounds");
    }

    VALUE row = rb_ary_new();
    for (size_t j = 0; j < matrix->cols; j++) {
        size_t index = r * matrix->cols + j;
        if (matrix->dtype == DTYPE_FLOAT64) {
            rb_ary_push(row, DBL2NUM(((double *)matrix->data)[index]));
        } else if (matrix->dtype == DTYPE_FLOAT32) {
            rb_ary_push(row, DBL2NUM((double)((float *)matrix->data)[index]));
        } else if (matrix->dtype == DTYPE_INT16) {
            rb_ary_push(row, DBL2NUM((double)((int16_t *)matrix->data)[index]));
        } else if (matrix->dtype == DTYPE_INT8) {
            rb_ary_push(row, DBL2NUM((double)((int8_t *)matrix->data)[index]));
        } else {
            rb_raise(rb_eArgError, "Unsupported data type in row");
        }
    }
    return row;
}

// Ruby method to get the number of rows
VALUE rb_matrix_row_count(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);
    return SIZET2NUM(matrix->rows);
}

// Ruby method to get the number of columns
VALUE rb_matrix_column_count(VALUE self) {
    Matrix *matrix;
    Data_Get_Struct(self, Matrix, matrix);
    return SIZET2NUM(matrix->cols);
}

// Create a matrix from a 2D array (Ruby Array of Arrays)
Matrix *matrix_from_arrays(VALUE arrays, DataType dtype) {
    Check_Type(arrays, T_ARRAY);
    size_t rows = RARRAY_LEN(arrays);

    if (rows == 0) {
        rb_raise(rb_eArgError, "arrays must contain at least one row");
    }

    VALUE first_row = rb_ary_entry(arrays, 0);
    Check_Type(first_row, T_ARRAY);
    size_t cols = RARRAY_LEN(first_row);

    if (cols == 0) {
        rb_raise(rb_eArgError, "arrays must contain at least one column");
    }

    if (dtype != DTYPE_FLOAT64 && dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "from_arrays currently supports only float32/float64 dtypes");
    }

    // Allocate a new matrix with the specified data type
    Matrix *matrix = matrix_new(rows, cols, dtype);

    // Populate the matrix with data from the Ruby array
    for (size_t i = 0; i < rows; i++) {
        VALUE row = rb_ary_entry(arrays, i);
        Check_Type(row, T_ARRAY);
        if (RARRAY_LEN(row) != cols) {
            rb_raise(rb_eArgError, "all rows must have the same length");
        }
        for (size_t j = 0; j < cols; j++) {
            VALUE elem = rb_ary_entry(row, j);
            if (dtype == DTYPE_FLOAT64) {
                ((double *)matrix->data)[i * cols + j] = NUM2DBL(elem);
            } else { // DTYPE_FLOAT32
                ((float *)matrix->data)[i * cols + j] = (float)NUM2DBL(elem);
            }
        }
    }
    return matrix;
}

// Ruby class method to create a matrix from a 2D array
VALUE rb_matrix_from_arrays(int argc, VALUE *argv, VALUE klass) {
    VALUE arrays, dtype;
    rb_scan_args(argc, argv, "11", &arrays, &dtype); // 1 required argument, 1 optional

    // Unlike the string dtypes used elsewhere, this method takes a numeric
    // width: 32 selects float32; nil or any other value selects float64
    DataType dt = (NIL_P(dtype)) ? DTYPE_FLOAT64 : (NUM2INT(dtype) == 32) ? DTYPE_FLOAT32 : DTYPE_FLOAT64;

    // Create the matrix and wrap it in a Ruby object
    Matrix *matrix = matrix_from_arrays(arrays, dt);
    return Data_Wrap_Struct(klass, NULL, matrix_free, matrix);
}
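
/* Usage sketch from Ruby (illustrative):
 *
 *   m = Tensor.from_arrays([[1.0, 2.0], [3.0, 4.0]])       # float64
 *   m32 = Tensor.from_arrays([[1.0, 2.0], [3.0, 4.0]], 32) # float32
 *   m.row(0)  # => [1.0, 2.0]
 */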

// Helper to allocate a tensor with arbitrary shape
static Matrix *tensor_new_with_shape(size_t rank, const size_t *dims, DataType dtype) {
    if (rank == 0) {
        rb_raise(rb_eArgError, "Shape must have at least one dimension");
    }

    size_t total = 1;
    for (size_t i = 0; i < rank; i++) {
        if (dims[i] == 0) {
            rb_raise(rb_eArgError, "All tensor dimensions must be greater than zero");
        }
        total *= dims[i];
    }

    Matrix *tensor = malloc(sizeof(Matrix));
    if (!tensor) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor");
    }

    tensor->rank = rank;
    tensor->dims = malloc(rank * sizeof(size_t));
    if (!tensor->dims) {
        free(tensor);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }
    for (size_t i = 0; i < rank; i++) {
        tensor->dims[i] = dims[i];
    }

    // For compatibility with 2D code, set rows/cols so that rows * cols == total elements
    if (rank == 1) {
        tensor->rows = dims[0];
        tensor->cols = 1;
    } else {
        tensor->rows = dims[0];
        tensor->cols = total / tensor->rows;
    }

    tensor->dtype = dtype;

    if (dtype == DTYPE_FLOAT64) {
        tensor->data = calloc(total, sizeof(double));
    } else if (dtype == DTYPE_FLOAT32) {
        tensor->data = calloc(total, sizeof(float));
    } else if (dtype == DTYPE_INT16) {
        tensor->data = calloc(total, sizeof(int16_t));
    } else if (dtype == DTYPE_INT8) {
        tensor->data = calloc(total, sizeof(int8_t));
    } else {
        free(tensor->dims);
        free(tensor);
        rb_raise(rb_eArgError, "Unsupported data type");
    }

    if (!tensor->data) {
        free(tensor->dims);
        free(tensor);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor data");
    }

    return tensor;
}

// Infer tensor shape from a nested Ruby array
static void infer_shape(VALUE array, size_t *rank_out, size_t **dims_out) {
    VALUE current = array;
    size_t rank = 0;

    // First pass: follow first elements to determine rank and dimensions
    while (RB_TYPE_P(current, T_ARRAY)) {
        long len = RARRAY_LEN(current);
        if (len == 0) {
            rb_raise(rb_eArgError, "All tensor dimensions must be greater than zero");
        }
        rank++;
        current = rb_ary_entry(current, 0);
    }

    size_t *dims = malloc(rank * sizeof(size_t));
    if (!dims) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }

    current = array;
    for (size_t i = 0; i < rank; i++) {
        Check_Type(current, T_ARRAY);
        long len = RARRAY_LEN(current);
        if (len <= 0) {
            free(dims);
            rb_raise(rb_eArgError, "All tensor dimensions must be greater than zero");
        }
        dims[i] = (size_t)len;
        current = rb_ary_entry(current, 0);
    }

    *rank_out = rank;
    *dims_out = dims;
}

// Recursive helper to fill a tensor from a nested Ruby array
static void tensor_fill_from_array(VALUE value, Matrix *tensor, size_t depth, size_t base_index, const size_t *strides) {
    Check_Type(value, T_ARRAY);
    if ((size_t)RARRAY_LEN(value) != tensor->dims[depth]) {
        rb_raise(rb_eArgError, "All inner arrays must have the same length");
    }

    if (depth == tensor->rank - 1) {
        // Last dimension: store scalars
        for (size_t i = 0; i < tensor->dims[depth]; i++) {
            VALUE elem = rb_ary_entry(value, (long)i);
            size_t index = base_index + i * strides[depth];
            tensor_set_value_at(tensor, index, elem);
        }
    } else {
        // Recurse into nested arrays
        for (size_t i = 0; i < tensor->dims[depth]; i++) {
            VALUE sub = rb_ary_entry(value, (long)i);
            size_t next_base = base_index + i * strides[depth];
            tensor_fill_from_array(sub, tensor, depth + 1, next_base, strides);
        }
    }
}

// Ruby class method to create a tensor from a nested Ruby array (N-D)
VALUE rb_tensor_from_array(int argc, VALUE *argv, VALUE klass) {
    VALUE array, dtype_arg;
    rb_scan_args(argc, argv, "11", &array, &dtype_arg);

    Check_Type(array, T_ARRAY);

    const char *dtype_str = NIL_P(dtype_arg) ? "float32" : StringValueCStr(dtype_arg);
    DataType dtype;
    if (strcmp(dtype_str, "float64") == 0) {
        dtype = DTYPE_FLOAT64;
    } else if (strcmp(dtype_str, "float32") == 0) {
        dtype = DTYPE_FLOAT32;
    } else if (strcmp(dtype_str, "int16") == 0) {
        dtype = DTYPE_INT16;
    } else if (strcmp(dtype_str, "int8") == 0) {
        dtype = DTYPE_INT8;
    } else {
        rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
    }

    size_t rank;
    size_t *dims;
    infer_shape(array, &rank, &dims);

    Matrix *tensor = tensor_new_with_shape(rank, dims, dtype);
    size_t *strides = malloc(rank * sizeof(size_t));
    if (!strides) {
        free(dims);
        matrix_free(tensor);
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor strides");
    }
    tensor_compute_strides(tensor, strides);

    tensor_fill_from_array(array, tensor, 0, 0, strides);

    free(dims);
    free(strides);

    return Data_Wrap_Struct(klass, NULL, matrix_free, tensor);
}
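
/* Usage sketch from Ruby (illustrative): shape is inferred from the
 * nesting, so a 2x2x2 nested array yields a rank-3 tensor:
 *
 *   t = Tensor.from_array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], "int16")
 *   t.rank      # => 3
 *   t.shape     # => [2, 2, 2]
 *   t.size      # => 8
 *   t[1, 0, 1]  # => 6.0
 */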

// Softmax computation for a matrix
Matrix *matrix_softmax(const Matrix *input) {
    if (input->dtype != DTYPE_FLOAT64 && input->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "softmax supports only float32/float64 matrices");
    }

    // Create a result matrix with the same dimensions as the input
    Matrix *result = matrix_new(input->rows, input->cols, input->dtype);

    if (input->dtype == DTYPE_FLOAT64) {
        for (size_t i = 0; i < input->rows; i++) {
            // Find the maximum value in the row for numerical stability
            double max_val = -INFINITY;
            for (size_t j = 0; j < input->cols; j++) {
                double val = ((double *)input->data)[i * input->cols + j];
                if (val > max_val) max_val = val;
            }

            // Compute the sum of exponentials
            double sum = 0.0;
            for (size_t j = 0; j < input->cols; j++) {
                double exp_val = exp(((double *)input->data)[i * input->cols + j] - max_val);
                sum += exp_val;
                ((double *)result->data)[i * input->cols + j] = exp_val;
            }

            // Normalize by dividing each element by the sum
            for (size_t j = 0; j < input->cols; j++) {
                ((double *)result->data)[i * input->cols + j] /= sum;
            }
        }
    } else { // DTYPE_FLOAT32
        for (size_t i = 0; i < input->rows; i++) {
            // Find the maximum value in the row for numerical stability
            float max_val = -INFINITY;
            for (size_t j = 0; j < input->cols; j++) {
                float val = ((float *)input->data)[i * input->cols + j];
                if (val > max_val) max_val = val;
            }

            // Compute the sum of exponentials
            float sum = 0.0f;
            for (size_t j = 0; j < input->cols; j++) {
                float exp_val = expf(((float *)input->data)[i * input->cols + j] - max_val);
                sum += exp_val;
                ((float *)result->data)[i * input->cols + j] = exp_val;
            }

            // Normalize by dividing each element by the sum
            for (size_t j = 0; j < input->cols; j++) {
                ((float *)result->data)[i * input->cols + j] /= sum;
            }
        }
    }

    return result;
}
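
/* Usage sketch from Ruby (illustrative): softmax is applied row by row,
 * so each row of the result sums to 1:
 *
 *   logits = Tensor.from_array([[1.0, 1.0, 1.0], [0.0, 0.0, 100.0]], "float64")
 *   probs = logits.softmax
 *   probs.row(0)  # => [0.333.., 0.333.., 0.333..]
 *   probs.row(1)  # => [~0.0, ~0.0, ~1.0]
 */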

// Cross-entropy loss and gradient computation
typedef struct {
    double loss;
    Matrix *gradient;
} LossGradient;

LossGradient *matrix_cross_entropy_loss(const Matrix *probs, const int *labels, size_t batch_size) {
    if (probs->dtype != DTYPE_FLOAT64 && probs->dtype != DTYPE_FLOAT32) {
        rb_raise(rb_eArgError, "cross_entropy_loss supports only float32/float64 matrices");
    }

    // Validate labels up front: raising a Ruby exception from inside an
    // OpenMP worker thread is unsafe, so the parallel loops below may
    // assume every label is in range
    for (size_t i = 0; i < batch_size; i++) {
        if (labels[i] < 0 || (size_t)labels[i] >= probs->cols) {
            rb_raise(rb_eArgError, "Label index out of bounds");
        }
    }

    LossGradient *result = malloc(sizeof(LossGradient));
    if (!result) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for LossGradient");
    }

    result->gradient = matrix_new(probs->rows, probs->cols, probs->dtype);
    double loss = 0.0;

    if (probs->dtype == DTYPE_FLOAT64) {
        #pragma omp parallel for schedule(dynamic) reduction(+:loss) num_threads(get_num_threads())
        for (size_t i = 0; i < batch_size; i++) {
            double prob = ((double *)probs->data)[i * probs->cols + labels[i]];
            loss -= log(fmax(prob, 1e-7)); // Clamp with a small epsilon to avoid log(0)

            for (size_t j = 0; j < probs->cols; j++) {
                double grad_val = ((double *)probs->data)[i * probs->cols + j];
                if ((int)j == labels[i]) grad_val -= 1.0;
                ((double *)result->gradient->data)[i * probs->cols + j] = grad_val / (double)batch_size;
            }
        }
    } else { // DTYPE_FLOAT32
        #pragma omp parallel for schedule(dynamic) reduction(+:loss) num_threads(get_num_threads())
        for (size_t i = 0; i < batch_size; i++) {
            float prob = ((float *)probs->data)[i * probs->cols + labels[i]];
            loss -= logf(fmaxf(prob, 1e-7f)); // Clamp with a small epsilon to avoid log(0)

            for (size_t j = 0; j < probs->cols; j++) {
                float grad_val = ((float *)probs->data)[i * probs->cols + j];
                if ((int)j == labels[i]) grad_val -= 1.0f;
                ((float *)result->gradient->data)[i * probs->cols + j] = grad_val / (float)batch_size;
            }
        }
    }

    result->loss = loss / (double)batch_size;
    return result;
}
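
/* The stored gradient is (p_ij - [j == y_i]) / N, i.e. the derivative of
 * the mean cross-entropy with respect to the logits when `probs` is a
 * softmax output, which is how the wrapper below is meant to be used. */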

VALUE rb_matrix_cross_entropy_loss(VALUE self, VALUE labels) {
    Matrix *probs;
    Data_Get_Struct(self, Matrix, probs);

    Check_Type(labels, T_ARRAY);
    size_t batch_size = RARRAY_LEN(labels);

    if (batch_size != probs->rows) {
        rb_raise(rb_eArgError, "labels size must match number of rows in probabilities");
    }
    int *c_labels = malloc(batch_size * sizeof(int));
    if (!c_labels) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for labels");
    }

    for (size_t i = 0; i < batch_size; i++) {
        c_labels[i] = NUM2INT(rb_ary_entry(labels, i));
    }

    LossGradient *result = matrix_cross_entropy_loss(probs, c_labels, batch_size);
    free(c_labels);

    // Return an array containing [loss, gradient]
    VALUE ret = rb_ary_new2(2);
    rb_ary_store(ret, 0, DBL2NUM(result->loss));
    rb_ary_store(ret, 1, Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result->gradient));
    free(result);

    return ret;
}
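
/* Usage sketch from Ruby (illustrative): labels are class indices, one per
 * row of the (softmaxed) probability matrix:
 *
 *   probs = logits.softmax
 *   loss, grad = probs.cross_entropy_loss([0, 2, 1])
 *   loss        # => Float (mean negative log-likelihood)
 *   grad.shape  # => same as probs.shape
 */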

// Ruby method for softmax
VALUE rb_matrix_softmax(VALUE self) {
    Matrix *input;
    Data_Get_Struct(self, Matrix, input);

    Matrix *result = matrix_softmax(input);
    return Data_Wrap_Struct(rb_tensor_class, NULL, matrix_free, result);
}

// Ruby class method to build a matrix using a block
VALUE rb_matrix_build(int argc, VALUE *argv, VALUE klass) {
    VALUE rows, cols, kwargs;
    rb_scan_args(argc, argv, "2:", &rows, &cols, &kwargs);

    size_t r = NUM2SIZET(rows);
    size_t c = NUM2SIZET(cols);

    if (r == 0 || c == 0) {
        rb_raise(rb_eArgError, "Rows and columns must be greater than zero");
    }

    if (!rb_block_given_p()) {
        rb_raise(rb_eArgError, "Block is required for building the matrix");
    }

    // Default dtype is FLOAT64
    DataType dtype = DTYPE_FLOAT64;

    // Parse dtype from kwargs if provided
    if (!NIL_P(kwargs)) {
        VALUE dtype_arg = rb_hash_aref(kwargs, ID2SYM(rb_intern("dtype")));
        if (!NIL_P(dtype_arg)) {
            const char *dtype_str = StringValueCStr(dtype_arg);
            if (strcmp(dtype_str, "float64") == 0) {
                dtype = DTYPE_FLOAT64;
            } else if (strcmp(dtype_str, "float32") == 0) {
                dtype = DTYPE_FLOAT32;
            } else if (strcmp(dtype_str, "int16") == 0) {
                dtype = DTYPE_INT16;
            } else if (strcmp(dtype_str, "int8") == 0) {
                dtype = DTYPE_INT8;
            } else {
                rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
            }
        }
    }

    // Create a new matrix with the specified data type
    Matrix *matrix = matrix_new(r, c, dtype);

    // Populate the matrix using the block
    for (size_t i = 0; i < r; i++) {
        for (size_t j = 0; j < c; j++) {
            VALUE args[2] = {SIZET2NUM(i), SIZET2NUM(j)};
            VALUE result = rb_yield_values2(2, args);

            if (NIL_P(result)) {
                matrix_free(matrix);
                rb_raise(rb_eArgError, "Block must return a numeric value");
            }

            if (matrix->dtype == DTYPE_FLOAT64) {
                ((double *)matrix->data)[i * c + j] = NUM2DBL(result);
            } else if (matrix->dtype == DTYPE_FLOAT32) {
                ((float *)matrix->data)[i * c + j] = (float)NUM2DBL(result);
            } else if (matrix->dtype == DTYPE_INT16) {
                ((int16_t *)matrix->data)[i * c + j] = (int16_t)NUM2INT(result);
            } else if (matrix->dtype == DTYPE_INT8) {
                ((int8_t *)matrix->data)[i * c + j] = (int8_t)NUM2INT(result);
            }
        }
    }

    return Data_Wrap_Struct(klass, NULL, matrix_free, matrix);
}
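
/* Usage sketch from Ruby (illustrative): the block receives (row, column)
 * and its return value becomes that element; dtype defaults to float64 here:
 *
 *   identity = Tensor.build(3, 3) { |i, j| i == j ? 1.0 : 0.0 }
 *   ramp = Tensor.build(2, 4, dtype: "int16") { |i, j| i * 4 + j }
 */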

// Ruby class method: Tensor.zeros(shape:, dtype:)
VALUE rb_tensor_zeros(int argc, VALUE *argv, VALUE klass) {
    VALUE kwargs;
    rb_scan_args(argc, argv, "0:", &kwargs);

    if (NIL_P(kwargs)) {
        rb_raise(rb_eArgError, "Keyword arguments are required");
    }

    VALUE shape_val = rb_hash_aref(kwargs, ID2SYM(rb_intern("shape")));
    if (NIL_P(shape_val)) {
        rb_raise(rb_eArgError, "Missing keyword: shape");
    }
    Check_Type(shape_val, T_ARRAY);

    VALUE dtype_val = rb_hash_aref(kwargs, ID2SYM(rb_intern("dtype")));
    const char *dtype_str = NIL_P(dtype_val) ? "float32" : StringValueCStr(dtype_val);

    DataType dtype;
    if (strcmp(dtype_str, "float64") == 0) {
        dtype = DTYPE_FLOAT64;
    } else if (strcmp(dtype_str, "float32") == 0) {
        dtype = DTYPE_FLOAT32;
    } else if (strcmp(dtype_str, "int16") == 0) {
        dtype = DTYPE_INT16;
    } else if (strcmp(dtype_str, "int8") == 0) {
        dtype = DTYPE_INT8;
    } else {
        rb_raise(rb_eArgError, "Unsupported data type: %s", dtype_str);
    }

    size_t rank = (size_t)RARRAY_LEN(shape_val);
    if (rank == 0) {
        rb_raise(rb_eArgError, "Shape must have at least one dimension");
    }

    size_t *dims = malloc(rank * sizeof(size_t));
    if (!dims) {
        rb_raise(rb_eNoMemError, "Failed to allocate memory for tensor dimensions");
    }

    for (size_t i = 0; i < rank; i++) {
        VALUE dim_val = rb_ary_entry(shape_val, (long)i);
        size_t dim = NUM2SIZET(dim_val);
        if (dim == 0) {
            free(dims);
            rb_raise(rb_eArgError, "All tensor dimensions must be greater than zero");
        }
        dims[i] = dim;
    }

    Matrix *tensor = tensor_new_with_shape(rank, dims, dtype);
    free(dims);

    return Data_Wrap_Struct(klass, NULL, matrix_free, tensor);
}
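
/* Usage sketch from Ruby (illustrative): shape is a required keyword,
 * dtype defaults to float32:
 *
 *   t = Tensor.zeros(shape: [2, 3, 4], dtype: "float64")
 *   t.shape  # => [2, 3, 4]
 *   t.size   # => 24
 */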

void Init_tensor_ext(void) {
    // Define the Tensor class (and keep Matrix as an alias)
    rb_tensor_class = rb_define_class("Tensor", rb_cObject);
    rb_define_const(rb_cObject, "Matrix", rb_tensor_class);
    rb_define_alloc_func(rb_tensor_class, rb_matrix_allocate);

    // Class Methods
    rb_define_singleton_method(rb_tensor_class, "from_arrays", rb_matrix_from_arrays, -1);
    rb_define_singleton_method(rb_tensor_class, "build", rb_matrix_build, -1);
    rb_define_singleton_method(rb_tensor_class, "zeros", rb_tensor_zeros, -1);
    rb_define_singleton_method(rb_tensor_class, "from_array", rb_tensor_from_array, -1);

    // Instance Methods
    rb_define_method(rb_tensor_class, "initialize", rb_matrix_initialize, -1);
    rb_define_method(rb_tensor_class, "multiply", rb_matrix_multiply, 1);
    rb_define_method(rb_tensor_class, "subtract", rb_matrix_subtract, 1);
    rb_define_method(rb_tensor_class, "relu", rb_matrix_relu, 0);
    rb_define_method(rb_tensor_class, "relu_grad", rb_matrix_relu_grad, 0);
    rb_define_method(rb_tensor_class, "transpose", rb_matrix_transpose, 0);
    rb_define_method(rb_tensor_class, "hadamard", rb_matrix_hadamard, 1);
    rb_define_method(rb_tensor_class, "scale", rb_matrix_scale, 1);
    rb_define_method(rb_tensor_class, "to_a", rb_matrix_to_a, 0);
    rb_define_method(rb_tensor_class, "shape", rb_tensor_shape, 0);
    rb_define_method(rb_tensor_class, "rank", rb_tensor_rank, 0);
    rb_define_method(rb_tensor_class, "size", rb_tensor_size, 0);
    rb_define_method(rb_tensor_class, "row_count", rb_matrix_row_count, 0);
    rb_define_method(rb_tensor_class, "column_count", rb_matrix_column_count, 0);
    rb_define_method(rb_tensor_class, "row", rb_matrix_row, 1);
    rb_define_method(rb_tensor_class, "softmax", rb_matrix_softmax, 0);
    rb_define_method(rb_tensor_class, "cross_entropy_loss", rb_matrix_cross_entropy_loss, 1);
    rb_define_method(rb_tensor_class, "to_dtype", rb_matrix_convert_dtype, 1);
    rb_define_method(rb_tensor_class, "[]", rb_matrix_get_element, -1);   // Element getter
    rb_define_method(rb_tensor_class, "[]=", rb_matrix_set_element, -1); // Element setter

    // Alias Methods
    rb_define_alias(rb_tensor_class, "matmul", "multiply");
}
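
/* End-to-end sketch from Ruby (illustrative; the layer shapes and variable
 * names are made up for the example, not taken from the gem):
 *
 *   x = Tensor.from_array([[0.5, -0.2]], "float32")
 *   w = Tensor.zeros(shape: [2, 3])          # float32 by default
 *   h = x.matmul(w).relu                     # forward pass
 *   loss, grad = h.softmax.cross_entropy_loss([1])
 *   dw = x.transpose.matmul(grad)            # gradient for w
 *   w = w.subtract(dw.scale(0.1))            # SGD step
 */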