openarchx-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. openarchx/__init__.py +11 -0
  2. openarchx/core/tensor.py +179 -0
  3. openarchx/cuda/__init__.py +27 -0
  4. openarchx/cuda/cuda_ops.py +296 -0
  5. openarchx/layers/activations.py +63 -0
  6. openarchx/layers/base.py +40 -0
  7. openarchx/layers/cnn.py +145 -0
  8. openarchx/layers/transformer.py +131 -0
  9. openarchx/nn/__init__.py +26 -0
  10. openarchx/nn/activations.py +127 -0
  11. openarchx/nn/containers.py +174 -0
  12. openarchx/nn/dropout.py +121 -0
  13. openarchx/nn/layers.py +338 -0
  14. openarchx/nn/losses.py +156 -0
  15. openarchx/nn/module.py +18 -0
  16. openarchx/nn/padding.py +120 -0
  17. openarchx/nn/pooling.py +340 -0
  18. openarchx/nn/rnn.py +226 -0
  19. openarchx/nn/transformers.py +187 -0
  20. openarchx/optimizers/adam.py +49 -0
  21. openarchx/optimizers/adaptive.py +63 -0
  22. openarchx/optimizers/base.py +24 -0
  23. openarchx/optimizers/modern.py +98 -0
  24. openarchx/optimizers/optx.py +91 -0
  25. openarchx/optimizers/sgd.py +63 -0
  26. openarchx/quantum/circuit.py +92 -0
  27. openarchx/quantum/gates.py +126 -0
  28. openarchx/utils/__init__.py +50 -0
  29. openarchx/utils/data.py +229 -0
  30. openarchx/utils/huggingface.py +288 -0
  31. openarchx/utils/losses.py +21 -0
  32. openarchx/utils/model_io.py +553 -0
  33. openarchx/utils/pytorch.py +420 -0
  34. openarchx/utils/tensorflow.py +467 -0
  35. openarchx/utils/transforms.py +259 -0
  36. openarchx-0.1.0.dist-info/METADATA +180 -0
  37. openarchx-0.1.0.dist-info/RECORD +43 -0
  38. openarchx-0.1.0.dist-info/WHEEL +5 -0
  39. openarchx-0.1.0.dist-info/licenses/LICENSE +21 -0
  40. openarchx-0.1.0.dist-info/top_level.txt +2 -0
  41. tests/__init__.py +1 -0
  42. tests/test_cuda_ops.py +205 -0
  43. tests/test_integrations.py +236 -0
openarchx/nn/padding.py ADDED
@@ -0,0 +1,120 @@
+ import numpy as np
+ from ..core.tensor import Tensor
+ from .module import Module
+
+ class _ConstantPadNd(Module):
+     def __init__(self, value):
+         super().__init__()
+         self.value = value
+
+     def _pad_array(self, x, pad_width):
+         return np.pad(x.data, pad_width, mode='constant', constant_values=self.value)
+
+ class ConstantPad1d(_ConstantPadNd):
+     def __init__(self, padding, value=0):
+         super().__init__(value)
+         self.padding = padding if isinstance(padding, tuple) else (padding, padding)
+
+     def forward(self, x):
+         pad_width = ((0, 0),) * (len(x.data.shape) - 1) + (self.padding,)
+         return Tensor(self._pad_array(x, pad_width), requires_grad=True)
+
+ class ConstantPad2d(_ConstantPadNd):
+     def __init__(self, padding, value=0):
+         super().__init__(value)
+         if isinstance(padding, int):
+             self.padding = ((padding, padding), (padding, padding))
+         elif len(padding) == 2:
+             self.padding = ((padding[0], padding[0]), (padding[1], padding[1]))
+         else:
+             self.padding = ((padding[0], padding[1]), (padding[2], padding[3]))
+
+     def forward(self, x):
+         pad_width = ((0, 0), (0, 0)) + self.padding
+         return Tensor(self._pad_array(x, pad_width), requires_grad=True)
+
+ class ConstantPad3d(_ConstantPadNd):
+     def __init__(self, padding, value=0):
+         super().__init__(value)
+         if isinstance(padding, int):
+             self.padding = ((padding, padding), (padding, padding), (padding, padding))
+         elif len(padding) == 3:
+             self.padding = ((padding[0], padding[0]),
+                             (padding[1], padding[1]),
+                             (padding[2], padding[2]))
+         else:
+             self.padding = ((padding[0], padding[1]),
+                             (padding[2], padding[3]),
+                             (padding[4], padding[5]))
+
+     def forward(self, x):
+         pad_width = ((0, 0), (0, 0)) + self.padding
+         return Tensor(self._pad_array(x, pad_width), requires_grad=True)
+
+ class ReflectionPad1d(Module):
+     def __init__(self, padding):
+         super().__init__()
+         self.padding = padding if isinstance(padding, tuple) else (padding, padding)
+
+     def forward(self, x):
+         pad_width = ((0, 0),) * (len(x.data.shape) - 1) + (self.padding,)
+         return Tensor(np.pad(x.data, pad_width, mode='reflect'), requires_grad=True)
+
+ class ReflectionPad2d(Module):
+     def __init__(self, padding):
+         super().__init__()
+         if isinstance(padding, int):
+             self.padding = ((padding, padding), (padding, padding))
+         elif len(padding) == 2:
+             self.padding = ((padding[0], padding[0]), (padding[1], padding[1]))
+         else:
+             self.padding = ((padding[0], padding[1]), (padding[2], padding[3]))
+
+     def forward(self, x):
+         pad_width = ((0, 0), (0, 0)) + self.padding
+         return Tensor(np.pad(x.data, pad_width, mode='reflect'), requires_grad=True)
+
+ class ReplicationPad1d(Module):
+     def __init__(self, padding):
+         super().__init__()
+         self.padding = padding if isinstance(padding, tuple) else (padding, padding)
+
+     def forward(self, x):
+         pad_width = ((0, 0),) * (len(x.data.shape) - 1) + (self.padding,)
+         return Tensor(np.pad(x.data, pad_width, mode='edge'), requires_grad=True)
+
+ class ReplicationPad2d(Module):
+     def __init__(self, padding):
+         super().__init__()
+         if isinstance(padding, int):
+             self.padding = ((padding, padding), (padding, padding))
+         elif len(padding) == 2:
+             self.padding = ((padding[0], padding[0]), (padding[1], padding[1]))
+         else:
+             self.padding = ((padding[0], padding[1]), (padding[2], padding[3]))
+
+     def forward(self, x):
+         pad_width = ((0, 0), (0, 0)) + self.padding
+         return Tensor(np.pad(x.data, pad_width, mode='edge'), requires_grad=True)
+
+ class ReplicationPad3d(Module):
+     def __init__(self, padding):
+         super().__init__()
+         if isinstance(padding, int):
+             self.padding = ((padding, padding), (padding, padding), (padding, padding))
+         elif len(padding) == 3:
+             self.padding = ((padding[0], padding[0]),
+                             (padding[1], padding[1]),
+                             (padding[2], padding[2]))
+         else:
+             self.padding = ((padding[0], padding[1]),
+                             (padding[2], padding[3]),
+                             (padding[4], padding[5]))
+
+     def forward(self, x):
+         pad_width = ((0, 0), (0, 0)) + self.padding
+         return Tensor(np.pad(x.data, pad_width, mode='edge'), requires_grad=True)
+
+ class ZeroPad2d(ConstantPad2d):
+     def __init__(self, padding):
+         super().__init__(padding, value=0)
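For orientation, a minimal usage sketch of the padding modules above. This is a hypothetical snippet, not part of the package diff; it assumes Tensor(data, requires_grad=...) wraps a NumPy array as in openarchx/core/tensor.py, and that modules are callable the way the Linear layers in openarchx/nn/rnn.py are. Note that a 4-tuple passed to ConstantPad2d is applied as ((pad[0], pad[1]) on height, (pad[2], pad[3]) on width), which differs from torch.nn.ConstantPad2d's (left, right, top, bottom) order.

    # hypothetical usage sketch (not part of the package diff)
    import numpy as np
    from openarchx.core.tensor import Tensor
    from openarchx.nn.padding import ConstantPad2d, ReflectionPad2d

    x = Tensor(np.arange(16.0).reshape(1, 1, 4, 4), requires_grad=True)  # (N, C, H, W)

    pad = ConstantPad2d((1, 1, 2, 2), value=0)   # (1, 1) on height, (2, 2) on width
    print(pad(x).data.shape)                     # (1, 1, 6, 8)

    refl = ReflectionPad2d(1)                    # reflect requires padding < input size
    print(refl(x).data.shape)                    # (1, 1, 6, 6)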
openarchx/nn/pooling.py ADDED
@@ -0,0 +1,340 @@
+ import numpy as np
+ from ..core.tensor import Tensor
+ from .module import Module
+
+ class MaxPool1d(Module):
+     def __init__(self, kernel_size, stride=None, padding=0):
+         super().__init__()
+         self.kernel_size = kernel_size
+         self.stride = stride if stride is not None else kernel_size
+         self.padding = padding
+
+     def forward(self, x):
+         batch_size, channels, length = x.data.shape
+         # Zero-pad the last dimension if requested
+         if self.padding > 0:
+             x_padded = np.pad(x.data, ((0, 0), (0, 0), (self.padding, self.padding)), mode='constant')
+         else:
+             x_padded = x.data
+         out_length = (length + 2 * self.padding - self.kernel_size) // self.stride + 1
+         out = np.zeros((batch_size, channels, out_length))
+         # Slide the window and take the max of each region
+         for i in range(out_length):
+             start = i * self.stride
+             out[:, :, i] = np.max(x_padded[:, :, start:start + self.kernel_size], axis=2)
+         return Tensor(out, requires_grad=True)
+
+ class MaxPool2d(Module):
+     def __init__(self, kernel_size, stride=None, padding=0):
+         super().__init__()
+         self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
+         self.stride = stride if stride is not None else self.kernel_size
+         self.stride = self.stride if isinstance(self.stride, tuple) else (self.stride, self.stride)
+         self.padding = padding if isinstance(padding, tuple) else (padding, padding)
+
+     def forward(self, x):
+         batch_size, channels, height, width = x.data.shape
+         pad_h, pad_w = self.padding
+         stride_h, stride_w = self.stride
+         kernel_h, kernel_w = self.kernel_size
+
+         # Add padding if needed
+         if pad_h > 0 or pad_w > 0:
+             x_padded = np.pad(x.data, ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)), mode='constant')
+         else:
+             x_padded = x.data
+
+         # Calculate output dimensions
+         out_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
+         out_width = (width + 2 * pad_w - kernel_w) // stride_w + 1
+
+         # Prepare output array
+         out = np.zeros((batch_size, channels, out_height, out_width))
+
+         # Perform max pooling
+         for b in range(batch_size):
+             for c in range(channels):
+                 for h in range(out_height):
+                     for w in range(out_width):
+                         h_start = h * stride_h
+                         w_start = w * stride_w
+                         h_end = h_start + kernel_h
+                         w_end = w_start + kernel_w
+
+                         pool_region = x_padded[b, c, h_start:h_end, w_start:w_end]
+                         out[b, c, h, w] = np.max(pool_region)
+
+         return Tensor(out, requires_grad=True)
+
+ class AvgPool1d(Module):
+     def __init__(self, kernel_size, stride=None, padding=0):
+         super().__init__()
+         self.kernel_size = kernel_size
+         self.stride = stride if stride is not None else kernel_size
+         self.padding = padding
+
+     def forward(self, x):
+         batch_size, channels, length = x.data.shape
+         # Zero-pad the last dimension if requested
+         if self.padding > 0:
+             x_padded = np.pad(x.data, ((0, 0), (0, 0), (self.padding, self.padding)), mode='constant')
+         else:
+             x_padded = x.data
+         out_length = (length + 2 * self.padding - self.kernel_size) // self.stride + 1
+         out = np.zeros((batch_size, channels, out_length))
+         # Slide the window and average each region
+         for i in range(out_length):
+             start = i * self.stride
+             out[:, :, i] = np.mean(x_padded[:, :, start:start + self.kernel_size], axis=2)
+         return Tensor(out, requires_grad=True)
+
+ class AvgPool2d(Module):
+     def __init__(self, kernel_size, stride=None, padding=0):
+         super().__init__()
+         self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
+         self.stride = stride if stride is not None else self.kernel_size
+         self.stride = self.stride if isinstance(self.stride, tuple) else (self.stride, self.stride)
+         self.padding = padding if isinstance(padding, tuple) else (padding, padding)
+
+     def forward(self, x):
+         batch_size, channels, height, width = x.data.shape
+         pad_h, pad_w = self.padding
+         stride_h, stride_w = self.stride
+         kernel_h, kernel_w = self.kernel_size
+
+         # Add padding if needed
+         if pad_h > 0 or pad_w > 0:
+             x_padded = np.pad(x.data, ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)), mode='constant')
+         else:
+             x_padded = x.data
+
+         # Calculate output dimensions
+         out_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
+         out_width = (width + 2 * pad_w - kernel_w) // stride_w + 1
+
+         # Prepare output array
+         out = np.zeros((batch_size, channels, out_height, out_width))
+
+         # Perform average pooling
+         for b in range(batch_size):
+             for c in range(channels):
+                 for h in range(out_height):
+                     for w in range(out_width):
+                         h_start = h * stride_h
+                         w_start = w * stride_w
+                         h_end = h_start + kernel_h
+                         w_end = w_start + kernel_w
+
+                         pool_region = x_padded[b, c, h_start:h_end, w_start:w_end]
+                         out[b, c, h, w] = np.mean(pool_region)
+
+         return Tensor(out, requires_grad=True)
+
+ class AdaptiveAvgPool2d(Module):
+     def __init__(self, output_size):
+         super().__init__()
+         self.output_size = output_size if isinstance(output_size, tuple) else (output_size, output_size)
+
+     def forward(self, x):
+         batch_size, channels, height, width = x.data.shape
+         out_h, out_w = self.output_size
+
+         # Calculate the kernel and stride sizes
+         stride_h = height // out_h
+         stride_w = width // out_w
+         kernel_h = height - (out_h - 1) * stride_h
+         kernel_w = width - (out_w - 1) * stride_w
+
+         # Prepare output array
+         out = np.zeros((batch_size, channels, out_h, out_w))
+
+         # Perform adaptive average pooling
+         for b in range(batch_size):
+             for c in range(channels):
+                 for h in range(out_h):
+                     for w in range(out_w):
+                         h_start = h * stride_h
+                         w_start = w * stride_w
+                         h_end = min(h_start + kernel_h, height)
+                         w_end = min(w_start + kernel_w, width)
+
+                         pool_region = x.data[b, c, h_start:h_end, w_start:w_end]
+                         out[b, c, h, w] = np.mean(pool_region)
+
+         return Tensor(out, requires_grad=True)
+
+ class AdaptiveMaxPool2d(Module):
+     def __init__(self, output_size):
+         super().__init__()
+         self.output_size = output_size if isinstance(output_size, tuple) else (output_size, output_size)
+
+     def forward(self, x):
+         batch_size, channels, height, width = x.data.shape
+         out_h, out_w = self.output_size
+
+         # Calculate the kernel and stride sizes
+         stride_h = height // out_h
+         stride_w = width // out_w
+         kernel_h = height - (out_h - 1) * stride_h
+         kernel_w = width - (out_w - 1) * stride_w
+
+         # Prepare output array
+         out = np.zeros((batch_size, channels, out_h, out_w))
+
+         # Perform adaptive max pooling
+         for b in range(batch_size):
+             for c in range(channels):
+                 for h in range(out_h):
+                     for w in range(out_w):
+                         h_start = h * stride_h
+                         w_start = w * stride_w
+                         h_end = min(h_start + kernel_h, height)
+                         w_end = min(w_start + kernel_w, width)
+
+                         pool_region = x.data[b, c, h_start:h_end, w_start:w_end]
+                         out[b, c, h, w] = np.max(pool_region)
+
+         return Tensor(out, requires_grad=True)
+
+ class FractionalMaxPool2d(Module):
+     def __init__(self, kernel_size, output_size=None, output_ratio=None, return_indices=False):
+         super().__init__()
+         self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
+         self.output_size = output_size
+         self.output_ratio = output_ratio
+         self.return_indices = return_indices
+
+     def forward(self, x):
+         batch_size, channels, height, width = x.data.shape
+
+         if self.output_size is not None:
+             out_h, out_w = self.output_size
+         else:
+             out_h = int(height * self.output_ratio[0])
+             out_w = int(width * self.output_ratio[1])
+
+         # Generate evenly spaced pooling regions (deterministic stand-in for the random offsets of fractional pooling)
+         h_indices = np.linspace(0, height - self.kernel_size[0], out_h, dtype=int)
+         w_indices = np.linspace(0, width - self.kernel_size[1], out_w, dtype=int)
+
+         out = np.zeros((batch_size, channels, out_h, out_w))
+         indices = np.zeros((batch_size, channels, out_h, out_w, 2), dtype=int) if self.return_indices else None
+
+         for b in range(batch_size):
+             for c in range(channels):
+                 for i, h_idx in enumerate(h_indices):
+                     for j, w_idx in enumerate(w_indices):
+                         region = x.data[b, c,
+                                         h_idx:h_idx + self.kernel_size[0],
+                                         w_idx:w_idx + self.kernel_size[1]]
+                         out[b, c, i, j] = np.max(region)
+                         if self.return_indices:
+                             max_idx = np.unravel_index(np.argmax(region), region.shape)
+                             indices[b, c, i, j] = [h_idx + max_idx[0], w_idx + max_idx[1]]
+
+         if self.return_indices:
+             return Tensor(out, requires_grad=True), indices
+         return Tensor(out, requires_grad=True)
+
+ class FractionalMaxPool3d(Module):
+     def __init__(self, kernel_size, output_size=None, output_ratio=None, return_indices=False):
+         super().__init__()
+         self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size, kernel_size)
+         self.output_size = output_size
+         self.output_ratio = output_ratio
+         self.return_indices = return_indices
+
+     def forward(self, x):
+         batch_size, channels, depth, height, width = x.data.shape
+
+         if self.output_size is not None:
+             out_d, out_h, out_w = self.output_size
+         else:
+             out_d = int(depth * self.output_ratio[0])
+             out_h = int(height * self.output_ratio[1])
+             out_w = int(width * self.output_ratio[2])
+
+         # Generate evenly spaced pooling regions (deterministic stand-in for the random offsets of fractional pooling)
+         d_indices = np.linspace(0, depth - self.kernel_size[0], out_d, dtype=int)
+         h_indices = np.linspace(0, height - self.kernel_size[1], out_h, dtype=int)
+         w_indices = np.linspace(0, width - self.kernel_size[2], out_w, dtype=int)
+
+         out = np.zeros((batch_size, channels, out_d, out_h, out_w))
+         indices = np.zeros((batch_size, channels, out_d, out_h, out_w, 3), dtype=int) if self.return_indices else None
+
+         for b in range(batch_size):
+             for c in range(channels):
+                 for i, d_idx in enumerate(d_indices):
+                     for j, h_idx in enumerate(h_indices):
+                         for k, w_idx in enumerate(w_indices):
+                             region = x.data[b, c,
+                                             d_idx:d_idx + self.kernel_size[0],
+                                             h_idx:h_idx + self.kernel_size[1],
+                                             w_idx:w_idx + self.kernel_size[2]]
+                             out[b, c, i, j, k] = np.max(region)
+                             if self.return_indices:
+                                 max_idx = np.unravel_index(np.argmax(region), region.shape)
+                                 indices[b, c, i, j, k] = [d_idx + max_idx[0],
+                                                           h_idx + max_idx[1],
+                                                           w_idx + max_idx[2]]
+
+         if self.return_indices:
+             return Tensor(out, requires_grad=True), indices
+         return Tensor(out, requires_grad=True)
+
+ class LPPool1d(Module):
+     def __init__(self, norm_type, kernel_size, stride=None):
+         super().__init__()
+         self.norm_type = norm_type
+         self.kernel_size = kernel_size
+         self.stride = stride if stride is not None else kernel_size
+
+     def forward(self, x):
+         batch_size, channels, length = x.data.shape
+         out_length = (length - self.kernel_size) // self.stride + 1
+
+         out = np.zeros((batch_size, channels, out_length))
+
+         for b in range(batch_size):
+             for c in range(channels):
+                 for i in range(out_length):
+                     start_idx = i * self.stride
+                     end_idx = start_idx + self.kernel_size
+                     region = x.data[b, c, start_idx:end_idx]
+                     out[b, c, i] = np.power(np.sum(np.power(np.abs(region), self.norm_type)),
+                                             1.0 / self.norm_type)
+
+         return Tensor(out, requires_grad=True)
+
+ class LPPool2d(Module):
+     def __init__(self, norm_type, kernel_size, stride=None):
+         super().__init__()
+         self.norm_type = norm_type
+         self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
+         self.stride = stride if stride is not None else self.kernel_size
+
+     def forward(self, x):
+         batch_size, channels, height, width = x.data.shape
+         stride_h, stride_w = self.stride if isinstance(self.stride, tuple) else (self.stride, self.stride)
+         kernel_h, kernel_w = self.kernel_size
+
+         out_height = (height - kernel_h) // stride_h + 1
+         out_width = (width - kernel_w) // stride_w + 1
+
+         out = np.zeros((batch_size, channels, out_height, out_width))
+
+         for b in range(batch_size):
+             for c in range(channels):
+                 for i in range(out_height):
+                     for j in range(out_width):
+                         start_h = i * stride_h
+                         start_w = j * stride_w
+                         region = x.data[b, c,
+                                         start_h:start_h + kernel_h,
+                                         start_w:start_w + kernel_w]
+                         out[b, c, i, j] = np.power(
+                             np.sum(np.power(np.abs(region), self.norm_type)),
+                             1.0 / self.norm_type
+                         )
+
+         return Tensor(out, requires_grad=True)
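A shape sanity check for the pooling modules above. This is a hypothetical snippet with the same Tensor/callable assumptions as the padding example. The strided pools follow the usual output-size formula out = (in + 2*pad - kernel) // stride + 1; the LP pools reduce each window to (sum(|x|**p)) ** (1/p).

    # hypothetical usage sketch (not part of the package diff)
    import numpy as np
    from openarchx.core.tensor import Tensor
    from openarchx.nn.pooling import MaxPool2d, AdaptiveAvgPool2d, LPPool2d

    x = Tensor(np.random.randn(2, 3, 8, 8), requires_grad=True)

    pool = MaxPool2d(kernel_size=2, stride=2)    # (8 + 0 - 2) // 2 + 1 = 4
    print(pool(x).data.shape)                    # (2, 3, 4, 4)

    gap = AdaptiveAvgPool2d((1, 1))              # output size is fixed regardless of input resolution
    print(gap(x).data.shape)                     # (2, 3, 1, 1)

    lp = LPPool2d(norm_type=2, kernel_size=2)    # stride defaults to kernel_size
    print(lp(x).data.shape)                      # (2, 3, 4, 4)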
openarchx/nn/rnn.py ADDED
@@ -0,0 +1,226 @@
+ import numpy as np
+ from ..core.tensor import Tensor
+ from .module import Module
+ from .layers import Linear
+
+ class RNNCell(Module):
+     def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"):
+         super().__init__()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.nonlinearity = nonlinearity
+
+         self.ih = Linear(input_size, hidden_size, bias=bias)
+         self.hh = Linear(hidden_size, hidden_size, bias=bias)
+
+     def forward(self, x, h=None):
+         if h is None:
+             h = Tensor(np.zeros((x.data.shape[0], self.hidden_size)), requires_grad=True)
+
+         hidden = self.ih(x) + self.hh(h)
+         if self.nonlinearity == "tanh":
+             hidden = Tensor(np.tanh(hidden.data), requires_grad=True)
+         else:  # relu
+             hidden = Tensor(np.maximum(0, hidden.data), requires_grad=True)
+         return hidden
+
+ class LSTMCell(Module):
+     def __init__(self, input_size, hidden_size, bias=True):
+         super().__init__()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+
+         self.ih = Linear(input_size, 4 * hidden_size, bias=bias)
+         self.hh = Linear(hidden_size, 4 * hidden_size, bias=bias)
+
+     def forward(self, x, state=None):
+         if state is None:
+             h = Tensor(np.zeros((x.data.shape[0], self.hidden_size)), requires_grad=True)
+             c = Tensor(np.zeros((x.data.shape[0], self.hidden_size)), requires_grad=True)
+         else:
+             h, c = state
+
+         gates = self.ih(x) + self.hh(h)
+
+         # Split gates
+         i, f, g, o = np.split(gates.data, 4, axis=1)
+
+         # Apply activations
+         i = 1 / (1 + np.exp(-i))  # input gate
+         f = 1 / (1 + np.exp(-f))  # forget gate
+         g = np.tanh(g)            # cell gate
+         o = 1 / (1 + np.exp(-o))  # output gate
+
+         # Update cell state
+         c = Tensor(f * c.data + i * g, requires_grad=True)
+         # Compute output
+         h = Tensor(o * np.tanh(c.data), requires_grad=True)
+
+         return h, c
+
+ class GRUCell(Module):
+     def __init__(self, input_size, hidden_size, bias=True):
+         super().__init__()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+
+         self.ih = Linear(input_size, 3 * hidden_size, bias=bias)
+         self.hh = Linear(hidden_size, 3 * hidden_size, bias=bias)
+
+     def forward(self, x, h=None):
+         if h is None:
+             h = Tensor(np.zeros((x.data.shape[0], self.hidden_size)), requires_grad=True)
+
+         gi = self.ih(x)
+         gh = self.hh(h)
+
+         # Split gates
+         i_r, i_z, i_n = np.split(gi.data, 3, axis=1)
+         h_r, h_z, h_n = np.split(gh.data, 3, axis=1)
+
+         r = 1 / (1 + np.exp(-(i_r + h_r)))  # reset gate
+         z = 1 / (1 + np.exp(-(i_z + h_z)))  # update gate
+         n = np.tanh(i_n + r * h_n)          # new gate
+
+         h = Tensor((1 - z) * n + z * h.data, requires_grad=True)
+         return h
+
+ class RNN(Module):
+     def __init__(self, input_size, hidden_size, num_layers=1, bias=True,
+                  nonlinearity="tanh", bidirectional=False):
+         super().__init__()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.num_layers = num_layers
+         self.bidirectional = bidirectional
+
+         self.cells = []
+         for layer in range(num_layers):
+             layer_input_size = input_size if layer == 0 else hidden_size * (2 if bidirectional else 1)
+             self.cells.append(RNNCell(layer_input_size, hidden_size, bias, nonlinearity))
+             if bidirectional:
+                 self.cells.append(RNNCell(layer_input_size, hidden_size, bias, nonlinearity))
+
+     def forward(self, x, h=None):
+         # x has shape (batch, seq_len, input_size); note the reverse-direction cells below also read the sequence in forward time order, a simplification of a true bidirectional scan
+         seq_len = x.data.shape[1]
+         batch_size = x.data.shape[0]
+         num_directions = 2 if self.bidirectional else 1
+
+         if h is None:
+             h = [Tensor(np.zeros((batch_size, self.hidden_size)), requires_grad=True)
+                  for _ in range(self.num_layers * num_directions)]
+
+         output = []
+         for t in range(seq_len):
+             x_t = Tensor(x.data[:, t, :], requires_grad=True)
+
+             for layer in range(self.num_layers):
+                 idx = layer * num_directions
+                 h[idx] = self.cells[idx](x_t, h[idx])
+                 if self.bidirectional:
+                     h[idx + 1] = self.cells[idx + 1](x_t, h[idx + 1])
+
+                 # Prepare input for next layer
+                 if self.bidirectional:
+                     x_t = Tensor(np.concatenate([h[idx].data, h[idx + 1].data], axis=1), requires_grad=True)
+                 else:
+                     x_t = h[idx]
+
+             output.append(x_t.data)
+
+         # Stack outputs along sequence dimension
+         output = Tensor(np.stack(output, axis=1), requires_grad=True)
+         return output, h
+
+ class LSTM(Module):
+     def __init__(self, input_size, hidden_size, num_layers=1, bias=True, bidirectional=False):
+         super().__init__()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.num_layers = num_layers
+         self.bidirectional = bidirectional
+
+         self.cells = []
+         for layer in range(num_layers):
+             layer_input_size = input_size if layer == 0 else hidden_size * (2 if bidirectional else 1)
+             self.cells.append(LSTMCell(layer_input_size, hidden_size, bias))
+             if bidirectional:
+                 self.cells.append(LSTMCell(layer_input_size, hidden_size, bias))
+
+     def forward(self, x, state=None):
+         seq_len = x.data.shape[1]
+         batch_size = x.data.shape[0]
+         num_directions = 2 if self.bidirectional else 1
+
+         if state is None:
+             h = [Tensor(np.zeros((batch_size, self.hidden_size)), requires_grad=True)
+                  for _ in range(self.num_layers * num_directions)]
+             c = [Tensor(np.zeros((batch_size, self.hidden_size)), requires_grad=True)
+                  for _ in range(self.num_layers * num_directions)]
+         else:
+             h, c = state
+
+         output = []
+         for t in range(seq_len):
+             x_t = Tensor(x.data[:, t, :], requires_grad=True)
+
+             for layer in range(self.num_layers):
+                 idx = layer * num_directions
+                 h[idx], c[idx] = self.cells[idx](x_t, (h[idx], c[idx]))
+                 if self.bidirectional:
+                     h[idx + 1], c[idx + 1] = self.cells[idx + 1](x_t, (h[idx + 1], c[idx + 1]))
+
+                 if self.bidirectional:
+                     x_t = Tensor(np.concatenate([h[idx].data, h[idx + 1].data], axis=1), requires_grad=True)
+                 else:
+                     x_t = h[idx]
+
+             output.append(x_t.data)
+
+         output = Tensor(np.stack(output, axis=1), requires_grad=True)
+         return output, (h, c)
+
+ class GRU(Module):
+     def __init__(self, input_size, hidden_size, num_layers=1, bias=True, bidirectional=False):
+         super().__init__()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.num_layers = num_layers
+         self.bidirectional = bidirectional
+
+         self.cells = []
+         for layer in range(num_layers):
+             layer_input_size = input_size if layer == 0 else hidden_size * (2 if bidirectional else 1)
+             self.cells.append(GRUCell(layer_input_size, hidden_size, bias))
+             if bidirectional:
+                 self.cells.append(GRUCell(layer_input_size, hidden_size, bias))
+
+     def forward(self, x, h=None):
+         seq_len = x.data.shape[1]
+         batch_size = x.data.shape[0]
+         num_directions = 2 if self.bidirectional else 1
+
+         if h is None:
+             h = [Tensor(np.zeros((batch_size, self.hidden_size)), requires_grad=True)
+                  for _ in range(self.num_layers * num_directions)]
+
+         output = []
+         for t in range(seq_len):
+             x_t = Tensor(x.data[:, t, :], requires_grad=True)
+
+             for layer in range(self.num_layers):
+                 idx = layer * num_directions
+                 h[idx] = self.cells[idx](x_t, h[idx])
+                 if self.bidirectional:
+                     h[idx + 1] = self.cells[idx + 1](x_t, h[idx + 1])
+
+                 if self.bidirectional:
+                     x_t = Tensor(np.concatenate([h[idx].data, h[idx + 1].data], axis=1), requires_grad=True)
+                 else:
+                     x_t = h[idx]
+
+             output.append(x_t.data)
+
+         output = Tensor(np.stack(output, axis=1), requires_grad=True)
+         return output, h
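To close, a minimal recurrent forward pass through the LSTM above. This is a hypothetical snippet with the same assumptions as the earlier examples; weights come from whatever initialization the Linear layer in openarchx/nn/layers.py uses. output stacks the top layer's hidden state at every time step, while h holds the final hidden state of each layer.

    # hypothetical usage sketch (not part of the package diff)
    import numpy as np
    from openarchx.core.tensor import Tensor
    from openarchx.nn.rnn import LSTM

    x = Tensor(np.random.randn(4, 10, 16), requires_grad=True)  # (batch, seq_len, input_size)

    lstm = LSTM(input_size=16, hidden_size=32, num_layers=2)
    output, (h, c) = lstm(x)
    print(output.data.shape)  # (4, 10, 32)
    print(h[-1].data.shape)   # (4, 32), final hidden state of the top layer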