deepbox 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/LICENSE +21 -0
  2. package/README.md +344 -0
  3. package/dist/CSRMatrix-CwGwQRea.d.cts +219 -0
  4. package/dist/CSRMatrix-KzNt6QpS.d.ts +219 -0
  5. package/dist/Tensor-BQLk1ltW.d.cts +147 -0
  6. package/dist/Tensor-g8mUClel.d.ts +147 -0
  7. package/dist/chunk-4S73VUBD.js +677 -0
  8. package/dist/chunk-4S73VUBD.js.map +1 -0
  9. package/dist/chunk-5R4S63PF.js +2925 -0
  10. package/dist/chunk-5R4S63PF.js.map +1 -0
  11. package/dist/chunk-6AE5FKKQ.cjs +9264 -0
  12. package/dist/chunk-6AE5FKKQ.cjs.map +1 -0
  13. package/dist/chunk-AD436M45.js +3854 -0
  14. package/dist/chunk-AD436M45.js.map +1 -0
  15. package/dist/chunk-ALS7ETWZ.cjs +4263 -0
  16. package/dist/chunk-ALS7ETWZ.cjs.map +1 -0
  17. package/dist/chunk-AU7XHGKJ.js +2092 -0
  18. package/dist/chunk-AU7XHGKJ.js.map +1 -0
  19. package/dist/chunk-B5TNKUEY.js +1481 -0
  20. package/dist/chunk-B5TNKUEY.js.map +1 -0
  21. package/dist/chunk-BCR7G3A6.js +9136 -0
  22. package/dist/chunk-BCR7G3A6.js.map +1 -0
  23. package/dist/chunk-C4PKXY74.cjs +1917 -0
  24. package/dist/chunk-C4PKXY74.cjs.map +1 -0
  25. package/dist/chunk-DWZY6PIP.cjs +6400 -0
  26. package/dist/chunk-DWZY6PIP.cjs.map +1 -0
  27. package/dist/chunk-E3EU5FZO.cjs +2113 -0
  28. package/dist/chunk-E3EU5FZO.cjs.map +1 -0
  29. package/dist/chunk-F3JWBINJ.js +1054 -0
  30. package/dist/chunk-F3JWBINJ.js.map +1 -0
  31. package/dist/chunk-FJYLIGJX.js +1940 -0
  32. package/dist/chunk-FJYLIGJX.js.map +1 -0
  33. package/dist/chunk-JSCDE774.cjs +729 -0
  34. package/dist/chunk-JSCDE774.cjs.map +1 -0
  35. package/dist/chunk-LWECRCW2.cjs +2412 -0
  36. package/dist/chunk-LWECRCW2.cjs.map +1 -0
  37. package/dist/chunk-MLBMYKCG.js +6379 -0
  38. package/dist/chunk-MLBMYKCG.js.map +1 -0
  39. package/dist/chunk-OX6QXFMV.cjs +3874 -0
  40. package/dist/chunk-OX6QXFMV.cjs.map +1 -0
  41. package/dist/chunk-PHV2DKRS.cjs +1072 -0
  42. package/dist/chunk-PHV2DKRS.cjs.map +1 -0
  43. package/dist/chunk-PL7TAYKI.js +4056 -0
  44. package/dist/chunk-PL7TAYKI.js.map +1 -0
  45. package/dist/chunk-PR647I7R.js +1898 -0
  46. package/dist/chunk-PR647I7R.js.map +1 -0
  47. package/dist/chunk-QERHVCHC.cjs +2960 -0
  48. package/dist/chunk-QERHVCHC.cjs.map +1 -0
  49. package/dist/chunk-XEG44RF6.cjs +1514 -0
  50. package/dist/chunk-XEG44RF6.cjs.map +1 -0
  51. package/dist/chunk-XMWVME2W.js +2377 -0
  52. package/dist/chunk-XMWVME2W.js.map +1 -0
  53. package/dist/chunk-ZB75FESB.cjs +1979 -0
  54. package/dist/chunk-ZB75FESB.cjs.map +1 -0
  55. package/dist/chunk-ZLW62TJG.cjs +4061 -0
  56. package/dist/chunk-ZLW62TJG.cjs.map +1 -0
  57. package/dist/chunk-ZXKBDFP3.js +4235 -0
  58. package/dist/chunk-ZXKBDFP3.js.map +1 -0
  59. package/dist/core/index.cjs +204 -0
  60. package/dist/core/index.cjs.map +1 -0
  61. package/dist/core/index.d.cts +2 -0
  62. package/dist/core/index.d.ts +2 -0
  63. package/dist/core/index.js +3 -0
  64. package/dist/core/index.js.map +1 -0
  65. package/dist/dataframe/index.cjs +22 -0
  66. package/dist/dataframe/index.cjs.map +1 -0
  67. package/dist/dataframe/index.d.cts +3 -0
  68. package/dist/dataframe/index.d.ts +3 -0
  69. package/dist/dataframe/index.js +5 -0
  70. package/dist/dataframe/index.js.map +1 -0
  71. package/dist/datasets/index.cjs +134 -0
  72. package/dist/datasets/index.cjs.map +1 -0
  73. package/dist/datasets/index.d.cts +3 -0
  74. package/dist/datasets/index.d.ts +3 -0
  75. package/dist/datasets/index.js +5 -0
  76. package/dist/datasets/index.js.map +1 -0
  77. package/dist/index-74AB8Cyh.d.cts +1126 -0
  78. package/dist/index-9oQx1HgV.d.cts +1180 -0
  79. package/dist/index-BJY2SI4i.d.ts +483 -0
  80. package/dist/index-BWGhrDlr.d.ts +733 -0
  81. package/dist/index-B_DK4FKY.d.cts +242 -0
  82. package/dist/index-BbA2Gxfl.d.ts +456 -0
  83. package/dist/index-BgHYAoSS.d.cts +837 -0
  84. package/dist/index-BndMbqsM.d.ts +1439 -0
  85. package/dist/index-C1mfVYoo.d.ts +2517 -0
  86. package/dist/index-CCvlwAmL.d.cts +809 -0
  87. package/dist/index-CDw5CnOU.d.ts +785 -0
  88. package/dist/index-Cn3SdB0O.d.ts +1126 -0
  89. package/dist/index-CrqLlS-a.d.ts +776 -0
  90. package/dist/index-D61yaSMY.d.cts +483 -0
  91. package/dist/index-D9Loo1_A.d.cts +2517 -0
  92. package/dist/index-DIT_OO9C.d.cts +785 -0
  93. package/dist/index-DIp_RrRt.d.ts +242 -0
  94. package/dist/index-DbultU6X.d.cts +1427 -0
  95. package/dist/index-DmEg_LCm.d.cts +776 -0
  96. package/dist/index-DoPWVxPo.d.cts +1439 -0
  97. package/dist/index-DuCxd-8d.d.ts +837 -0
  98. package/dist/index-Dx42TZaY.d.ts +809 -0
  99. package/dist/index-DyZ4QQf5.d.cts +456 -0
  100. package/dist/index-GFAVyOWO.d.ts +1427 -0
  101. package/dist/index-WHQLn0e8.d.cts +733 -0
  102. package/dist/index-ZtI1Iy4L.d.ts +1180 -0
  103. package/dist/index-eJgeni9c.d.cts +1911 -0
  104. package/dist/index-tk4lSYod.d.ts +1911 -0
  105. package/dist/index.cjs +72 -0
  106. package/dist/index.cjs.map +1 -0
  107. package/dist/index.d.cts +17 -0
  108. package/dist/index.d.ts +17 -0
  109. package/dist/index.js +15 -0
  110. package/dist/index.js.map +1 -0
  111. package/dist/linalg/index.cjs +86 -0
  112. package/dist/linalg/index.cjs.map +1 -0
  113. package/dist/linalg/index.d.cts +3 -0
  114. package/dist/linalg/index.d.ts +3 -0
  115. package/dist/linalg/index.js +5 -0
  116. package/dist/linalg/index.js.map +1 -0
  117. package/dist/metrics/index.cjs +158 -0
  118. package/dist/metrics/index.cjs.map +1 -0
  119. package/dist/metrics/index.d.cts +3 -0
  120. package/dist/metrics/index.d.ts +3 -0
  121. package/dist/metrics/index.js +5 -0
  122. package/dist/metrics/index.js.map +1 -0
  123. package/dist/ml/index.cjs +87 -0
  124. package/dist/ml/index.cjs.map +1 -0
  125. package/dist/ml/index.d.cts +3 -0
  126. package/dist/ml/index.d.ts +3 -0
  127. package/dist/ml/index.js +6 -0
  128. package/dist/ml/index.js.map +1 -0
  129. package/dist/ndarray/index.cjs +501 -0
  130. package/dist/ndarray/index.cjs.map +1 -0
  131. package/dist/ndarray/index.d.cts +5 -0
  132. package/dist/ndarray/index.d.ts +5 -0
  133. package/dist/ndarray/index.js +4 -0
  134. package/dist/ndarray/index.js.map +1 -0
  135. package/dist/nn/index.cjs +142 -0
  136. package/dist/nn/index.cjs.map +1 -0
  137. package/dist/nn/index.d.cts +6 -0
  138. package/dist/nn/index.d.ts +6 -0
  139. package/dist/nn/index.js +5 -0
  140. package/dist/nn/index.js.map +1 -0
  141. package/dist/optim/index.cjs +77 -0
  142. package/dist/optim/index.cjs.map +1 -0
  143. package/dist/optim/index.d.cts +4 -0
  144. package/dist/optim/index.d.ts +4 -0
  145. package/dist/optim/index.js +4 -0
  146. package/dist/optim/index.js.map +1 -0
  147. package/dist/plot/index.cjs +114 -0
  148. package/dist/plot/index.cjs.map +1 -0
  149. package/dist/plot/index.d.cts +6 -0
  150. package/dist/plot/index.d.ts +6 -0
  151. package/dist/plot/index.js +5 -0
  152. package/dist/plot/index.js.map +1 -0
  153. package/dist/preprocess/index.cjs +82 -0
  154. package/dist/preprocess/index.cjs.map +1 -0
  155. package/dist/preprocess/index.d.cts +4 -0
  156. package/dist/preprocess/index.d.ts +4 -0
  157. package/dist/preprocess/index.js +5 -0
  158. package/dist/preprocess/index.js.map +1 -0
  159. package/dist/random/index.cjs +74 -0
  160. package/dist/random/index.cjs.map +1 -0
  161. package/dist/random/index.d.cts +3 -0
  162. package/dist/random/index.d.ts +3 -0
  163. package/dist/random/index.js +5 -0
  164. package/dist/random/index.js.map +1 -0
  165. package/dist/stats/index.cjs +142 -0
  166. package/dist/stats/index.cjs.map +1 -0
  167. package/dist/stats/index.d.cts +3 -0
  168. package/dist/stats/index.d.ts +3 -0
  169. package/dist/stats/index.js +5 -0
  170. package/dist/stats/index.js.map +1 -0
  171. package/dist/tensor-B96jjJLQ.d.cts +205 -0
  172. package/dist/tensor-B96jjJLQ.d.ts +205 -0
  173. package/package.json +226 -0
@@ -0,0 +1,1439 @@
1
+ import { A as AnyTensor } from './index-eJgeni9c.cjs';
2
+ import { D as DType, a as Device, A as Axis } from './tensor-B96jjJLQ.cjs';
3
+ import { G as GradTensor } from './index-B_DK4FKY.cjs';
4
+ import { T as Tensor } from './Tensor-BQLk1ltW.cjs';
5
+
6
+ type StateEntry = {
7
+ data: Array<number | string | bigint>;
8
+ dtype: DType;
9
+ shape: number[];
10
+ };
11
+ /**
12
+ * Hook function called before the forward pass.
13
+ *
14
+ * @param module - The module being called
15
+ * @param inputs - The input tensors to the forward pass
16
+ * @returns Modified inputs array, or undefined to keep original inputs
17
+ */
18
+ type ForwardPreHook = (module: Module, inputs: AnyTensor[]) => AnyTensor[] | undefined;
19
+ /**
20
+ * Hook function called after the forward pass.
21
+ *
22
+ * @param module - The module being called
23
+ * @param inputs - The input tensors to the forward pass
24
+ * @param output - The output tensor from the forward pass
25
+ * @returns Modified output tensor, or undefined to keep original output
26
+ */
27
+ type ForwardHook = (module: Module, inputs: AnyTensor[], output: AnyTensor) => AnyTensor | undefined;
28
+ /**
29
+ * Base class for all neural network modules.
30
+ *
31
+ * All models should subclass this class. Modules can contain other modules,
32
+ * allowing them to be nested in a tree structure.
33
+ *
34
+ * This is analogous to PyTorch's nn.Module.
35
+ *
36
+ * @example
37
+ * ```ts
38
+ * import { Module, Linear, ReLU } from 'deepbox/nn';
39
+ * import type { Tensor } from 'deepbox/ndarray';
40
+ *
41
+ * class MyModel extends Module {
42
+ * private fc1: Linear;
43
+ * private relu: ReLU;
44
+ * private fc2: Linear;
45
+ *
46
+ * constructor() {
47
+ * super();
48
+ * this.fc1 = new Linear(10, 5);
49
+ * this.relu = new ReLU();
50
+ * this.fc2 = new Linear(5, 2);
51
+ * this.registerModule('fc1', this.fc1);
52
+ * this.registerModule('relu', this.relu);
53
+ * this.registerModule('fc2', this.fc2);
54
+ * }
55
+ *
56
+ * forward(x: Tensor): Tensor {
57
+ * let out = this.fc1.forward(x);
58
+ * out = this.relu.forward(out);
59
+ * out = this.fc2.forward(out);
60
+ * return out;
61
+ * }
62
+ * }
63
+ * ```
64
+ *
65
+ * References:
66
+ * - PyTorch nn.Module: https://pytorch.org/docs/stable/generated/torch.nn.Module.html
67
+ *
68
+ * @category Neural Networks
69
+ */
70
+ declare abstract class Module {
71
+ /** Child modules registered to this module - stores nested layers/modules */
72
+ protected _modules: Map<string, Module>;
73
+ /** Parameters of this module - trainable tensors (weights, biases) wrapped as GradTensor */
74
+ protected _parameters: Map<string, GradTensor>;
75
+ /** Buffers (non-trainable tensors) of this module - e.g., running stats in BatchNorm */
76
+ protected _buffers: Map<string, Tensor>;
77
+ /** Training mode flag - affects behavior of layers like Dropout and BatchNorm */
78
+ protected _training: boolean;
79
+ /** Forward pre-hooks registered on this module */
80
+ private _forwardPreHooks;
81
+ /** Forward hooks registered on this module */
82
+ private _forwardHooks;
83
+ /** Incrementing hook id */
84
+ private _nextHookId;
85
+ /**
86
+ * Forward pass of the module.
87
+ *
88
+ * Should be overridden by all subclasses. Accepts either regular Tensors
89
+ * or GradTensors for automatic differentiation support.
90
+ *
91
+ * @param inputs - Input tensors (Tensor or GradTensor)
92
+ * @returns Output tensor (Tensor or GradTensor depending on input and layer type)
93
+ *
94
+ * @example
95
+ * ```ts
96
+ * // Using with regular Tensor
97
+ * const output = model.forward(inputTensor);
98
+ *
99
+ * // Using with GradTensor for training
100
+ * const gradOutput = model.forward(gradInput);
101
+ * gradOutput.backward();
102
+ * ```
103
+ */
104
+ abstract forward(...inputs: AnyTensor[]): AnyTensor;
105
+ /**
106
+ * Makes the module callable (allows using `module(x)` instead of `module.forward(x)`).
107
+ *
108
+ * @param inputs - Input tensors (Tensor or GradTensor)
109
+ * @returns Output tensor
110
+ */
111
+ call(...inputs: AnyTensor[]): AnyTensor;
112
+ /**
113
+ * Register a child module.
114
+ *
115
+ * @param name - Name of the module
116
+ * @param module - The module to register
117
+ */
118
+ protected registerModule(name: string, module: Module): void;
119
+ /**
120
+ * Register a parameter (trainable tensor).
121
+ *
122
+ * Parameters must be GradTensor instances with requiresGrad=true for
123
+ * proper gradient computation during backpropagation.
124
+ *
125
+ * @param name - Name of the parameter
126
+ * @param param - The parameter tensor (must be GradTensor)
127
+ */
128
+ protected registerParameter(name: string, param: GradTensor): void;
129
+ /**
130
+ * Register a buffer (non-trainable tensor).
131
+ *
132
+ * Buffers are typically used for running statistics in batch normalization.
133
+ *
134
+ * @param name - Name of the buffer
135
+ * @param buffer - The buffer tensor
136
+ */
137
+ protected registerBuffer(name: string, buffer: Tensor): void;
138
+ /**
139
+ * Get all parameters of this module and its children.
140
+ *
141
+ * Returns GradTensor instances that are compatible with optimizers.
142
+ * This enables direct usage with optimizer constructors:
143
+ * ```ts
144
+ * const optimizer = new Adam(model.parameters());
145
+ * ```
146
+ *
147
+ * @param recurse - Whether to include parameters of child modules
148
+ * @returns Iterator of GradTensor parameters
149
+ */
150
+ parameters(recurse?: boolean): Generator<GradTensor>;
151
+ /**
152
+ * Get all named parameters of this module and its children.
153
+ *
154
+ * @param prefix - Prefix for parameter names
155
+ * @param recurse - Whether to include parameters of child modules
156
+ * @returns Iterator of [name, parameter] pairs
157
+ */
158
+ namedParameters(prefix?: string, recurse?: boolean): Generator<[string, GradTensor]>;
159
+ /**
160
+ * Get all child modules.
161
+ *
162
+ * @param recurse - Whether to include nested child modules
163
+ * @returns Iterator of modules
164
+ */
165
+ modules(recurse?: boolean): Generator<Module>;
166
+ /**
167
+ * Get all named child modules.
168
+ *
169
+ * @param prefix - Prefix for module names
170
+ * @param recurse - Whether to include nested child modules
171
+ * @returns Iterator of [name, module] pairs
172
+ */
173
+ namedModules(prefix?: string, recurse?: boolean): Generator<[string, Module]>;
174
+ /**
175
+ * Set the module in training mode.
176
+ *
177
+ * This affects certain layers like Dropout and BatchNorm.
178
+ *
179
+ * @param mode - Training mode (true) or evaluation mode (false)
180
+ * @returns this
181
+ */
182
+ train(mode?: boolean): this;
183
+ /**
184
+ * Set the module in evaluation mode.
185
+ *
186
+ * This is equivalent to calling `train(false)`.
187
+ *
188
+ * @returns this
189
+ */
190
+ eval(): this;
191
+ /**
192
+ * Check if the module is in training mode.
193
+ *
194
+ * @returns true if in training mode
195
+ */
196
+ get training(): boolean;
197
+ /**
198
+ * Zero out the gradients of all parameters.
199
+ *
200
+ * Call this before each training iteration to prevent gradient accumulation
201
+ * from previous iterations.
202
+ *
203
+ * For parameters wrapped in GradTensor, this calls zeroGrad() on each.
204
+ * For regular Tensors, this is a no-op until they are converted to GradTensor.
205
+ *
206
+ * @example
207
+ * ```ts
208
+ * model.zeroGrad();
209
+ * const output = model.forward(input);
210
+ * // ... compute loss and backward
211
+ * optimizer.step();
212
+ * ```
213
+ */
214
+ zeroGrad(): void;
215
+ /**
216
+ * Get all buffers of this module and its children.
217
+ */
218
+ buffers(recurse?: boolean): Generator<Tensor>;
219
+ /**
220
+ * Get all named buffers of this module and its children.
221
+ */
222
+ namedBuffers(prefix?: string, recurse?: boolean): Generator<[string, Tensor]>;
223
+ /**
224
+ * Freeze specific parameters by name (or all if none provided).
225
+ *
226
+ * **⚠️ IMPORTANT**: This method creates new GradTensor instances with updated
227
+ * `requiresGrad` flags. Any external references to the old parameter objects
228
+ * will become stale. If you're using an optimizer that holds parameter references,
229
+ * you should recreate the optimizer after freezing/unfreezing parameters.
230
+ *
231
+ * @param names - Array of parameter names to freeze (e.g., ['fc1.weight']). If undefined, freezes all parameters.
232
+ * @param recurse - Whether to include parameters from child modules (default: true)
233
+ *
234
+ * @example
235
+ * ```ts
236
+ * const model = new MyModel();
237
+ * // Freeze only the first layer's weights
238
+ * model.freezeParameters(['fc1.weight']);
239
+ * // Note: Recreate optimizer after freezing
240
+ * const optimizer = new Adam(model.parameters());
241
+ * ```
242
+ */
243
+ freezeParameters(names?: string[], recurse?: boolean): void;
244
+ /**
245
+ * Unfreeze specific parameters by name (or all if none provided).
246
+ *
247
+ * **⚠️ IMPORTANT**: This method creates new GradTensor instances with updated
248
+ * `requiresGrad` flags. Any external references to the old parameter objects
249
+ * will become stale. If you're using an optimizer that holds parameter references,
250
+ * you should recreate the optimizer after freezing/unfreezing parameters.
251
+ *
252
+ * @param names - Array of parameter names to unfreeze (e.g., ['fc1.weight']). If undefined, unfreezes all parameters.
253
+ * @param recurse - Whether to include parameters from child modules (default: true)
254
+ *
255
+ * @example
256
+ * ```ts
257
+ * const model = new MyModel();
258
+ * model.freezeParameters(); // Freeze all
259
+ * model.unfreezeParameters(['fc2.weight']); // Unfreeze only fc2 weights
260
+ * // Note: Recreate optimizer after unfreezing
261
+ * const optimizer = new Adam(model.parameters());
262
+ * ```
263
+ */
264
+ unfreezeParameters(names?: string[], recurse?: boolean): void;
265
+ private setRequiresGradForNames;
266
+ private resolveModuleAndName;
267
+ private static setTensorDeviceMetadata;
268
+ /**
269
+ * Get the state dictionary of the module.
270
+ */
271
+ stateDict(): {
272
+ parameters: Record<string, StateEntry>;
273
+ buffers: Record<string, StateEntry>;
274
+ };
275
+ /**
276
+ * Load state dictionary into the module.
277
+ */
278
+ loadStateDict(stateDict: {
279
+ parameters?: Record<string, StateEntry>;
280
+ buffers?: Record<string, StateEntry>;
281
+ }): void;
282
+ /**
283
+ * Move module to a specific device.
284
+ *
285
+ * **⚠️ WARNING**: This is a metadata-only operation. It updates the device
286
+ * property on parameters and buffers but does NOT actually transfer data
287
+ * between devices. Actual device data transfer requires device-specific
288
+ * memory management which is not yet implemented.
289
+ *
290
+ * This method is provided for API compatibility and future extensibility.
291
+ * Currently, it only updates the `device` metadata field.
292
+ *
293
+ * @param device - Target device identifier (e.g., 'cpu', 'webgpu', 'wasm')
294
+ * @returns this module for method chaining
295
+ *
296
+ * @example
297
+ * ```ts
298
+ * const model = new Linear(10, 5);
299
+ * model.to('webgpu'); // Updates device metadata only
300
+ * ```
301
+ */
302
+ to(device: Device): this;
303
+ /**
304
+ * Apply a function to all modules recursively.
305
+ */
306
+ apply(fn: (module: Module) => void): this;
307
+ /**
308
+ * Register a forward pre-hook.
309
+ */
310
+ registerForwardPreHook(hook: ForwardPreHook): () => void;
311
+ /**
312
+ * Register a forward hook.
313
+ */
314
+ registerForwardHook(hook: ForwardHook): () => void;
315
+ /**
316
+ * Get string representation of the module.
317
+ *
318
+ * @returns Hierarchical string representation showing module structure
319
+ */
320
+ toString(): string;
321
+ }
322
+
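The hook and state-dictionary methods declared above carry no usage example in the JSDoc. A minimal sketch, using only signatures that appear in this file (`Linear` is declared further below; `tensor` and the `deepbox/nn` / `deepbox/ndarray` import paths follow the examples elsewhere in this file):

```ts
import { Linear } from 'deepbox/nn';
import { tensor } from 'deepbox/ndarray';

const layer = new Linear(4, 2);

// registerForwardHook returns a disposer that removes the hook again.
const removeHook = layer.registerForwardHook((module, _inputs, _output) => {
  console.log(`${module.constructor.name} ran a forward pass`);
  return undefined; // keep the original output
});

layer.forward(tensor([[1, 2, 3, 4]]));

// Round-trip parameters and buffers through the plain-data state dictionary.
const state = layer.stateDict();
layer.loadStateDict(state);

removeHook();
```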
323
+ /**
324
+ * Sequential container for stacking layers in a linear pipeline.
325
+ *
326
+ * **Purpose:**
327
+ * - Simplifies model construction by chaining layers sequentially
328
+ * - Automatically manages forward pass through all layers
329
+ * - Provides clean API for building feedforward networks
330
+ *
331
+ * **Behavior:**
332
+ * The output of each layer becomes the input to the next layer.
333
+ * Layers are executed in the order they were added.
334
+ *
335
+ * @example
336
+ * ```ts
337
+ * import { Sequential, Linear, ReLU, Dropout } from 'deepbox/nn';
338
+ * import { tensor } from 'deepbox/ndarray';
339
+ *
340
+ * // Create a simple feedforward network
341
+ * const model = new Sequential(
342
+ * new Linear(784, 256),
343
+ * new ReLU(),
344
+ * new Dropout(0.5),
345
+ * new Linear(256, 10)
346
+ * );
347
+ *
348
+ * const input = tensor(new Array(784).fill(0));
349
+ * const output = model.forward(input);
350
+ * ```
351
+ *
352
+ * @example
353
+ * ```ts
354
+ * // Access individual layers
355
+ * const model = new Sequential(
356
+ * new Linear(10, 5),
357
+ * new ReLU()
358
+ * );
359
+ *
360
+ * const firstLayer = model.getLayer(0); // Linear layer
361
+ * const layerCount = model.length; // 2
362
+ * ```
363
+ *
364
+ * References:
365
+ * - PyTorch Sequential: https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html
366
+ * - Keras Sequential: https://keras.io/guides/sequential_model/
367
+ *
368
+ * @category Neural Network Containers
369
+ */
370
+ declare class Sequential extends Module {
371
+ /** Array of layers in sequential order */
372
+ private readonly layers;
373
+ /**
374
+ * Create a new Sequential container.
375
+ *
376
+ * @param layers - Variable number of Module instances to stack sequentially
377
+ * @throws {InvalidParameterError} If no layers are provided
378
+ * @throws {DeepboxError} If a layer is undefined
379
+ */
380
+ constructor(...layers: Module[]);
381
+ /**
382
+ * Forward pass: sequentially apply all layers.
383
+ *
384
+ * The output of each layer becomes the input to the next layer.
385
+ *
386
+ * @param input - Input tensor (Tensor or GradTensor)
387
+ * @returns Output tensor after passing through all layers
388
+ * @throws {InvalidParameterError} If the input count is invalid or a layer returns multiple outputs
389
+ * @throws {DeepboxError} If a layer is undefined
390
+ */
391
+ forward(...inputs: AnyTensor[]): AnyTensor;
392
+ /**
393
+ * Get a layer by index.
394
+ *
395
+ * @param index - Zero-based index of the layer
396
+ * @returns The layer at the specified index
397
+ * @throws {IndexError} If index is out of bounds
398
+ * @throws {DeepboxError} If a layer is undefined
399
+ */
400
+ getLayer(index: number): Module;
401
+ /**
402
+ * Get the number of layers in the sequential container.
403
+ */
404
+ get length(): number;
405
+ /**
406
+ * Get string representation showing all layers.
407
+ *
408
+ * @returns Multi-line string with each layer on a separate line
409
+ */
410
+ toString(): string;
411
+ /**
412
+ * Iterate over all layers.
413
+ *
414
+ * @returns Iterator of layers
415
+ */
416
+ [Symbol.iterator](): Iterator<Module>;
417
+ }
418
+
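The Module docs above describe the `zeroGrad()` → forward → backward → `optimizer.step()` pattern and show `new Adam(model.parameters())`, but never in one place. The sketch below stitches those pieces together; the `deepbox/optim` import path for `Adam` is an assumption based on the `dist/optim` entry in the file list, since only the constructor call appears in the JSDoc above.

```ts
import { Sequential, Linear, ReLU } from 'deepbox/nn';
// Assumed import path for Adam; only `new Adam(model.parameters())` is shown above.
import { Adam } from 'deepbox/optim';

const model = new Sequential(
  new Linear(4, 8),
  new ReLU(),
  new Linear(8, 1)
);

// parameters() yields GradTensors that the optimizer can consume directly.
const optimizer = new Adam(model.parameters());

// One training step, following the pattern from the Module docs:
model.zeroGrad();
// const loss = ...;   // compute a GradTensor loss from model.forward(...)
// loss.backward();
optimizer.step();
```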
419
+ /**
420
+ * Applies the Rectified Linear Unit (ReLU) activation function element-wise.
421
+ *
422
+ * ReLU(x) = max(0, x)
423
+ *
424
+ * @category Neural Network Layers
425
+ */
426
+ declare class ReLU extends Module {
427
+ forward(input: GradTensor): GradTensor;
428
+ forward(input: Tensor): Tensor;
429
+ toString(): string;
430
+ }
431
+ /**
432
+ * Applies the Sigmoid activation function element-wise.
433
+ *
434
+ * Sigmoid(x) = 1 / (1 + exp(-x))
435
+ *
436
+ * @category Neural Network Layers
437
+ */
438
+ declare class Sigmoid extends Module {
439
+ forward(input: GradTensor): GradTensor;
440
+ forward(input: Tensor): Tensor;
441
+ toString(): string;
442
+ }
443
+ /**
444
+ * Applies the Hyperbolic Tangent (Tanh) activation function element-wise.
445
+ *
446
+ * Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
447
+ *
448
+ * @category Neural Network Layers
449
+ */
450
+ declare class Tanh extends Module {
451
+ forward(input: GradTensor): GradTensor;
452
+ forward(input: Tensor): Tensor;
453
+ toString(): string;
454
+ }
455
+ /**
456
+ * Applies the Leaky Rectified Linear Unit (Leaky ReLU) activation.
457
+ *
458
+ * LeakyReLU(x) = max(alpha * x, x)
459
+ *
460
+ * @category Neural Network Layers
461
+ */
462
+ declare class LeakyReLU extends Module {
463
+ private readonly alpha;
464
+ constructor(alpha?: number);
465
+ forward(input: GradTensor): GradTensor;
466
+ forward(input: Tensor): Tensor;
467
+ toString(): string;
468
+ }
469
+ /**
470
+ * Applies the Exponential Linear Unit (ELU) activation.
471
+ *
472
+ * ELU(x) = x if x > 0, else alpha * (exp(x) - 1)
473
+ *
474
+ * @category Neural Network Layers
475
+ */
476
+ declare class ELU extends Module {
477
+ private readonly alpha;
478
+ constructor(alpha?: number);
479
+ forward(input: GradTensor): GradTensor;
480
+ forward(input: Tensor): Tensor;
481
+ toString(): string;
482
+ }
483
+ /**
484
+ * Applies the Gaussian Error Linear Unit (GELU) activation.
485
+ *
486
+ * GELU(x) = x * Phi(x), where Phi is the CDF of the standard normal distribution
487
+ *
488
+ * @category Neural Network Layers
489
+ */
490
+ declare class GELU extends Module {
491
+ forward(input: GradTensor): GradTensor;
492
+ forward(input: Tensor): Tensor;
493
+ toString(): string;
494
+ }
495
+ /**
496
+ * Applies the Softmax activation function.
497
+ *
498
+ * Softmax(x_i) = exp(x_i) / sum(exp(x_j))
499
+ *
500
+ * @category Neural Network Layers
501
+ */
502
+ declare class Softmax extends Module {
503
+ private readonly axis;
504
+ constructor(axis?: Axis);
505
+ forward(input: GradTensor): GradTensor;
506
+ forward(input: Tensor): Tensor;
507
+ toString(): string;
508
+ }
509
+ /**
510
+ * Applies the Log Softmax activation function.
511
+ *
512
+ * LogSoftmax(x_i) = log(exp(x_i) / sum(exp(x_j)))
513
+ *
514
+ * @category Neural Network Layers
515
+ */
516
+ declare class LogSoftmax extends Module {
517
+ private readonly axis;
518
+ constructor(axis?: Axis);
519
+ forward(input: GradTensor): GradTensor;
520
+ forward(input: Tensor): Tensor;
521
+ toString(): string;
522
+ }
523
+ /**
524
+ * Applies the Softplus activation function.
525
+ *
526
+ * Softplus(x) = log(1 + exp(x))
527
+ *
528
+ * @category Neural Network Layers
529
+ */
530
+ declare class Softplus extends Module {
531
+ forward(input: GradTensor): GradTensor;
532
+ forward(input: Tensor): Tensor;
533
+ toString(): string;
534
+ }
535
+ /**
536
+ * Applies the Swish activation function (also known as SiLU).
537
+ *
538
+ * Swish(x) = x * sigmoid(x)
539
+ *
540
+ * @category Neural Network Layers
541
+ */
542
+ declare class Swish extends Module {
543
+ forward(input: GradTensor): GradTensor;
544
+ forward(input: Tensor): Tensor;
545
+ toString(): string;
546
+ }
547
+ /**
548
+ * Applies the Mish activation function.
549
+ *
550
+ * Mish(x) = x * tanh(softplus(x))
551
+ *
552
+ * @category Neural Network Layers
553
+ */
554
+ declare class Mish extends Module {
555
+ forward(input: GradTensor): GradTensor;
556
+ forward(input: Tensor): Tensor;
557
+ toString(): string;
558
+ }
559
+
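None of the activation classes above ship a usage example. A minimal sketch using the same imports as the other examples in this file (the default `Softmax` axis is left to the library):

```ts
import { ReLU, LeakyReLU, Softmax } from 'deepbox/nn';
import { tensor } from 'deepbox/ndarray';

const x = tensor([[-1.0, 0.0, 2.0]]);

const relu = new ReLU();
const activated = relu.forward(x);   // max(0, x), element-wise

const leaky = new LeakyReLU(0.1);    // alpha = 0.1
const leaked = leaky.forward(x);

const softmax = new Softmax();       // default axis
const probs = softmax.forward(x);
```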
560
+ /**
561
+ * Multi-Head Attention mechanism.
562
+ *
563
+ * Allows the model to jointly attend to information from different representation
564
+ * subspaces at different positions. This is the core building block of Transformers.
565
+ *
566
+ * **Mathematical Formulation**:
567
+ * ```
568
+ * Attention(Q, K, V) = softmax(Q * K^T / sqrt(d_k)) * V
569
+ * MultiHead(Q, K, V) = Concat(head_1, ..., head_h) * W_O
570
+ * where head_i = Attention(Q * W_Q^i, K * W_K^i, V * W_V^i)
571
+ * ```
572
+ *
573
+ * @example
574
+ * ```ts
575
+ * import { MultiheadAttention } from 'deepbox/nn';
576
+ * import { tensor } from 'deepbox/ndarray';
577
+ *
578
+ * const mha = new MultiheadAttention(512, 8);
579
+ * const x = tensor([[/* ... sequence data ... *\/]]);
580
+ * const output = mha.forward(x, x, x);
581
+ * ```
582
+ *
583
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html | PyTorch MultiheadAttention}
584
+ * @see Vaswani et al. (2017) "Attention Is All You Need"
585
+ */
586
+ declare class MultiheadAttention extends Module {
587
+ /** Embedding dimension */
588
+ private readonly embedDim;
589
+ /** Number of attention heads */
590
+ private readonly numHeads;
591
+ /** Dimension of each head */
592
+ private readonly headDim;
593
+ /** Scaling factor for dot product attention */
594
+ private readonly scale;
595
+ /** Whether to add bias to projections */
596
+ private readonly useBias;
597
+ /** Dropout probability applied to attention weights */
598
+ private readonly dropout;
599
+ /** Query projection weights (embedDim, embedDim) */
600
+ private wQ;
601
+ private bQ?;
602
+ /** Key projection weights (embedDim, embedDim) */
603
+ private wK;
604
+ private bK?;
605
+ /** Value projection weights (embedDim, embedDim) */
606
+ private wV;
607
+ private bV?;
608
+ /** Output projection weights (embedDim, embedDim) */
609
+ private wO;
610
+ private bO?;
611
+ /**
612
+ * Create a new MultiheadAttention layer.
613
+ *
614
+ * @param embedDim - Total dimension of the model (must be divisible by numHeads)
615
+ * @param numHeads - Number of parallel attention heads
616
+ * @param options - Configuration options
617
+ * @param options.bias - Whether to add bias to projections (default: true)
618
+ * @param options.dropout - Dropout probability applied to attention weights (default: 0.0)
619
+ */
620
+ constructor(embedDim: number, numHeads: number, options?: {
621
+ readonly bias?: boolean;
622
+ readonly dropout?: number;
623
+ });
624
+ /**
625
+ * Forward pass of multi-head attention.
626
+ *
627
+ * @param query - Query tensor of shape (batch, seqLen, embedDim)
628
+ * @param key - Key tensor of shape (batch, seqLen, embedDim)
629
+ * @param value - Value tensor of shape (batch, seqLen, embedDim)
630
+ * @returns Output tensor of same shape as query
631
+ */
632
+ forward(...inputs: AnyTensor[]): GradTensor;
633
+ toString(): string;
634
+ }
635
+ /**
636
+ * Transformer Encoder Layer.
637
+ *
638
+ * A single layer of the Transformer encoder, consisting of:
639
+ * 1. Multi-head self-attention
640
+ * 2. Add & Norm (residual connection + layer normalization)
641
+ * 3. Feed-forward network (FFN)
642
+ * 4. Add & Norm
643
+ *
644
+ * @example
645
+ * ```ts
646
+ * import { TransformerEncoderLayer } from 'deepbox/nn';
647
+ * import { tensor } from 'deepbox/ndarray';
648
+ *
649
+ * const layer = new TransformerEncoderLayer(512, 8, 2048);
650
+ * const x = tensor([[/* sequence data *\/]]);
651
+ * const output = layer.forward(x);
652
+ * ```
653
+ *
654
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html | PyTorch TransformerEncoderLayer}
655
+ */
656
+ declare class TransformerEncoderLayer extends Module {
657
+ private readonly dModel;
658
+ private readonly nHead;
659
+ private readonly dFF;
660
+ private readonly selfAttn;
661
+ private readonly linear1;
662
+ private readonly linear2;
663
+ private readonly norm1;
664
+ private readonly norm2;
665
+ private readonly dropout;
666
+ private readonly dropout1;
667
+ private readonly dropout2;
668
+ private readonly dropout3;
669
+ constructor(dModel: number, nHead: number, dFF: number, options?: {
670
+ readonly dropout?: number;
671
+ readonly eps?: number;
672
+ });
673
+ /**
674
+ * Forward pass of the Transformer encoder layer.
675
+ *
676
+ * @param src - Source sequence of shape (batch, seqLen, dModel)
677
+ * @returns Output of same shape as input
678
+ */
679
+ forward(src: AnyTensor): GradTensor;
680
+ toString(): string;
681
+ }
682
+
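The attention examples above elide the actual sequence data. Here is a concrete, hedged sketch with small shapes; it assumes `tensor` accepts the nested-array `(batch, seqLen, embedDim)` layout described in the parameter docs above.

```ts
import { MultiheadAttention, TransformerEncoderLayer } from 'deepbox/nn';
import { tensor } from 'deepbox/ndarray';

// One batch of 4 tokens with embedDim = 8 (divisible by numHeads = 2).
const seq = tensor([Array.from({ length: 4 }, () => new Array(8).fill(0.5))]);

const mha = new MultiheadAttention(8, 2, { dropout: 0 });
const attended = mha.forward(seq, seq, seq);   // self-attention: query = key = value

const encoder = new TransformerEncoderLayer(8, 2, 16);
const encoded = encoder.forward(seq);          // same (batch, seqLen, dModel) shape
```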
683
+ /**
684
+ * 1D Convolutional Layer.
685
+ *
686
+ * Applies a 1D convolution over an input signal composed of several input planes.
687
+ *
688
+ * @example
689
+ * ```ts
690
+ * import { Conv1d } from 'deepbox/nn';
691
+ *
692
+ * const conv = new Conv1d(16, 33, 3); // in_channels=16, out_channels=33, kernel_size=3
693
+ * ```
694
+ *
695
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html | PyTorch Conv1d}
696
+ */
697
+ declare class Conv1d extends Module {
698
+ private readonly inChannels;
699
+ private readonly outChannels;
700
+ private readonly kernelSize;
701
+ private readonly stride;
702
+ private readonly padding;
703
+ private readonly bias;
704
+ private weight_?;
705
+ private bias_?;
706
+ constructor(inChannels: number, outChannels: number, kernelSize: number, options?: {
707
+ readonly stride?: number;
708
+ readonly padding?: number;
709
+ readonly bias?: boolean;
710
+ });
711
+ private initializeParameters;
712
+ forward(x: AnyTensor): GradTensor;
713
+ get weight(): GradTensor;
714
+ }
715
+ /**
716
+ * 2D Convolutional Layer.
717
+ *
718
+ * Applies a 2D convolution over an input signal composed of several input planes.
719
+ *
720
+ * @example
721
+ * ```ts
722
+ * import { Conv2d } from 'deepbox/nn';
723
+ *
724
+ * const conv = new Conv2d(3, 64, 3); // RGB to 64 channels, 3x3 kernel
725
+ * ```
726
+ *
727
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html | PyTorch Conv2d}
728
+ */
729
+ declare class Conv2d extends Module {
730
+ private readonly inChannels;
731
+ private readonly outChannels;
732
+ private readonly kernelSize;
733
+ private readonly stride;
734
+ private readonly padding;
735
+ private readonly useBias;
736
+ private weight_?;
737
+ private bias_?;
738
+ constructor(inChannels: number, outChannels: number, kernelSize: number | [number, number], options?: {
739
+ readonly stride?: number | [number, number];
740
+ readonly padding?: number | [number, number];
741
+ readonly bias?: boolean;
742
+ });
743
+ private initializeParameters;
744
+ forward(x: AnyTensor): GradTensor;
745
+ get weight(): GradTensor;
746
+ }
747
+ /**
748
+ * 2D Max Pooling Layer.
749
+ *
750
+ * Applies a 2D max pooling over an input signal.
751
+ *
752
+ * @example
753
+ * ```ts
754
+ * import { MaxPool2d } from 'deepbox/nn';
755
+ *
756
+ * const pool = new MaxPool2d(2); // 2x2 pooling
757
+ * ```
758
+ *
759
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html | PyTorch MaxPool2d}
760
+ */
761
+ declare class MaxPool2d extends Module {
762
+ private readonly kernelSizeValue;
763
+ private readonly stride;
764
+ private readonly padding;
765
+ constructor(kernelSize: number | [number, number], options?: {
766
+ readonly stride?: number | [number, number];
767
+ readonly padding?: number | [number, number];
768
+ });
769
+ forward(x: AnyTensor): GradTensor;
770
+ }
771
+ /**
772
+ * 2D Average Pooling Layer.
773
+ *
774
+ * Applies a 2D average pooling over an input signal.
775
+ *
776
+ * @example
777
+ * ```ts
778
+ * import { AvgPool2d } from 'deepbox/nn';
779
+ *
780
+ * const pool = new AvgPool2d(2); // 2x2 pooling
781
+ * ```
782
+ *
783
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html | PyTorch AvgPool2d}
784
+ */
785
+ declare class AvgPool2d extends Module {
786
+ private readonly kernelSizeValue;
787
+ private readonly stride;
788
+ private readonly padding;
789
+ constructor(kernelSize: number | [number, number], options?: {
790
+ readonly stride?: number | [number, number];
791
+ readonly padding?: number | [number, number];
792
+ });
793
+ forward(x: AnyTensor): GradTensor;
794
+ }
795
+
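The convolution and pooling layers above document their constructors but not the expected input layout. The sketch below assumes the PyTorch-style NCHW layout `(batch, channels, height, width)` suggested by the linked PyTorch docs; treat that shape as an assumption rather than a guarantee.

```ts
import { Conv2d, MaxPool2d } from 'deepbox/nn';
import { tensor } from 'deepbox/ndarray';

// A single 1-channel 4x4 "image": shape (1, 1, 4, 4) under the assumed NCHW layout.
const img = tensor([[[
  [1, 2, 3, 4],
  [5, 6, 7, 8],
  [9, 10, 11, 12],
  [13, 14, 15, 16],
]]]);

const conv = new Conv2d(1, 2, 3, { padding: 1 }); // 1 -> 2 channels, 3x3 kernel
const pool = new MaxPool2d(2);                    // 2x2 max pooling

const features = pool.forward(conv.forward(img));
```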
796
+ /**
797
+ * Applies Dropout regularization during training.
798
+ *
799
+ * **Mathematical Formulation:**
800
+ * During training:
801
+ * ```
802
+ * y = x * mask / (1 - p)
803
+ * ```
804
+ * where mask is a binary tensor with probability (1-p) of being 1.
805
+ *
806
+ * During evaluation:
807
+ * ```
808
+ * y = x
809
+ * ```
810
+ *
811
+ * **Purpose:**
812
+ * - Prevents overfitting by randomly zeroing elements during training
813
+ * - Forces the network to learn redundant representations
814
+ * - Improves generalization performance
815
+ *
816
+ * **Scaling:**
817
+ * The output is scaled by 1/(1-p) during training to maintain expected value.
818
+ * This is called "inverted dropout" and eliminates the need for scaling during inference.
819
+ *
820
+ * @example
821
+ * ```ts
822
+ * import { Dropout } from 'deepbox/nn';
823
+ * import { tensor } from 'deepbox/ndarray';
824
+ *
825
+ * const dropout = new Dropout(0.5); // Drop 50% of neurons
826
+ * const input = tensor([[1, 2, 3, 4]]);
827
+ *
828
+ * // Training mode: randomly zeros ~50% of elements
829
+ * dropout.train();
830
+ * const output = dropout.forward(input);
831
+ *
832
+ * // Evaluation mode: passes input unchanged
833
+ * dropout.eval();
834
+ * const output2 = dropout.forward(input); // Same as input
835
+ * ```
836
+ *
837
+ * References:
838
+ * - Dropout paper: https://jmlr.org/papers/v15/srivastava14a.html
839
+ * - PyTorch Dropout: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
840
+ *
841
+ * @category Neural Network Layers
842
+ */
843
+ declare class Dropout extends Module {
844
+ /** Probability of an element being zeroed (dropout rate) */
845
+ private readonly p;
846
+ /**
847
+ * Create a new Dropout layer.
848
+ *
849
+ * @param p - Probability of an element being zeroed (0 <= p < 1)
850
+ * @throws {InvalidParameterError} If p is not in valid range [0, 1)
851
+ */
852
+ constructor(p?: number);
853
+ /**
854
+ * Forward pass: apply dropout during training, identity during evaluation.
855
+ *
856
+ * @param input - Input tensor of any shape (Tensor or GradTensor)
857
+ * @returns Output tensor with same shape as input
858
+ */
859
+ forward(input: AnyTensor): GradTensor;
860
+ /**
861
+ * Get string representation of the layer.
862
+ *
863
+ * @returns String representation with dropout probability
864
+ */
865
+ toString(): string;
866
+ /**
867
+ * Get the dropout probability.
868
+ */
869
+ get dropoutRate(): number;
870
+ }
871
+
872
+ /**
873
+ * Applies a linear transformation to the incoming data: y = xA^T + b
874
+ *
875
+ * This is also known as a fully connected layer or dense layer.
876
+ *
877
+ * **Mathematical Formulation:**
878
+ * ```
879
+ * y = x * W^T + b
880
+ * ```
881
+ *
882
+ * Where:
883
+ * - x is the input tensor of shape (*, in_features)
884
+ * - W is the weight matrix of shape (out_features, in_features)
885
+ * - b is the bias vector of shape (out_features,)
886
+ * - y is the output tensor of shape (*, out_features)
887
+ *
888
+ * **Shape Conventions:**
889
+ * - Input: `(*, in_features)` where `*` means any number of leading dimensions
890
+ * - 1D: `(in_features)` → Output: `(out_features)`
891
+ * - 2D: `(batch, in_features)` → Output: `(batch, out_features)`
892
+ * - 3D: `(batch, seq_len, in_features)` → Output: `(batch, seq_len, out_features)`
893
+ * - The last dimension must equal `in_features`
894
+ * - All leading dimensions are preserved in the output
895
+ *
896
+ * **Parameters:**
897
+ * - `inFeatures`: Size of each input sample
898
+ * - `outFeatures`: Size of each output sample
899
+ * - `bias`: If true, adds a learnable bias to the output
900
+ *
901
+ * **Attributes:**
902
+ * - `weight`: Learnable weights of shape (out_features, in_features)
903
+ * - `bias`: Learnable bias of shape (out_features,) if bias=true
904
+ *
905
+ * **Initialization:**
906
+ * Uses Kaiming/He initialization: weights ~ N(0, sqrt(2/in_features))
907
+ * Biases are initialized to zeros
908
+ *
909
+ * @example
910
+ * ```ts
911
+ * import { Linear } from 'deepbox/nn';
912
+ * import { tensor } from 'deepbox/ndarray';
913
+ *
914
+ * // Create a linear layer with 20 input features and 30 output features
915
+ * const layer = new Linear(20, 30);
916
+ *
917
+ * // Forward pass
918
+ * const input = tensor([[1, 2, ..., 20]]); // shape: (1, 20)
919
+ * const output = layer.forward(input); // shape: (1, 30)
920
+ *
921
+ * // Without bias
922
+ * const layerNoBias = new Linear(10, 5, { bias: false });
923
+ * ```
924
+ *
925
+ * References:
926
+ * - PyTorch Linear: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
927
+ * - Xavier/Glorot initialization: http://proceedings.mlr.press/v9/glorot10a.html
928
+ *
929
+ * @category Neural Network Layers
930
+ */
931
+ declare class Linear extends Module {
932
+ /** Weight matrix of shape (out_features, in_features) */
933
+ private weight;
934
+ private weightParam;
935
+ /** Bias vector of shape (out_features,) */
936
+ private bias?;
937
+ private biasParam?;
938
+ /** Number of input features */
939
+ private readonly inFeatures;
940
+ /** Number of output features */
941
+ private readonly outFeatures;
942
+ /** Whether this layer has a bias */
943
+ private readonly useBias;
944
+ /**
945
+ * Create a new Linear layer.
946
+ *
947
+ * @param inFeatures - Size of each input sample
948
+ * @param outFeatures - Size of each output sample
949
+ * @param options - Configuration options
950
+ * @param options.bias - If true, add learnable bias (default: true)
951
+ * @param options.dtype - Data type for weights (default: 'float32')
952
+ * @param options.device - Device to place tensors on (default: 'cpu')
953
+ */
954
+ constructor(inFeatures: number, outFeatures: number, options?: {
955
+ readonly bias?: boolean;
956
+ readonly dtype?: "float32" | "float64";
957
+ readonly device?: "cpu" | "webgpu" | "wasm";
958
+ });
959
+ /**
960
+ * Forward pass: compute y = x * W^T + b
961
+ *
962
+ * @param input - Input tensor of shape (*, in_features)
963
+ * @returns Output tensor of shape (*, out_features)
964
+ * @throws {ShapeError} If input shape is invalid
965
+ * @throws {DTypeError} If input dtype is unsupported
966
+ */
967
+ forward(input: GradTensor): GradTensor;
968
+ forward(input: Tensor): Tensor;
969
+ /**
970
+ * Get extra representation string for this layer.
971
+ *
972
+ * @returns String representation of layer parameters
973
+ */
974
+ toString(): string;
975
+ /**
976
+ * Get the weight matrix.
977
+ *
978
+ * @returns Weight tensor of shape (out_features, in_features)
979
+ */
980
+ getWeight(): Tensor;
981
+ /**
982
+ * Get the bias vector.
983
+ *
984
+ * @returns Bias tensor of shape (out_features,) or undefined if no bias
985
+ */
986
+ getBias(): Tensor | undefined;
987
+ /**
988
+ * Get the number of input features.
989
+ */
990
+ get inputSize(): number;
991
+ /**
992
+ * Get the number of output features.
993
+ */
994
+ get outputSize(): number;
995
+ }
996
+
997
+ /**
998
+ * Batch Normalization layer.
999
+ *
1000
+ * Normalizes the input over the batch dimension for faster and more stable training.
1001
+ *
1002
+ * **Formula**: y = (x - E[x]) / sqrt(Var[x] + eps) * gamma + beta
1003
+ *
1004
+ * During training, uses batch statistics. During evaluation, uses running statistics
1005
+ * unless `trackRunningStats=false`, in which case batch statistics are always used.
1006
+ *
1007
+ * @example
1008
+ * ```ts
1009
+ * import { BatchNorm1d } from 'deepbox/nn';
1010
+ * import { tensor } from 'deepbox/ndarray';
1011
+ *
1012
+ * const bn = new BatchNorm1d(3);
1013
+ * const x = tensor([[1, 2, 3], [4, 5, 6]]);
1014
+ * const y = bn.forward(x);
1015
+ * ```
1016
+ *
1017
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html | PyTorch BatchNorm1d}
1018
+ */
1019
+ declare class BatchNorm1d extends Module {
1020
+ private readonly numFeatures;
1021
+ private readonly eps;
1022
+ private readonly momentum;
1023
+ private readonly affine;
1024
+ private readonly trackRunningStats;
1025
+ private gamma?;
1026
+ private beta?;
1027
+ private runningMean;
1028
+ private runningVar;
1029
+ constructor(numFeatures: number, options?: {
1030
+ readonly eps?: number;
1031
+ readonly momentum?: number;
1032
+ readonly affine?: boolean;
1033
+ readonly trackRunningStats?: boolean;
1034
+ });
1035
+ forward(x: AnyTensor): GradTensor;
1036
+ toString(): string;
1037
+ }
1038
+ /**
1039
+ * Layer Normalization.
1040
+ *
1041
+ * Normalizes across the feature dimensions (trailing dimensions specified by `normalizedShape`)
1042
+ * for each sample independently. Unlike BatchNorm, LayerNorm works the same way during training
1043
+ * and evaluation.
1044
+ *
1045
+ * **Formula**: y = (x - E[x]) / sqrt(Var[x] + eps) * gamma + beta
1046
+ *
1047
+ * @example
1048
+ * ```ts
1049
+ * import { LayerNorm } from 'deepbox/nn';
1050
+ * import { tensor } from 'deepbox/ndarray';
1051
+ *
1052
+ * const ln = new LayerNorm([3]);
1053
+ * const x = tensor([[1, 2, 3]]);
1054
+ * const y = ln.forward(x);
1055
+ * ```
1056
+ *
1057
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html | PyTorch LayerNorm}
1058
+ */
1059
+ declare class LayerNorm extends Module {
1060
+ private readonly normalizedShape;
1061
+ private readonly eps;
1062
+ private readonly elementwiseAffine;
1063
+ private gamma?;
1064
+ private beta?;
1065
+ constructor(normalizedShape: number | readonly number[], options?: {
1066
+ readonly eps?: number;
1067
+ readonly elementwiseAffine?: boolean;
1068
+ });
1069
+ forward(x: AnyTensor): GradTensor;
1070
+ toString(): string;
1071
+ }
1072
+
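The BatchNorm1d docs above describe the train/eval distinction, but the example does not demonstrate it. A short sketch using the `train()`/`eval()` methods inherited from `Module`:

```ts
import { BatchNorm1d, LayerNorm } from 'deepbox/nn';
import { tensor } from 'deepbox/ndarray';

const batch = tensor([[1, 2, 3], [4, 5, 6]]); // (batch, numFeatures) with numFeatures = 3

const bn = new BatchNorm1d(3);
bn.train();                      // normalize with batch statistics, update running stats
const yTrain = bn.forward(batch);
bn.eval();                       // normalize with the tracked running statistics
const yEval = bn.forward(batch);

const ln = new LayerNorm([3]);   // LayerNorm behaves the same in train and eval
const yLn = ln.forward(batch);
```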
1073
+ /**
1074
+ * Simple RNN layer.
1075
+ *
1076
+ * Applies a simple recurrent neural network to an input sequence.
1077
+ *
1078
+ * **Formula**: h_t = tanh(W_ih * x_t + b_ih + W_hh * h_{t-1} + b_hh)
1079
+ *
1080
+ * @example
1081
+ * ```ts
1082
+ * import { RNN } from 'deepbox/nn';
1083
+ * import { tensor } from 'deepbox/ndarray';
1084
+ *
1085
+ * const rnn = new RNN(3, 20);
1086
+ * const x = tensor([[[1, 2, 3]]]); // (batch, seq_len, input_size)
1087
+ * const output = rnn.forward(x);
1088
+ * ```
1089
+ *
1090
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.RNN.html | PyTorch RNN}
1091
+ */
1092
+ declare class RNN extends Module {
1093
+ private readonly inputSize;
1094
+ private readonly hiddenSize;
1095
+ private readonly numLayers;
1096
+ private readonly nonlinearity;
1097
+ private readonly bias;
1098
+ private readonly batchFirst;
1099
+ private weightsIh;
1100
+ private weightsHh;
1101
+ private biasIh;
1102
+ private biasHh;
1103
+ constructor(inputSize: number, hiddenSize: number, options?: {
1104
+ readonly numLayers?: number;
1105
+ readonly nonlinearity?: "tanh" | "relu";
1106
+ readonly bias?: boolean;
1107
+ readonly batchFirst?: boolean;
1108
+ });
1109
+ private activation;
1110
+ private run;
1111
+ forward(...inputs: AnyTensor[]): Tensor;
1112
+ /**
1113
+ * Forward pass returning both output and hidden state.
1114
+ * Use this method when you need the hidden state.
1115
+ */
1116
+ forwardWithState(input: AnyTensor, hx?: AnyTensor): [Tensor, Tensor];
1117
+ toString(): string;
1118
+ }
1119
+ /**
1120
+ * LSTM (Long Short-Term Memory) layer.
1121
+ *
1122
+ * Applies a multi-layer LSTM to an input sequence.
1123
+ *
1124
+ * **Gates**:
1125
+ * - Input gate: i_t = σ(W_ii * x_t + b_ii + W_hi * h_{t-1} + b_hi)
1126
+ * - Forget gate: f_t = σ(W_if * x_t + b_if + W_hf * h_{t-1} + b_hf)
1127
+ * - Cell gate: g_t = tanh(W_ig * x_t + b_ig + W_hg * h_{t-1} + b_hg)
1128
+ * - Output gate: o_t = σ(W_io * x_t + b_io + W_ho * h_{t-1} + b_ho)
1129
+ * - Cell state: c_t = f_t * c_{t-1} + i_t * g_t
1130
+ * - Hidden state: h_t = o_t * tanh(c_t)
1131
+ *
1132
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html | PyTorch LSTM}
1133
+ */
1134
+ declare class LSTM extends Module {
1135
+ private readonly inputSize;
1136
+ private readonly hiddenSize;
1137
+ private readonly numLayers;
1138
+ private readonly bias;
1139
+ private readonly batchFirst;
1140
+ private weightsIh;
1141
+ private weightsHh;
1142
+ private biasIh;
1143
+ private biasHh;
1144
+ constructor(inputSize: number, hiddenSize: number, options?: {
1145
+ readonly numLayers?: number;
1146
+ readonly bias?: boolean;
1147
+ readonly batchFirst?: boolean;
1148
+ });
1149
+ private sigmoid;
1150
+ private run;
1151
+ forward(...inputs: AnyTensor[]): Tensor;
1152
+ /**
1153
+ * Forward pass returning output, hidden state, and cell state.
1154
+ * Use this method when you need the hidden/cell states.
1155
+ */
1156
+ forwardWithState(input: AnyTensor, hx?: AnyTensor, cx?: AnyTensor): [Tensor, [Tensor, Tensor]];
1157
+ toString(): string;
1158
+ }
1159
+ /**
1160
+ * GRU (Gated Recurrent Unit) layer.
1161
+ *
1162
+ * Applies a multi-layer GRU to an input sequence.
1163
+ *
1164
+ * **Gates**:
1165
+ * - Reset gate: r_t = σ(W_ir * x_t + b_ir + W_hr * h_{t-1} + b_hr)
1166
+ * - Update gate: z_t = σ(W_iz * x_t + b_iz + W_hz * h_{t-1} + b_hz)
1167
+ * - New gate: n_t = tanh(W_in * x_t + b_in + r_t * (W_hn * h_{t-1} + b_hn))
1168
+ * - Hidden: h_t = (1 - z_t) * n_t + z_t * h_{t-1}
1169
+ *
1170
+ * @see {@link https://pytorch.org/docs/stable/generated/torch.nn.GRU.html | PyTorch GRU}
1171
+ */
1172
+ declare class GRU extends Module {
1173
+ private readonly inputSize;
1174
+ private readonly hiddenSize;
1175
+ private readonly numLayers;
1176
+ private readonly bias;
1177
+ private readonly batchFirst;
1178
+ private weightsIh;
1179
+ private weightsHh;
1180
+ private biasIh;
1181
+ private biasHh;
1182
+ constructor(inputSize: number, hiddenSize: number, options?: {
1183
+ readonly numLayers?: number;
1184
+ readonly bias?: boolean;
1185
+ readonly batchFirst?: boolean;
1186
+ });
1187
+ private sigmoid;
1188
+ private run;
1189
+ forward(...inputs: AnyTensor[]): Tensor;
1190
+ /**
1191
+ * Forward pass returning both output and hidden state.
1192
+ * Use this method when you need the hidden state.
1193
+ */
1194
+ forwardWithState(input: AnyTensor, hx?: AnyTensor): [Tensor, Tensor];
1195
+ toString(): string;
1196
+ }
1197
+
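LSTM and GRU have no usage example above, and `forwardWithState` is only described in prose. A minimal sketch; `batchFirst` is passed explicitly because its default is not documented here, so the `(batch, seq_len, input_size)` shape is an assumption tied to that flag.

```ts
import { LSTM, GRU } from 'deepbox/nn';
import { tensor } from 'deepbox/ndarray';

// (batch = 1, seq_len = 2, input_size = 3) with batchFirst enabled.
const seq = tensor([[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]]);

const lstm = new LSTM(3, 4, { batchFirst: true });
const lstmOut = lstm.forward(seq);                     // output sequence only
const [outputs, [h, c]] = lstm.forwardWithState(seq);  // plus hidden and cell state

const gru = new GRU(3, 4, { batchFirst: true });
const [gruOut, hidden] = gru.forwardWithState(seq);    // output and hidden state
```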
1198
+ /**
1199
+ * Cross Entropy Loss.
1200
+ *
1201
+ * Computes the cross entropy loss between predictions and targets.
1202
+ * Commonly used for multi-class classification problems.
1203
+ *
1204
+ * Supports both integer class indices and one-hot encoded probabilities for targets.
1205
+ *
1206
+ * **Formula**: L = -mean(sum(target * log_softmax(input), dim=1))
1207
+ *
1208
+ * @param input - Predicted logits of shape (n_samples, n_classes)
1209
+ * @param target - True labels. Either:
1210
+ * - Class indices of shape (n_samples,)
1211
+ * - Probabilities/One-hot of shape (n_samples, n_classes)
1212
+ * @returns Scalar loss value (number for Tensor inputs, GradTensor for GradTensor inputs)
1213
+ *
1214
+ * @example
1215
+ * ```ts
1216
+ * import { crossEntropyLoss } from 'deepbox/nn';
1217
+ * import { tensor } from 'deepbox/ndarray';
1218
+ *
1219
+ * const pred = tensor([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]]);
1220
+ * const true_idx = tensor([0, 1]);
1221
+ * const loss = crossEntropyLoss(pred, true_idx);
1222
+ * ```
1223
+ */
1224
+ declare function crossEntropyLoss(input: Tensor, target: Tensor): number;
1225
+ declare function crossEntropyLoss(input: GradTensor, target: AnyTensor): GradTensor;
1226
+ /**
1227
+ * Binary Cross Entropy Loss with logits.
1228
+ *
1229
+ * Combines sigmoid activation and binary cross entropy loss for numerical stability.
1230
+ *
1231
+ * @param input - Predicted logits of shape (n_samples,) or (n_samples, 1)
1232
+ * @param target - True binary labels of same shape as input
1233
+ * @returns Scalar loss value (number for Tensor inputs, GradTensor for GradTensor inputs)
1234
+ */
1235
+ declare function binaryCrossEntropyWithLogitsLoss(input: Tensor, target: Tensor): number;
1236
+ declare function binaryCrossEntropyWithLogitsLoss(input: GradTensor, target: AnyTensor): GradTensor;
1237
+
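`binaryCrossEntropyWithLogitsLoss` has no example above; a minimal sketch using the plain-`Tensor` overload, which returns a number per the declaration:

```ts
import { binaryCrossEntropyWithLogitsLoss } from 'deepbox/nn';
import { tensor } from 'deepbox/ndarray';

// Raw scores (logits) and 0/1 targets of the same shape;
// the sigmoid is applied inside the loss for numerical stability.
const logits = tensor([2.0, -1.0, 0.5]);
const labels = tensor([1, 0, 1]);

const loss = binaryCrossEntropyWithLogitsLoss(logits, labels); // number
```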
1238
+ /**
1239
+ * Mean Squared Error (MSE) loss function.
1240
+ *
1241
+ * **Mathematical Formula:**
1242
+ * ```
1243
+ * MSE = mean((y_pred - y_true)^2)
1244
+ * ```
1245
+ *
1246
+ * **Use Cases:**
1247
+ * - Regression tasks
1248
+ * - Continuous value prediction
1249
+ * - Measuring distance between predictions and targets
1250
+ *
1251
+ * **Properties:**
1252
+ * - Always non-negative
1253
+ * - Penalizes large errors more heavily (quadratic)
1254
+ * - Differentiable everywhere
1255
+ *
1256
+ * @param predictions - Predicted values
1257
+ * @param targets - True target values
1258
+ * @param reduction - How to reduce the loss: 'mean', 'sum', or 'none'
1259
+ * @returns Scalar loss value (or tensor if reduction='none')
1260
+ *
1261
+ * @example
1262
+ * ```ts
1263
+ * import { mseLoss } from 'deepbox/nn/losses';
1264
+ * import { tensor } from 'deepbox/ndarray';
1265
+ *
1266
+ * const predictions = tensor([2.5, 0.0, 2.1, 7.8]);
1267
+ * const targets = tensor([3.0, -0.5, 2.0, 8.0]);
1268
+ * const loss = mseLoss(predictions, targets); // Scalar tensor
1269
+ * ```
1270
+ *
1271
+ * @category Loss Functions
1272
+ */
1273
+ declare function mseLoss(predictions: Tensor, targets: Tensor, reduction?: "mean" | "sum" | "none"): Tensor;
1274
+ /**
1275
+ * Mean Absolute Error (MAE) loss function, also known as L1 loss.
1276
+ *
1277
+ * **Mathematical Formula:**
1278
+ * ```
1279
+ * MAE = mean(|y_pred - y_true|)
1280
+ * ```
1281
+ *
1282
+ * **Use Cases:**
1283
+ * - Regression tasks where outliers should have less influence
1284
+ * - More robust to outliers than MSE
1285
+ *
1286
+ * **Properties:**
1287
+ * - Always non-negative
1288
+ * - Linear penalty for errors
1289
+ * - Less sensitive to outliers than MSE
1290
+ *
1291
+ * @param predictions - Predicted values
1292
+ * @param targets - True target values
1293
+ * @param reduction - How to reduce the loss: 'mean', 'sum', or 'none'
1294
+ * @returns Scalar loss value (or tensor if reduction='none')
1295
+ *
1296
+ * @category Loss Functions
1297
+ */
1298
+ declare function maeLoss(predictions: Tensor, targets: Tensor, reduction?: "mean" | "sum" | "none"): Tensor;
1299
+ /**
1300
+ * Binary Cross-Entropy (BCE) loss function.
1301
+ *
1302
+ * **Mathematical Formula:**
1303
+ * ```
1304
+ * BCE = -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))
1305
+ * ```
1306
+ *
1307
+ * **Use Cases:**
1308
+ * - Binary classification tasks
1309
+ * - Multi-label classification (independent binary decisions)
1310
+ * - Predictions should be probabilities in (0, 1)
1311
+ *
1312
+ * **Properties:**
1313
+ * - Requires predictions in range (0, 1) - use sigmoid activation
1314
+ * - Targets should be 0 or 1
1315
+ * - Numerically stable with epsilon for log
1316
+ *
1317
+ * @param predictions - Predicted probabilities (0 to 1)
1318
+ * @param targets - True binary labels (0 or 1)
1319
+ * @param reduction - How to reduce the loss: 'mean', 'sum', or 'none'
1320
+ * @returns Scalar loss value (or tensor if reduction='none')
1321
+ *
1322
+ * @category Loss Functions
1323
+ */
1324
+ declare function binaryCrossEntropyLoss(predictions: Tensor, targets: Tensor, reduction?: "mean" | "sum" | "none"): Tensor;
1325
+ /**
1326
+ * Root Mean Squared Error (RMSE) loss function.
1327
+ *
1328
+ * **Mathematical Formula:**
1329
+ * ```
1330
+ * RMSE = sqrt(mean((y_pred - y_true)^2))
1331
+ * ```
1332
+ *
1333
+ * **Use Cases:**
1334
+ * - Regression tasks
1335
+ * - When you want the error in the same units as the target
1336
+ * - More interpretable than MSE
1337
+ *
1338
+ * @param predictions - Predicted values
1339
+ * @param targets - True target values
1340
+ * @returns Scalar loss value
1341
+ *
1342
+ * @category Loss Functions
1343
+ */
1344
+ declare function rmseLoss(predictions: Tensor, targets: Tensor): Tensor;
1345
+ /**
1346
+ * Huber loss function - combines MSE and MAE.
1347
+ *
1348
+ * **Mathematical Formula:**
1349
+ * ```
1350
+ * Huber(a) = 0.5 * a^2 if |a| <= delta
1351
+ * = delta * (|a| - 0.5 * delta) otherwise
1352
+ * where a = y_pred - y_true
1353
+ * ```
1354
+ *
1355
+ * **Use Cases:**
1356
+ * - Regression with outliers
1357
+ * - Robust to outliers while maintaining MSE benefits for small errors
1358
+ *
1359
+ * **Properties:**
1360
+ * - Quadratic for small errors (like MSE)
1361
+ * - Linear for large errors (like MAE)
1362
+ * - Controlled by delta parameter
1363
+ *
1364
+ * @param predictions - Predicted values
1365
+ * @param targets - True target values
1366
+ * @param delta - Threshold where loss transitions from quadratic to linear
1367
+ * @param reduction - How to reduce the loss: 'mean', 'sum', or 'none'
1368
+ * @returns Scalar loss value (or tensor if reduction='none')
1369
+ *
1370
+ * @category Loss Functions
1371
+ */
1372
+ declare function huberLoss(predictions: Tensor, targets: Tensor, delta?: number, reduction?: "mean" | "sum" | "none"): Tensor;
1373
+
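The regression losses above share the optional `reduction` parameter, but only `mseLoss` shows an example. A short sketch of `huberLoss`, `maeLoss` with `reduction: 'none'`, and `rmseLoss`, reusing the values from the `mseLoss` example (imports follow the `crossEntropyLoss` example in this file):

```ts
import { huberLoss, maeLoss, rmseLoss } from 'deepbox/nn';
import { tensor } from 'deepbox/ndarray';

const pred = tensor([2.5, 0.0, 2.1, 7.8]);
const target = tensor([3.0, -0.5, 2.0, 8.0]);

const huber = huberLoss(pred, target, 1.0);       // delta = 1.0, default reduction
const perSample = maeLoss(pred, target, 'none');  // element-wise losses, no reduction
const rmse = rmseLoss(pred, target);              // error in the units of the target
```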
1374
+ type index_AvgPool2d = AvgPool2d;
1375
+ declare const index_AvgPool2d: typeof AvgPool2d;
1376
+ type index_BatchNorm1d = BatchNorm1d;
1377
+ declare const index_BatchNorm1d: typeof BatchNorm1d;
1378
+ type index_Conv1d = Conv1d;
1379
+ declare const index_Conv1d: typeof Conv1d;
1380
+ type index_Conv2d = Conv2d;
1381
+ declare const index_Conv2d: typeof Conv2d;
1382
+ type index_Dropout = Dropout;
1383
+ declare const index_Dropout: typeof Dropout;
1384
+ type index_ELU = ELU;
1385
+ declare const index_ELU: typeof ELU;
1386
+ type index_ForwardHook = ForwardHook;
1387
+ type index_ForwardPreHook = ForwardPreHook;
1388
+ type index_GELU = GELU;
1389
+ declare const index_GELU: typeof GELU;
1390
+ type index_GRU = GRU;
1391
+ declare const index_GRU: typeof GRU;
1392
+ type index_LSTM = LSTM;
1393
+ declare const index_LSTM: typeof LSTM;
1394
+ type index_LayerNorm = LayerNorm;
1395
+ declare const index_LayerNorm: typeof LayerNorm;
1396
+ type index_LeakyReLU = LeakyReLU;
1397
+ declare const index_LeakyReLU: typeof LeakyReLU;
1398
+ type index_Linear = Linear;
1399
+ declare const index_Linear: typeof Linear;
1400
+ type index_LogSoftmax = LogSoftmax;
1401
+ declare const index_LogSoftmax: typeof LogSoftmax;
1402
+ type index_MaxPool2d = MaxPool2d;
1403
+ declare const index_MaxPool2d: typeof MaxPool2d;
1404
+ type index_Mish = Mish;
1405
+ declare const index_Mish: typeof Mish;
1406
+ type index_Module = Module;
1407
+ declare const index_Module: typeof Module;
1408
+ type index_MultiheadAttention = MultiheadAttention;
1409
+ declare const index_MultiheadAttention: typeof MultiheadAttention;
1410
+ type index_RNN = RNN;
1411
+ declare const index_RNN: typeof RNN;
1412
+ type index_ReLU = ReLU;
1413
+ declare const index_ReLU: typeof ReLU;
1414
+ type index_Sequential = Sequential;
1415
+ declare const index_Sequential: typeof Sequential;
1416
+ type index_Sigmoid = Sigmoid;
1417
+ declare const index_Sigmoid: typeof Sigmoid;
1418
+ type index_Softmax = Softmax;
1419
+ declare const index_Softmax: typeof Softmax;
1420
+ type index_Softplus = Softplus;
1421
+ declare const index_Softplus: typeof Softplus;
1422
+ type index_Swish = Swish;
1423
+ declare const index_Swish: typeof Swish;
1424
+ type index_Tanh = Tanh;
1425
+ declare const index_Tanh: typeof Tanh;
1426
+ type index_TransformerEncoderLayer = TransformerEncoderLayer;
1427
+ declare const index_TransformerEncoderLayer: typeof TransformerEncoderLayer;
1428
+ declare const index_binaryCrossEntropyLoss: typeof binaryCrossEntropyLoss;
1429
+ declare const index_binaryCrossEntropyWithLogitsLoss: typeof binaryCrossEntropyWithLogitsLoss;
1430
+ declare const index_crossEntropyLoss: typeof crossEntropyLoss;
1431
+ declare const index_huberLoss: typeof huberLoss;
1432
+ declare const index_maeLoss: typeof maeLoss;
1433
+ declare const index_mseLoss: typeof mseLoss;
1434
+ declare const index_rmseLoss: typeof rmseLoss;
1435
+ declare namespace index {
1436
+ export { index_AvgPool2d as AvgPool2d, index_BatchNorm1d as BatchNorm1d, index_Conv1d as Conv1d, index_Conv2d as Conv2d, index_Dropout as Dropout, index_ELU as ELU, type index_ForwardHook as ForwardHook, type index_ForwardPreHook as ForwardPreHook, index_GELU as GELU, index_GRU as GRU, index_LSTM as LSTM, index_LayerNorm as LayerNorm, index_LeakyReLU as LeakyReLU, index_Linear as Linear, index_LogSoftmax as LogSoftmax, index_MaxPool2d as MaxPool2d, index_Mish as Mish, index_Module as Module, index_MultiheadAttention as MultiheadAttention, index_RNN as RNN, index_ReLU as ReLU, index_Sequential as Sequential, index_Sigmoid as Sigmoid, index_Softmax as Softmax, index_Softplus as Softplus, index_Swish as Swish, index_Tanh as Tanh, index_TransformerEncoderLayer as TransformerEncoderLayer, index_binaryCrossEntropyLoss as binaryCrossEntropyLoss, index_binaryCrossEntropyWithLogitsLoss as binaryCrossEntropyWithLogitsLoss, index_crossEntropyLoss as crossEntropyLoss, index_huberLoss as huberLoss, index_maeLoss as maeLoss, index_mseLoss as mseLoss, index_rmseLoss as rmseLoss };
1437
+ }
1438
+
1439
+ export { AvgPool2d as A, BatchNorm1d as B, Conv1d as C, Dropout as D, ELU as E, type ForwardHook as F, GELU as G, LeakyReLU as L, Mish as M, ReLU as R, Sequential as S, Tanh as T, LogSoftmax as a, Sigmoid as b, Softmax as c, Softplus as d, Swish as e, MultiheadAttention as f, TransformerEncoderLayer as g, Conv2d as h, index as i, MaxPool2d as j, Linear as k, LayerNorm as l, GRU as m, LSTM as n, RNN as o, binaryCrossEntropyLoss as p, binaryCrossEntropyWithLogitsLoss as q, crossEntropyLoss as r, huberLoss as s, maeLoss as t, mseLoss as u, rmseLoss as v, type ForwardPreHook as w, Module as x };