PyPI - mlpack - Versions diffs - 4.6.2__cp39-cp39-win_amd64.whl → 4.7.0__cp39-cp39-win_amd64.whl - Mend

mlpack 4.6.2__cp39-cp39-win_amd64.whl → 4.7.0__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (417) hide show

mlpack/include/mlpack/methods/ann/layer/multihead_attention_impl.hpp CHANGED Viewed

@@ -22,8 +22,8 @@
 namespace mlpack {
 template <typename MatType, typename RegularizerType>
-MultiheadAttentionType<MatType, RegularizerType>::
-MultiheadAttentionType() :
+MultiheadAttention<MatType, RegularizerType>::
+MultiheadAttention() :
     tgtSeqLen(0),
     srcSeqLen(0),
     embedDim(0),
@@ -35,11 +35,11 @@ MultiheadAttentionType() :
 }
 template <typename MatType, typename RegularizerType>
-MultiheadAttentionType<MatType, RegularizerType>::
-MultiheadAttentionType(
+MultiheadAttention<MatType, RegularizerType>::
+MultiheadAttention(
     const size_t tgtSeqLen,
     const size_t numHeads,
-    const MatType& attnmask,
+    const CubeType& attnmask,
     const MatType& keypaddingmask,
     const bool selfAttention) :
     tgtSeqLen(tgtSeqLen),
@@ -53,7 +53,7 @@ MultiheadAttentionType(
 }
 template <typename MatType, typename RegularizerType>
-void MultiheadAttentionType<MatType, RegularizerType>::SetWeights(
+void MultiheadAttention<MatType, RegularizerType>::SetWeights(
     const MatType& weightsIn)
 {
   MakeAlias(weights, weightsIn, (4 * embedDim + 4) * embedDim, 1);
@@ -70,7 +70,7 @@ void MultiheadAttentionType<MatType, RegularizerType>::SetWeights(
 }
 template <typename MatType, typename RegularizerType>
-void MultiheadAttentionType<MatType, RegularizerType>::
+void MultiheadAttention<MatType, RegularizerType>::
 Forward(const MatType& input, MatType& output)
 {
   if (input.n_rows != embedDim *
@@ -122,7 +122,7 @@ Forward(const MatType& input, MatType& output)
   // The scaling factor sqrt(headDim) is used to prevent exploding values
   // after dot product i.e. when qProj is multiplied with kProj.
-  qProj /= std::sqrt(headDim);
+  qProj /= ElemType(std::sqrt(headDim));
   // Split the qProj, kProj and vProj into n heads. That's what Multihead
   // Attention is.
@@ -131,40 +131,16 @@ Forward(const MatType& input, MatType& output)
   vProj.reshape(srcSeqLen, headDim, numHeads * batchSize);
   // Calculate the scores i.e. perform the matrix multiplication operation
-  // on qProj and kProj. Here score = qProj . kProj'
-  scores = MultiplyCube2Cube(qProj, kProj, false, true);
-  // Apply the attention mask if provided. The attention mask is used to black-
-  // out future sequences and generally used in Encoder-Decoder attention.
-  // The attention mask has elements -inf or 0.
-  // The shape of the attention mask : (tgtSeqLen, srcSeqLen).
-  if (!attnMask.is_empty())
-  {
-    if (attnMask.n_rows != tgtSeqLen || attnMask.n_cols != srcSeqLen)
-      Log::Fatal << "The size of the 'attn_mask' is not correct.\n";
-    scores.each_slice() += attnMask;
-  }
-  // Apply the key padding mask when provided. It blacks-out any particular
-  // word in the sequence.
-  // The key padding mask has elements -inf or 0
-  // The shape of keyPaddingMask : (1, srcSeqLen).
-  if (!keyPaddingMask.is_empty())
-  {
-    if (keyPaddingMask.n_rows != 1 || keyPaddingMask.n_cols != srcSeqLen)
-        Log::Fatal << "The size of the 'keyPaddingMask' is not correct.\n";
-    scores.each_slice() += repmat(keyPaddingMask, tgtSeqLen, 1);
-  }
+  // on qProj and kProj. Here score = kProj . qProj'
+  scores = MultiplyCube2Cube(kProj, qProj, false, true);
-  for (size_t i = 0; i < numHeads * batchSize; ++i)
-  {
-    softmax.Forward(scores.slice(i), scores.slice(i));
-  }
+  // Apply softmax to non-masked elements.
+  MaskedForwardSoftmax(scores, numHeads, batchSize, attnMask, keyPaddingMask);
   // Calculate the attention output i.e. matrix multiplication of softmax
   // output and vProj.
   // The shape of attnOutput : (tgtSeqLen, headDim, numHeads * batchSize).
-  attnOut = MultiplyCube2Cube(scores, vProj, false, false);
+  attnOut = MultiplyCube2Cube(scores, vProj, true, false);
   // Now we will concatenate output of all the heads i.e. we will reshape
   // attnOut to (tgtSeqLen, embedDim, batchSize).
@@ -173,13 +149,13 @@ Forward(const MatType& input, MatType& output)
   // The final output is the linear projection of attention output.
   for (size_t i = 0; i < batchSize; ++i)
   {
-    output.col(i) = vectorise(trans(attnOut.slice(i) * outWt
+    output.col(i) = vectorise(trans(attnOut.slice(i) * outWt.t()
         + repmat(outBias, tgtSeqLen, 1)));
   }
 }
 template <typename MatType, typename RegularizerType>
-void MultiheadAttentionType<MatType, RegularizerType>::
+void MultiheadAttention<MatType, RegularizerType>::
 Backward(const MatType& /* input */,
          const MatType& /* output */,
          const MatType& gy,
@@ -207,7 +183,7 @@ Backward(const MatType& /* input */,
   // The shape of gyTemp : (embedDim, tgtSeqLen, batchSize).
   // The shape of outWt : (embedDim, embedDim).
   // The shape of the result : (tgtSeqLen, embedDim, batchSize).
-  gyTemp = MultiplyCube2Mat(gyTemp, outWt, true, true);
+  gyTemp = MultiplyCube2Mat(gyTemp, outWt, true, false);
   // Now since the shape of gyTemp is (tgtSeqLen, embedDim, batchSize). We will
   // split it into n heads.
@@ -216,9 +192,9 @@ Backward(const MatType& /* input */,
   // Obtain backpropagated error of value.
   // Shape of gyTemp : (tgtSeqLen, headDim, numHeads * batchSize).
-  // Shape of scores : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
+  // Shape of scores : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
   // The shape of tmp : (srcSeqLen, headDim, numHeads * batchSize).
-  CubeType tmp = MultiplyCube2Cube(scores, gyTemp, true, false);
+  CubeType tmp = MultiplyCube2Cube(scores, gyTemp, false, false);
   // Concatenate results of all the attention heads.
   tmp.reshape(srcSeqLen, embedDim, batchSize);
@@ -239,8 +215,8 @@ Backward(const MatType& /* input */,
   // The shape of gyTemp : (tgtSeqLen, headDim, numHeads * batchSize).
   // The shape of vProj : (srcSeqLen, headDim, numHeads * batchSize).
-  // So the new shape of gyTemp : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
-  gyTemp = MultiplyCube2Cube(gyTemp, vProj, false, true);
+  // So the new shape of gyTemp : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
+  gyTemp = MultiplyCube2Cube(vProj, gyTemp, false, true);
   for (size_t i = 0; i < numHeads * batchSize; ++i)
   {
@@ -251,9 +227,9 @@ Backward(const MatType& /* input */,
   // Obtain backpropagated error of key.
   // The shape of qProj : (tgtSeqLen, headDim, numHeads * batchSize).
-  // The shape of gyTemp : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
+  // The shape of gyTemp : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
   // The new shape of tmp : (srcSeqLen, headDim, numHeads * batchSize).
-  tmp = MultiplyCube2Cube(gyTemp, qProj, true, false);
+  tmp = MultiplyCube2Cube(gyTemp, qProj, false, false);
   // Concatenate results of all the attention heads.
   tmp.reshape(srcSeqLen, embedDim, batchSize);
@@ -276,9 +252,10 @@ Backward(const MatType& /* input */,
   // Obtain backpropagated error of the query.
   // The shape of kProj : (srcSeqLen, headDim, numHeads * batchSize).
-  // The shape of gyTemp : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
+  // The shape of gyTemp : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
   // The new shape of tmp : (tgtSeqLen, headDim, numHeads * batchSize).
-  tmp = MultiplyCube2Cube(gyTemp, kProj) / std::sqrt(headDim);
+  tmp = MultiplyCube2Cube(gyTemp, kProj, true, false) /
+      ElemType(std::sqrt(headDim));
   // Concatenate results of all the attention heads.
   tmp.reshape(tgtSeqLen, embedDim, batchSize);
@@ -300,7 +277,7 @@ Backward(const MatType& /* input */,
 }
 template <typename MatType, typename RegularizerType>
-void MultiheadAttentionType<MatType, RegularizerType>::
+void MultiheadAttention<MatType, RegularizerType>::
 Gradient(const MatType& input,
          const MatType& error,
          MatType& gradient)
@@ -327,7 +304,7 @@ Gradient(const MatType& input,
   const size_t wtSize = embedDim * embedDim;
   // The shape of gradient : (4 * embedDim * embedDim + 4 * embedDim, 1).
-  gradient.set_size(arma::size(weights));
+  gradient.set_size(size(weights));
   const CubeType q, k, v;
   MakeAlias(const_cast<CubeType&>(q), input, embedDim, tgtSeqLen, batchSize,
@@ -356,22 +333,23 @@ Gradient(const MatType& input,
   // Gradient wrt. outWt, i.e. dL/d(outWt). We will take sum of gyTemp along
   // the slices and vectorise the output.
-  gradient.rows(3 * wtSize, 4 * wtSize - 1) = vectorise(sum(gyTemp, 2));
+  CubeType tmpCube = sum(gyTemp, 2);
+  gradient.rows(3 * wtSize, 4 * wtSize - 1) = vectorise(tmpCube.slice(0).t());
   // Partial derivative wrt. attnOut.
   // The shape of outWt : (embedDim, embedDim).
   // The shape of errorTemp : (embedDim, tgtSeqLen, batchSize).
   // The shape of gyTemp : (tgtSeqLen, embedDim, batchSize).
-  gyTemp = MultiplyCube2Mat(errorTemp, outWt, true, true);
+  gyTemp = MultiplyCube2Mat(errorTemp, outWt, true, false);
   // Now we will split it into n heads i.e. reshape it into a cube of shape
   // (tgtSeqLen, headDim, numHeads * batchSize).
   gyTemp.reshape(tgtSeqLen, headDim, numHeads * batchSize);
   // Shape of gyTemp : (tgtSeqLen, headDim, numHeads * batchSize).
-  // Shape of scores : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
+  // Shape of scores : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
   // The new shape of errorTemp : (srcSeqLen, headDim, numHeads * batchSize).
-  errorTemp = MultiplyCube2Cube(scores, gyTemp, true, false);
+  errorTemp = MultiplyCube2Cube(scores, gyTemp, false, false);
   // Now we will concatenate the propagated errors from all heads i.e. we
   // will reshape errorTemp to (srcSeqLen, embedDim, batchSize).
@@ -393,22 +371,23 @@ Gradient(const MatType& input,
   // Now, the shape of gyTemp : (tgtSeqLen, headDim, numHeads * batchSize).
   // The shape of vProj : (srcSeqLen, headDim, numHeads * batchSize).
-  // The new shape of errorTemp : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
-  errorTemp = MultiplyCube2Cube(gyTemp, vProj, false, true);
+  // The new shape of errorTemp : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
+  errorTemp = MultiplyCube2Cube(vProj, gyTemp, false, true);
   for (size_t i = 0; i < numHeads * batchSize; ++i)
   {
-    // The shape of scores : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
-    // The shape of errorTemp : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
+    // The shape of scores : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
+    // The shape of errorTemp : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
     // The new shape of errorTemp remains the same.
     softmax.Backward({} /* unused */, scores.slice(i), errorTemp.slice(i),
         errorTemp.slice(i));
   }
   // The shape of qProj : (tgtSeqLen, headDim, numHeads * batchSize).
-  // The shape of errorTemp : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
+  // The shape of errorTemp : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
   // The shape of gyTemp : (srcSeqLen, headDim, numHeads * batchSize).
-  gyTemp = MultiplyCube2Cube(errorTemp, qProj, true, false);
+  gyTemp = MultiplyCube2Cube(errorTemp, qProj, false, false);
   // We will now concatenate the propagated errors from all heads.
   // The new shape of gyTemp : (srcSeqLen, embedDim, batchSize).
@@ -429,13 +408,13 @@ Gradient(const MatType& input,
   gradient.rows(wtSize, 2 * wtSize - 1) = vectorise(sum(gyTemp, 2));
   // The shape of kProj : (srcSeqLen, headDim, numHeads * batchSize).
-  // The shape of errorTemp : (tgtSeqLen, srcSeqLen, numHeads * batchSize).
+  // The shape of errorTemp : (srcSeqLen, tgtSeqLen, numHeads * batchSize).
   // The shape of gyTemp : (tgtSeqLen, headDim, numHeads * batchSize).
-  gyTemp = MultiplyCube2Cube(errorTemp, kProj, false, false);
+  gyTemp = MultiplyCube2Cube(errorTemp, kProj, true, false);
   // Now, we will concatenate propagated error of all heads.
   gyTemp.reshape(tgtSeqLen, embedDim, batchSize);
-  gyTemp /= std::sqrt(headDim);
+  gyTemp /= ElemType(std::sqrt(headDim));
   // Gradient wrt. qBias, i.e. dL/d(qBias). We will take summation over all the
   // batches of gyTemp and over all the sequences.
@@ -457,7 +436,7 @@ Gradient(const MatType& input,
 template <typename MatType, typename RegularizerType>
 template <typename Archive>
-void MultiheadAttentionType<MatType, RegularizerType>::
+void MultiheadAttention<MatType, RegularizerType>::
 serialize(Archive& ar, const uint32_t /* version */)
 {
   ar(cereal::base_class<Layer<MatType>>(this));
@@ -492,6 +471,124 @@ serialize(Archive& ar, const uint32_t /* version */)
   }
 }
+template<typename MatType, typename RegularizerType>  // Column-wise softmax over every slice of `scores`, ignoring masked entries.
+void MultiheadAttention<MatType, RegularizerType>::MaskedForwardSoftmax(
+    CubeType& scores,  // In/out: overwritten in place with the softmax weights.
+    const size_t numHeads,  // Heads per batch element; slice index used below is i * numHeads + h.
+    const size_t batchSize,  // Number of batch elements iterated over in the masked branches.
+    const CubeType& attnMask,  // Read as attnMask(r, c, i); an entry < 0 marks position (r, c) as masked.
+    const MatType& keyPaddingMask)  // Read as keyPaddingMask(r, i); an entry < 0 marks row r as masked.
+{
+  if (attnMask.empty() && keyPaddingMask.empty())
+  {
+    // No masking required: we can use the simple implementation.  Each column
+    // is shifted by its own maximum before exp() and then divided by its sum.
+    for (size_t i = 0; i < scores.n_slices; ++i)
+    {
+      scores.slice(i) = exp(scores.slice(i).each_row() -
+          max(scores.slice(i), 0));
+      scores.slice(i).each_row() /= sum(scores.slice(i), 0);
+    }
+  }
+  else if (attnMask.empty() && !keyPaddingMask.empty())
+  {
+    // There is one key padding mask column for each element in the batch.
+    // The same mask column i is applied to every head h of batch element i.
+    for (size_t i = 0; i < batchSize; ++i)
+    {
+      for (size_t h = 0; h < numHeads; ++h)
+      {
+        const size_t s = i * numHeads + h;  // Slice holding head h of batch element i.
+        for (size_t c = 0; c < scores.n_cols; ++c)
+        {
+          ElemType maxVal = std::numeric_limits<ElemType>::lowest();  // Sentinel: unchanged if no unmasked entry exceeds it.
+          for (size_t r = 0; r < scores.n_rows; ++r)  // Pass 1: maximum over unmasked rows of this column.
+            if (keyPaddingMask(r, i) >= ElemType(0) && scores(r, c, s) > maxVal)
+              maxVal = scores(r, c, s);
+          for (size_t r = 0; r < scores.n_rows; ++r)  // Pass 2: exponentiate unmasked rows, zero masked rows.
+          {
+            if (keyPaddingMask(r, i) < ElemType(0))
+              scores(r, c, s) = ElemType(0);  // Masked entries receive zero weight.
+            else
+              scores(r, c, s) = std::exp(scores(r, c, s) - maxVal);  // Shifted by the unmasked maximum.
+          }
+          if (maxVal != std::numeric_limits<ElemType>::lowest())  // Skip when maxVal was never updated (e.g. every row masked, column already all zeros).
+            scores.slice(s).col(c) /= sum(scores.slice(s).col(c));
+        }
+      }
+    }
+  }
+  else if (!attnMask.empty() && keyPaddingMask.empty())
+  {
+    // There is one attention mask for each element in the batch.
+    // Slice i of attnMask is applied to every head h of batch element i.
+    for (size_t i = 0; i < batchSize; ++i)
+    {
+      for (size_t h = 0; h < numHeads; ++h)
+      {
+        const size_t s = i * numHeads + h;  // Slice holding head h of batch element i.
+        for (size_t c = 0; c < scores.n_cols; ++c)
+        {
+          ElemType maxVal = std::numeric_limits<ElemType>::lowest();  // Sentinel: unchanged if no unmasked entry exceeds it.
+          for (size_t r = 0; r < scores.n_rows; ++r)  // Pass 1: maximum over unmasked entries of this column.
+            if (attnMask(r, c, i) >= ElemType(0) && scores(r, c, s) > maxVal)
+              maxVal = scores(r, c, s);
+          for (size_t r = 0; r < scores.n_rows; ++r)  // Pass 2: exponentiate unmasked entries, zero masked ones.
+          {
+            if (attnMask(r, c, i) < ElemType(0))
+              scores(r, c, s) = ElemType(0);
+            else
+              scores(r, c, s) = std::exp(scores(r, c, s) - maxVal);
+          }
+          if (maxVal != std::numeric_limits<ElemType>::lowest())  // Skip when maxVal was never updated (e.g. whole column masked).
+            scores.slice(s).col(c) /= sum(scores.slice(s).col(c));
+        }
+      }
+    }
+  }
+  else // !attnMask.empty() && !keyPaddingMask.empty()
+  {
+    // There is one key padding mask column for each element in the batch, and
+    // one attention mask for each element in the batch.  An entry is treated
+    // as masked if either mask is negative at that position.
+    for (size_t i = 0; i < batchSize; ++i)
+    {
+      for (size_t h = 0; h < numHeads; ++h)
+      {
+        const size_t s = i * numHeads + h;  // Slice holding head h of batch element i.
+        for (size_t c = 0; c < scores.n_cols; ++c)
+        {
+          ElemType maxVal = std::numeric_limits<ElemType>::lowest();  // Sentinel: unchanged if no unmasked entry exceeds it.
+          for (size_t r = 0; r < scores.n_rows; ++r)  // Pass 1: maximum over entries unmasked by both masks.
+          {
+            if (attnMask(r, c, i) >= ElemType(0) &&
+                keyPaddingMask(r, i) >= ElemType(0) &&
+                scores(r, c, s) > maxVal)
+            {
+              maxVal = scores(r, c, s);
+            }
+          }
+          for (size_t r = 0; r < scores.n_rows; ++r)  // Pass 2: exponentiate unmasked entries, zero masked ones.
+          {
+            if (attnMask(r, c, i) < ElemType(0) ||
+                keyPaddingMask(r, i) < ElemType(0))
+              scores(r, c, s) = ElemType(0);
+            else
+              scores(r, c, s) = std::exp(scores(r, c, s) - maxVal);
+          }
+          if (maxVal != std::numeric_limits<ElemType>::lowest())  // Skip when maxVal was never updated (e.g. whole column masked).
+            scores.slice(s).col(c) /= sum(scores.slice(s).col(c));
+        }
+      }
+    }
+  }
+}
 } // namespace mlpack
 #endif

mlpack/include/mlpack/methods/ann/layer/nearest_interpolation.hpp CHANGED Viewed

@@ -1,4 +1,3 @@
-//
 /**
  * @file methods/ann/layer/nearest_interpolation.hpp
  * @author Andrew Furey
@@ -29,14 +28,18 @@ namespace mlpack {
  *         arma::sp_mat or arma::cube).
  */
 template<typename MatType = arma::mat>
-class NearestInterpolationType : public Layer<MatType>
+class NearestInterpolation : public Layer<MatType>
 {
  public:
+  // Convenience typedefs.
+  using ElemType = typename MatType::elem_type;
   using CubeType = typename GetCubeType<MatType>::type;
-  //! Create the NearestInterpolation object.
-  NearestInterpolationType();
-  /**Create NearestInterpolation Object with the same scaleFactor along
+  // Create the NearestInterpolation object.
+  NearestInterpolation();
+  /**
+   * Create NearestInterpolation Object with the same scaleFactor along
    * each dimension.
    * NOTE: scaleFactors must be a two element vector, the first element
    * for scaling the first dimension and the second element for scaling
@@ -44,25 +47,25 @@ class NearestInterpolationType : public Layer<MatType>
    *
    * If the input dimensions are n x m x ..., then the output dimensions
    * will be (n x scaleFactors[0]) x (m x scaleFactors[1]) x ...
-   *
+   *
    * @param scaleFactor Scale factors to scale each dimension by.
    */
-  NearestInterpolationType(const std::vector<double> scaleFactors);
+  NearestInterpolation(const std::vector<double> scaleFactors);
-  NearestInterpolationType* Clone() const {
-    return new NearestInterpolationType(*this);
+  NearestInterpolation* Clone() const {
+    return new NearestInterpolation(*this);
   }
-  virtual ~NearestInterpolationType() { }
+  virtual ~NearestInterpolation() { }
-  //! Copy the given NearestInterpolationType layer.
-  NearestInterpolationType(const NearestInterpolationType& other);
-  //! Take ownership of the given NearestInterpolationType layer.
-  NearestInterpolationType(NearestInterpolationType&& other);
-  //! Copy the given NearestInterpolationType layer.
-  NearestInterpolationType& operator=(const NearestInterpolationType& other);
-  //! Take ownership of the given NearestInterpolationType layer.
-  NearestInterpolationType& operator=(NearestInterpolationType&& other);
+  //! Copy the given NearestInterpolation layer.
+  NearestInterpolation(const NearestInterpolation& other);
+  //! Take ownership of the given NearestInterpolation layer.
+  NearestInterpolation(NearestInterpolation&& other);
+  //! Copy the given NearestInterpolation layer.
+  NearestInterpolation& operator=(const NearestInterpolation& other);
+  //! Take ownership of the given NearestInterpolation layer.
+  NearestInterpolation& operator=(NearestInterpolation&& other);
   /**
    * Forward pass through the layer. The layer interpolates
@@ -81,12 +84,14 @@ class NearestInterpolationType : public Layer<MatType>
    * the input size.
    *
    * @param * (input) The input matrix.
-   * @param gradient The computed backward gradient.
-   * @param output The resulting down-sampled output.
+   * @param * (output) The output matrix.
+   * @param gy The computed backward gradient.
+   * @param g The resulting down-sampled output.
    */
-  void Backward(const MatType& /*input*/,
-                const MatType& gradient,
-                MatType& output);
+  void Backward(const MatType& /* input */,
+                const MatType& /* output */,
+                const MatType& gy,
+                MatType& g);
   //! Compute the output dimensions of the layer, based on the internal values
   //! of `InputDimensions()`.
@@ -103,8 +108,6 @@ class NearestInterpolationType : public Layer<MatType>
   std::vector<double> scaleFactors;
 }; // class NearestInterpolation
-using NearestInterpolation = NearestInterpolationType<arma::mat>;
 } // namespace mlpack
 // Include implementation.

mlpack/include/mlpack/methods/ann/layer/nearest_interpolation_impl.hpp CHANGED Viewed

@@ -19,16 +19,16 @@
 namespace mlpack {
 template<typename MatType>
-NearestInterpolationType<MatType>::NearestInterpolationType():
-  Layer<MatType>()
+NearestInterpolation<MatType>::NearestInterpolation():
+    Layer<MatType>()
 {
   // Nothing to do here.
 }
 template<typename MatType>
-NearestInterpolationType<MatType>::
-NearestInterpolationType(const std::vector<double> scaleFactors) :
-  Layer<MatType>()
+NearestInterpolation<MatType>::
+NearestInterpolation(const std::vector<double> scaleFactors) :
+    Layer<MatType>()
 {
   if (scaleFactors.size() != 2) {
     throw std::runtime_error("Scale factors must have 2 dimensions");
@@ -37,27 +37,27 @@ NearestInterpolationType(const std::vector<double> scaleFactors) :
 }
 template<typename MatType>
-NearestInterpolationType<MatType>::
-NearestInterpolationType(const NearestInterpolationType& other) :
-  Layer<MatType>(),
-  scaleFactors(other.scaleFactors)
+NearestInterpolation<MatType>::
+NearestInterpolation(const NearestInterpolation& other) :
+    Layer<MatType>(),
+    scaleFactors(other.scaleFactors)
 {
   // Nothing to do here.
 }
 template<typename MatType>
-NearestInterpolationType<MatType>::
-NearestInterpolationType(NearestInterpolationType&& other) :
-  Layer<MatType>(std::move(other)),
-  scaleFactors(std::move(other.scaleFactors))
+NearestInterpolation<MatType>::
+NearestInterpolation(NearestInterpolation&& other) :
+    Layer<MatType>(std::move(other)),
+    scaleFactors(std::move(other.scaleFactors))
 {
   // Nothing to do here.
 }
 template<typename MatType>
-NearestInterpolationType<MatType>&
-NearestInterpolationType<MatType>::
-operator=(const NearestInterpolationType& other)
+NearestInterpolation<MatType>&
+NearestInterpolation<MatType>::
+operator=(const NearestInterpolation& other)
 {
   if (&other != this)
   {
@@ -68,9 +68,9 @@ operator=(const NearestInterpolationType& other)
 }
 template<typename MatType>
-NearestInterpolationType<MatType>&
-NearestInterpolationType<MatType>::
-operator=(NearestInterpolationType&& other)
+NearestInterpolation<MatType>&
+NearestInterpolation<MatType>::
+operator=(NearestInterpolation&& other)
 {
   if (&other != this)
   {
@@ -81,8 +81,8 @@ operator=(NearestInterpolationType&& other)
 }
 template<typename MatType>
-void NearestInterpolationType<MatType>::Forward(
-  const MatType& input, MatType& output)
+void NearestInterpolation<MatType>::Forward(
+    const MatType& input, MatType& output)
 {
   const size_t channels = this->inputDimensions[2];
@@ -100,7 +100,7 @@ void NearestInterpolationType<MatType>::Forward(
   for (size_t i = 0; i < outRowSize; ++i)
   {
-    size_t rOrigin = std::floor(i  / scaleFactors[0]);
+    size_t rOrigin = std::floor(i / scaleFactors[0]);
     for (size_t j = 0; j < outColSize; ++j)
     {
       size_t cOrigin = std::floor(j / scaleFactors[1]);
@@ -113,10 +113,11 @@ void NearestInterpolationType<MatType>::Forward(
 }
 template<typename MatType>
-void NearestInterpolationType<MatType>::Backward(
-  const MatType& /*input*/,
-  const MatType& gradient,
-  MatType& output)
+void NearestInterpolation<MatType>::Backward(
+    const MatType& /* input */,
+    const MatType& /* output */,
+    const MatType& gy,
+    MatType& g)
 {
   const size_t channels = this->inputDimensions[2];
@@ -126,12 +127,11 @@ void NearestInterpolationType<MatType>::Backward(
   const size_t inRowSize = this->inputDimensions[0];
   const size_t inColSize = this->inputDimensions[1];
-  CubeType outputAsCube;
-  CubeType gradientAsCube;
+  CubeType gTemp;
+  CubeType gyTemp;
-  MakeAlias(outputAsCube, output, inRowSize, inColSize, channels, 0, true);
-  MakeAlias(gradientAsCube, gradient, outRowSize, outColSize, channels, 0,
-      false);
+  MakeAlias(gTemp, g, inRowSize, inColSize, channels, 0);
+  MakeAlias(gyTemp, gy, outRowSize, outColSize, channels, 0);
   for (size_t i = 0; i < outRowSize; ++i)
   {
@@ -140,15 +140,13 @@ void NearestInterpolationType<MatType>::Backward(
     {
       size_t cOrigin = std::floor(j / scaleFactors[1]);
       for (size_t k = 0; k < channels; ++k)
-      {
-        outputAsCube(rOrigin, cOrigin, k) += gradientAsCube(i, j, k);
-      }
+        gTemp(rOrigin, cOrigin, k) += gyTemp(i, j, k);
     }
   }
 }
 template<typename MatType>
-void NearestInterpolationType<MatType>::ComputeOutputDimensions()
+void NearestInterpolation<MatType>::ComputeOutputDimensions()
 {
   if (this->inputDimensions.size() < scaleFactors.size())
   {
@@ -168,9 +166,10 @@ void NearestInterpolationType<MatType>::ComputeOutputDimensions()
 template<typename MatType>
 template<typename Archive>
-void NearestInterpolationType<MatType>::serialize(
-  Archive& ar, const uint32_t /* version */)
+void NearestInterpolation<MatType>::serialize(
+    Archive& ar, const uint32_t /* version */)
 {
+  ar(cereal::base_class<Layer<MatType>>(this));
   ar(CEREAL_NVP(scaleFactors));
 }