convolver 0.0.2 → 0.1.0
- checksums.yaml +4 -4
- data/.travis.yml +3 -0
- data/README.md +18 -5
- data/benchmarks/convolve_benchmark.rb +2 -2
- data/benchmarks/convolver_vs_fftw3.rb +17 -41
- data/convolver.gemspec +1 -0
- data/ext/convolver/cnn_components.c +52 -0
- data/ext/convolver/cnn_components.h +14 -0
- data/ext/convolver/convolve_raw.c +105 -0
- data/ext/convolver/convolve_raw.h +22 -0
- data/ext/convolver/convolver.c +35 -162
- data/ext/convolver/narray_shared.c +42 -0
- data/ext/convolver/narray_shared.h +22 -0
- data/lib/convolver.rb +41 -0
- data/lib/convolver/version.rb +1 -1
- data/spec/convolve_fftw3_spec.rb +161 -0
- data/spec/helpers.rb +1 -1
- metadata +24 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4d393cf7a6f7cd94485db0aaed706239c92ab0d0
+  data.tar.gz: 0944542524997227558ceb4f5c9b6968d3413e50
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1f263ae8f88d5318f84f9479c2540d25648396d09f5bfce65dcfc13122eaff26207280afa0a459d634408b5a526d6f302ebe750aebdd3106df3c91e7ac8af681
+  data.tar.gz: 35473c019dfb69abf0a0048df1b53f0c12e0512d1ebddc4b4dab4a7ececf7d8d0c35078ad92192d350b7c34bc5c4548a9446a2e5b69b0888c1b0c877445aea8b
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -5,9 +5,9 @@
 Adds a convolve operation to NArray floats. It is around 250 times faster than equivalents
 in pure Ruby.
 
-
-
-
+The gem makes convolution via FFTW3 library available. This is faster for convolutions with
+larger kernels and signals. The relationship is complex, but as a rule of thumb, the kernel
+needs to be around 1000 entries or larger before it is worth switching to FFTW3-based convolves.
 
 ## Planned features
 
@@ -17,6 +17,12 @@ calculating signal convolutions for other types of analysis.
 
 ## Installation
 
+### Dependency: FFTW3
+
+Before you install *convolver*, you should install FFTW3. See http://www.fftw.org/ for details.
+
+### Installing the gem
+
 Add this line to your application's Gemfile:
 
     gem 'convolver'
@@ -41,12 +47,19 @@ Basic convolution:
 * Convolver only works on single-precision floats internally. It will cast NArray types to this, if
 possible, prior to calculating.
 
-* The
-
+* The output is smaller than the input, each dimension is reduced by 1 less than the width of the
+kernel in the same dimension.
 
 * Convolver expects input a and kernel b to have the same rank, and for the kernel to be same size
 or smaller in all dimensions as the input.
 
+FFTW3 convolution:
+
+    a = NArray[0.3,0.4,0.5]
+    b = NArray[1.3, -0.5]
+    c = Convolver.convolve_fftw3( a, b )
+    => NArray.float(2): [ 0.19, 0.27 ]
+
 ## Contributing
 
 1. Fork it
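The output-size rule in the notes above can be checked directly in irb. A minimal sketch (the shapes here are chosen arbitrarily, not taken from the README):

    require 'convolver'

    a = NArray.sfloat( 6, 5 ).random    # signal, shape [6, 5]
    b = NArray.sfloat( 3, 2 ).random    # kernel, shape [3, 2]
    c = Convolver.convolve( a, b )
    c.shape                             # => [4, 4], i.e. [6 - 3 + 1, 5 - 2 + 1]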
data/benchmarks/convolver_vs_fftw3.rb
CHANGED
@@ -1,70 +1,46 @@
 require 'convolver'
-require 'narray'
-require 'fftw3'
 require 'benchmark'
 
-# In Ruby for now, which is slower, but at least gets us ballpark figures (99% of the work is in the C)
-module FFTW3Convolver
-  def self.convolve orig_a, orig_b
-    combined_size = orig_a.size + orig_b.size - 1
-    left_pad_a = ( combined_size - orig_a.size + 1)/2
-    mod_a = NArray.float(combined_size)
-    mod_a[left_pad_a] = orig_a
-
-    mod_b = NArray.float(combined_size)
-    left_select_b = ( orig_b.size + 1 )/2
-    right_select_b = orig_b.size - left_select_b
-    mod_b[0] = orig_b[(0...left_select_b)].reverse
-    mod_b[-right_select_b] = orig_b[-right_select_b..-1].reverse
-
-    afft = FFTW3.fft(mod_a)
-    bfft = FFTW3.fft(mod_b)
-    cfft = afft * bfft
-
-    (FFTW3.ifft( cfft )/combined_size).real[left_pad_a...(left_pad_a+ orig_a.size - orig_b.size + 1)]
-  end
-end
-
 class Convolver2DBenchmark
   attr_reader :image, :kernel
 
   def initialize
     # These show Convolver.convolve as 3x faster than FFTW3
-
-
+    @image = NArray.sfloat(256 * 256).random
+    @kernel = NArray.sfloat(16 * 16).random
 
     # These are roughly even (10% advantage to FFTW3)
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(256 * 256).random
+    # @kernel = NArray.sfloat(32 * 32).random
 
     # These show FFTW3 as 4x faster than Convolver.convolve
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(256 * 256).random
+    # @kernel = NArray.sfloat(64 * 64).random
 
     # These show Convolver.convolve as 200x faster than FFTW3
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(50 * 64 * 64).random
+    # @kernel = NArray.sfloat(50 * 64 * 64).random
 
     # These show FFTW3 as 2x faster than Convolver.convolve
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(128 * 128).random
+    # @kernel = NArray.sfloat(64 * 64).random
 
     # These show FFTW3 and Convolver.convolve roughly equal
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(80 * 80).random
+    # @kernel = NArray.sfloat(64 * 64).random
 
     # These show FFTW3 as 2x faster than Convolver.convolve
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(2 * 80 * 80).random
+    # @kernel = NArray.sfloat(2 * 64 * 64).random
 
     # These are roughly even - increasing size of image favours FFTW3
-
-
+    #@image = NArray.sfloat(2000 + 80 * 80).random
+    #@kernel = NArray.sfloat(80 * 80).random
   end
 end
 
 Benchmark.bm do |x|
   source = Convolver2DBenchmark.new
   x.report('convolver') { 100.times { Convolver.convolve( source.image, source.kernel ) } }
-  x.report('fftw3') { 100.times {
+  x.report('fftw3') { 100.times { Convolver.convolve_fftw3( source.image, source.kernel ) } }
 end
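The commented-out cases above sketch where the crossover between the two methods sits. If a caller wanted to pick a method automatically, a simple dispatch along the lines of the README's rule of thumb might look like this; best_convolve and the 1000-element threshold are illustrative assumptions, not part of the gem:

    # Hypothetical helper: route small kernels to the direct SIMD convolve,
    # large kernels to the FFTW3 path (the threshold is only a rule of thumb).
    def best_convolve( signal, kernel )
      if kernel.size >= 1000
        Convolver.convolve_fftw3( signal, kernel )
      else
        Convolver.convolve( signal, kernel )
      end
    end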
data/convolver.gemspec
CHANGED
data/ext/convolver/cnn_components.c
ADDED
@@ -0,0 +1,52 @@
+// ext/convolver/cnn_components.c
+
+#include <xmmintrin.h>
+#include "cnn_components.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Run a single fully-connected layer, calculating output from input
+//
+// Benchmark: 1024 inputs, 256 outputs. 1000 iterations. 0.56 seconds
+//
+//
+
+void nn_run_layer_raw( int in_size, int out_size,
+    float *in_ptr, float *weights, float *thresholds, float *out_ptr ) {
+  int i, j, in_aligned_size, out_aligned_size, offset;
+  __m128 simd_x, simd_y, simd_t;
+
+  in_aligned_size = 4 * ( in_size/4 );
+  out_aligned_size = 4 * ( out_size/4 );
+
+  // Calculate activation
+  for ( i = 0; i < out_size; i++ ) {
+
+    float t = 0.0;
+    simd_t = _mm_setzero_ps();
+    offset = i * in_size;
+
+    // Use SIMD for all the aligned values in groups of 4
+    for ( j = 0; j < in_aligned_size; j +=4 ) {
+      simd_x = _mm_load_ps( in_ptr + j );
+      // Weights might not align to 16 bytes due to size of layers
+      simd_y = _mm_loadu_ps( weights + (offset + j) );
+      simd_x = _mm_mul_ps( simd_x, simd_y );
+      simd_t = _mm_add_ps( simd_x, simd_t );
+    }
+
+    // Complete any remaining 1,2 or 3 items one at a time
+    for ( j = in_aligned_size; j < in_size; j++ ) {
+      t += in_ptr[ j ] * weights[ offset + j ];
+    }
+
+    out_ptr[i] = simd_t[0] + simd_t[1] + simd_t[2] + simd_t[3] + t;
+  }
+
+  for ( i = 0; i < out_size; i++ ) {
+    out_ptr[i] -= thresholds[i];
+    if ( out_ptr[i] < 0.0 ) { out_ptr[i] = 0.0; }
+  }
+
+  return;
+}
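This raw routine backs the existing Convolver.nn_run_layer binding (registered in convolver.c further down). A minimal sketch of a call from Ruby, assuming a 4-input, 3-output layer with zero thresholds; the values are arbitrary:

    require 'convolver'

    inputs     = NArray.sfloat( 4 ).random     # rank 1, size = number of inputs
    weights    = NArray.sfloat( 4, 3 ).random  # rank 2: first dim inputs, second dim outputs
    thresholds = NArray.sfloat( 3 )            # rank 1, zero-filled

    outputs = Convolver.nn_run_layer( inputs, weights, thresholds )
    outputs.shape   # => [3]; each entry is max( 0.0, weighted_sum - threshold )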
data/ext/convolver/cnn_components.h
ADDED
@@ -0,0 +1,14 @@
+// ext/convolver/cnn_components.h
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Declarations of narray helper functions
+//
+
+#ifndef CNN_COMPONENTS_H
+#define CNN_COMPONENTS_H
+
+void nn_run_layer_raw( int in_size, int out_size,
+    float *in_ptr, float *weights, float *thresholds, float *out_ptr );
+
+#endif
data/ext/convolver/convolve_raw.c
ADDED
@@ -0,0 +1,105 @@
+// ext/convolver/convolve_raw.c
+
+#include "convolve_raw.h"
+
+inline int size_from_shape( int rank, int *shape ) {
+  int size = 1;
+  int i;
+  for ( i = 0; i < rank; i++ ) { size *= shape[i]; }
+  return size;
+}
+
+// Sets reverse indices
+inline void corner_reset( int rank, int *shape, int *rev_indices ) {
+  int i;
+  for ( i = 0; i < rank; i++ ) { rev_indices[i] = shape[i] - 1; }
+  return;
+}
+
+// Counts indices down, returns number of ranks that reset
+inline int corner_dec( int rank, int *shape, int *rev_indices ) {
+  int i = 0;
+  while ( ! rev_indices[i]-- ) {
+    rev_indices[i] = shape[i] - 1;
+    i++;
+  }
+  return i;
+}
+
+// Generates co-increment steps by rank boundaries crossed, for the outer position as inner position is incremented by 1
+inline void calc_co_increment( int rank, int *outer_shape, int *inner_shape, int *co_increment ) {
+  int i, factor;
+  co_increment[0] = 1; // co-increment is always 1 in lowest rank
+  factor = 1;
+  for ( i = 0; i < rank; i++ ) {
+    co_increment[i+1] = co_increment[i] + factor * ( outer_shape[i] - inner_shape[i] );
+    factor *= outer_shape[i];
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Convolve
+//
+// Benchmark: 640x480 image, 8x8 kernel, 1000 iterations. 12.3 seconds.
+//
+
+void convolve_raw(
+    int in_rank, int *in_shape, float *in_ptr,
+    int kernel_rank, int *kernel_shape, float *kernel_ptr,
+    int out_rank, int *out_shape, float *out_ptr ) {
+  int i, j, in_size, kernel_size, kernel_aligned, out_size, offset;
+  int out_co_incr[LARGEST_RANK], kernel_co_incr[LARGEST_RANK];
+  int ker_q[LARGEST_RANK], out_q[LARGEST_RANK];
+  int *kernel_co_incr_cache;
+
+  in_size = size_from_shape( in_rank, in_shape );
+  kernel_size = size_from_shape( kernel_rank, kernel_shape );
+  kernel_aligned = 4 * (kernel_size/4);
+  out_size = size_from_shape( out_rank, out_shape );
+
+  calc_co_increment( in_rank, in_shape, out_shape, out_co_incr );
+  calc_co_increment( in_rank, in_shape, kernel_shape, kernel_co_incr );
+
+  kernel_co_incr_cache = ALLOC_N( int, kernel_size );
+  kernel_co_incr_cache[0] = 0;
+
+  corner_reset( kernel_rank, kernel_shape, ker_q );
+  for ( i = 1; i < kernel_size; i++ ) {
+    kernel_co_incr_cache[i] = kernel_co_incr_cache[i-1] + kernel_co_incr[ corner_dec( kernel_rank, kernel_shape, ker_q ) ];
+  }
+
+  // For convenience of flow, we set offset to -1 and adjust countdown 1 higher to compensate
+  offset = -1;
+  corner_reset( out_rank, out_shape, out_q );
+  out_q[0]++;
+
+  // Main convolve loop
+  for ( i = 0; i < out_size; i++ ) {
+    __m128 simd_x, simd_y, simd_t;
+    float t = 0.0;
+    simd_t = _mm_setzero_ps();
+
+    offset += out_co_incr[ corner_dec( out_rank, out_shape, out_q ) ];
+
+    // Use SIMD for all the aligned values in groups of 4
+    for ( j = 0; j < kernel_aligned; j +=4 ) {
+      simd_x = _mm_load_ps( kernel_ptr + j );
+      // Yes the backwards alignment is correct
+      simd_y = _mm_set_ps( in_ptr[ offset + kernel_co_incr_cache[j+3] ], in_ptr[ offset + kernel_co_incr_cache[j+2] ],
+          in_ptr[ offset + kernel_co_incr_cache[j+1] ], in_ptr[ offset + kernel_co_incr_cache[j] ] );
+      simd_x = _mm_mul_ps( simd_x, simd_y );
+      simd_t = _mm_add_ps( simd_x, simd_t );
+    }
+
+    // Complete any remaining 1,2 or 3 items one at a time
+    for ( j = kernel_aligned; j < kernel_size; j++ ) {
+      t += in_ptr[ offset + kernel_co_incr_cache[j] ] * kernel_ptr[ j ];
+    }
+
+    out_ptr[i] = simd_t[0] + simd_t[1] + simd_t[2] + simd_t[3] + t;
+  }
+
+  xfree( kernel_co_incr_cache );
+  return;
+}
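The kernel_co_incr_cache above stores, for each flat kernel position, how far to step through the (larger) input array. The same co-increment idea rendered in Ruby, purely as an illustration (co_increments is not part of the gem):

    # Crossing the end of the inner (kernel) row inside the outer (input) row costs
    # an extra (outer width - inner width) elements; higher ranks accumulate similarly.
    def co_increments( outer_shape, inner_shape )
      incr = [ 1 ]      # moving one place in the lowest rank always costs 1
      factor = 1
      outer_shape.each_with_index do |outer, i|
        incr << incr[i] + factor * ( outer - inner_shape[i] )
        factor *= outer
      end
      incr
    end

    co_increments( [ 640, 480 ], [ 8, 8 ] )   # => [1, 633, 302713]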
data/ext/convolver/convolve_raw.h
ADDED
@@ -0,0 +1,22 @@
+// ext/convolver/convolve_raw.h
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Declarations of narray helper functions
+//
+
+#ifndef CONVOLVE_RAW_H
+#define CONVOLVE_RAW_H
+
+#include <ruby.h>
+#include <xmmintrin.h>
+#include "narray_shared.h"
+
+#define LARGEST_RANK 16
+
+void convolve_raw(
+    int in_rank, int *in_shape, float *in_ptr,
+    int kernel_rank, int *kernel_shape, float *kernel_ptr,
+    int out_rank, int *out_shape, float *out_ptr );
+
+#endif
data/ext/convolver/convolver.c
CHANGED
@@ -5,181 +5,53 @@
 #include <stdio.h>
 #include <xmmintrin.h>
 
-#
+#include "narray_shared.h"
+#include "convolve_raw.h"
+#include "cnn_components.h"
 
-
-inline int na_quick_idxs_to_pos( int rank, int *shape, int *idxs ) {
-  int i, pos = 0;
-  for ( i = rank - 1; i >= 0; i-- ) {
-    pos = pos * shape[i] + idxs[i];
-  }
-  return pos;
-}
-
-// This is inverse of above
-inline void na_quick_pos_to_idxs( int rank, int *shape, int pos, int *idxs ) {
-  int i;
-  for ( i = 0; i < rank; i++ ) {
-    idxs[ i ] = pos % shape[i];
-    pos /= shape[i];
-  }
-  return;
-}
+////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
-
-  int i;
-  for ( i = 0; i < rank; i++ ) { size *= shape[i]; }
-  return size;
-}
+// To hold the module object
+VALUE Convolver = Qnil;
 
-
-
-
-
-
-}
+static VALUE narray_fit_backwards( VALUE self, VALUE a, VALUE b ) {
+  struct NARRAY *na_a, *na_b;
+  volatile VALUE val_a, val_b;
+  int target_rank, i;
+  int shift_by[LARGEST_RANK];
 
-
-
-  int i = 0;
-  while ( ! rev_indices[i]-- ) {
-    rev_indices[i] = shape[i] - 1;
-    i++;
-  }
-  return i;
-}
+  val_a = na_cast_object(a, NA_SFLOAT);
+  GetNArray( val_a, na_a );
 
-
-
-  int i, factor;
-  co_increment[0] = 1; // co-increment is always 1 in lowest rank
-  factor = 1;
-  for ( i = 0; i < rank; i++ ) {
-    co_increment[i+1] = co_increment[i] + factor * ( outer_shape[i] - inner_shape[i] );
-    factor *= outer_shape[i];
-  }
-}
+  val_b = na_cast_object(b, NA_SFLOAT);
+  GetNArray( val_b, na_b );
 
-
-
-// Convolve
-//
-// Benchmark: 640x480 image, 8x8 kernel, 1000 iterations. 12.3 seconds.
-//
-
-void convolve_raw(
-    int in_rank, int *in_shape, float *in_ptr,
-    int kernel_rank, int *kernel_shape, float *kernel_ptr,
-    int out_rank, int *out_shape, float *out_ptr ) {
-  int i, j, in_size, kernel_size, kernel_aligned, out_size, offset;
-  int out_co_incr[LARGEST_RANK], kernel_co_incr[LARGEST_RANK];
-  int ker_q[LARGEST_RANK], out_q[LARGEST_RANK];
-  int *kernel_co_incr_cache;
-
-  in_size = size_from_shape( in_rank, in_shape );
-  kernel_size = size_from_shape( kernel_rank, kernel_shape );
-  kernel_aligned = 4 * (kernel_size/4);
-  out_size = size_from_shape( out_rank, out_shape );
-
-  calc_co_increment( in_rank, in_shape, out_shape, out_co_incr );
-  calc_co_increment( in_rank, in_shape, kernel_shape, kernel_co_incr );
-
-  kernel_co_incr_cache = ALLOC_N( int, kernel_size );
-  kernel_co_incr_cache[0] = 0;
-
-  corner_reset( kernel_rank, kernel_shape, ker_q );
-  for ( i = 1; i < kernel_size; i++ ) {
-    kernel_co_incr_cache[i] = kernel_co_incr_cache[i-1] + kernel_co_incr[ corner_dec( kernel_rank, kernel_shape, ker_q ) ];
+  if ( na_a->rank != na_b->rank ) {
+    rb_raise( rb_eArgError, "narray a must have equal rank to narray b (a rank %d, b rank %d)", na_a->rank, na_b->rank );
   }
 
-
-
-  corner_reset( out_rank, out_shape, out_q );
-  out_q[0]++;
-
-  // Main convolve loop
-  for ( i = 0; i < out_size; i++ ) {
-    __m128 simd_x, simd_y, simd_t;
-    float t = 0.0;
-    simd_t = _mm_setzero_ps();
-
-    offset += out_co_incr[ corner_dec( out_rank, out_shape, out_q ) ];
-
-    // Use SIMD for all the aligned values in groups of 4
-    for ( j = 0; j < kernel_aligned; j +=4 ) {
-      simd_x = _mm_load_ps( kernel_ptr + j );
-      // Yes the backwards alignment is correct
-      simd_y = _mm_set_ps( in_ptr[ offset + kernel_co_incr_cache[j+3] ], in_ptr[ offset + kernel_co_incr_cache[j+2] ],
-          in_ptr[ offset + kernel_co_incr_cache[j+1] ], in_ptr[ offset + kernel_co_incr_cache[j] ] );
-      simd_x = _mm_mul_ps( simd_x, simd_y );
-      simd_t = _mm_add_ps( simd_x, simd_t );
-    }
-
-    // Complete any remaining 1,2 or 3 items one at a time
-    for ( j = kernel_aligned; j < kernel_size; j++ ) {
-      t += in_ptr[ offset + kernel_co_incr_cache[j] ] * kernel_ptr[ j ];
-    }
-
-    out_ptr[i] = simd_t[0] + simd_t[1] + simd_t[2] + simd_t[3] + t;
+  if ( na_a->rank > LARGEST_RANK ) {
+    rb_raise( rb_eArgError, "exceeded maximum narray rank for convolve of %d", LARGEST_RANK );
   }
 
-
-  return;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Neural net
-//
-// Benchmark: 1024 inputs, 256 outputs. 1000 iterations. 0.56 seconds
-//
-//
-
-void nn_run_layer_raw( int in_size, int out_size,
-    float *in_ptr, float *weights, float *thresholds, float *out_ptr ) {
-  int i, j, in_aligned_size, out_aligned_size, offset;
-  __m128 simd_x, simd_y, simd_t;
-
-  in_aligned_size = 4 * ( in_size/4 );
-  out_aligned_size = 4 * ( out_size/4 );
-
-  // Calculate activation
-  for ( i = 0; i < out_size; i++ ) {
-
-    float t = 0.0;
-    simd_t = _mm_setzero_ps();
-    offset = i * in_size;
-
-    // Use SIMD for all the aligned values in groups of 4
-    for ( j = 0; j < in_aligned_size; j +=4 ) {
-      simd_x = _mm_load_ps( in_ptr + j );
-      // Weights might not align to 16 bytes due to size of layers
-      simd_y = _mm_loadu_ps( weights + (offset + j) );
-      simd_x = _mm_mul_ps( simd_x, simd_y );
-      simd_t = _mm_add_ps( simd_x, simd_t );
-    }
+  target_rank = na_a->rank;
 
-
-
-
+  for ( i = 0; i < target_rank; i++ ) {
+    if ( ( na_a->shape[i] - na_b->shape[i] ) < 0 ) {
+      rb_raise( rb_eArgError, "no space for backward fit" );
     }
-
-    out_ptr[i] = simd_t[0] + simd_t[1] + simd_t[2] + simd_t[3] + t;
+    shift_by[i] = na_b->shape[i] >> 1;
   }
 
-
-
-
-
+  fit_backwards_raw(
+    target_rank,
+    na_a->shape, (float*) na_a->ptr,
+    na_b->shape, (float*) na_b->ptr,
+    shift_by );
 
-  return;
+  return Qnil;
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// To hold the module object
-VALUE Convolver = Qnil;
 
 /* @overload convolve( signal, kernel )
  * Calculates convolution of an array of floats representing a signal, with a second array representing
@@ -233,10 +105,10 @@ static VALUE narray_convolve( VALUE self, VALUE a, VALUE b ) {
  * Calculates activations of a fully-connected neural network layer. The transfer function after
  * summing weights and applying threshold is a "ReLU", equivalent to
  *     y = x < 0.0 ? 0.0 : x
- * this is less sophisticated than many neural net
- * train.
+ * this is less sophisticated than many other neural net functions (such as sigma), but is fast to
+ * calculate and to train.
 * @param [NArray] inputs must be rank 1 array of floats
- * @param [NArray] weights must be rank 2 array of floats, with first
+ * @param [NArray] weights must be rank 2 array of floats, with first dimension size of inputs, and second dimension size equal to number of outputs
 * @param [NArray] thresholds must be rank 1 array of floats, size equal to number of outputs desired
 * @return [NArray] neuron activations
 */
@@ -266,7 +138,7 @@ static VALUE narray_nn_run_single_layer( VALUE self, VALUE inputs, VALUE weights
   val_thresholds = na_cast_object(thresholds, NA_SFLOAT);
   GetNArray( val_thresholds, na_thresholds );
   if ( na_thresholds->rank != 1 ) {
-    rb_raise( rb_eArgError, "thresholds must be
+    rb_raise( rb_eArgError, "thresholds must be narray of rank 1" );
   }
   if ( na_thresholds->shape[0] != output_size ) {
     rb_raise( rb_eArgError, "thresholds expected size %d, but got %d", output_size, na_thresholds->shape[0] );
@@ -287,4 +159,5 @@ void Init_convolver() {
   Convolver = rb_define_module( "Convolver" );
   rb_define_singleton_method( Convolver, "convolve", narray_convolve, 2 );
   rb_define_singleton_method( Convolver, "nn_run_layer", narray_nn_run_single_layer, 3 );
+  rb_define_singleton_method( Convolver, "fit_kernel_backwards", narray_fit_backwards, 2 );
 }
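The new fit_kernel_backwards binding is used internally by Convolver.convolve_fftw3 (see lib/convolver.rb below), but it can also be called on its own. A small sketch with a 1-D kernel; the result shown is hand-traced from fit_backwards_raw in narray_shared.c, not measured output:

    require 'convolver'

    dest   = NArray.sfloat( 4 )          # zero-filled, same rank as the kernel
    kernel = NArray[ 1.0, 2.0, 3.0 ]
    Convolver.fit_kernel_backwards( dest, kernel )
    # dest should now hold [ 2.0, 1.0, 0.0, 3.0 ]: the kernel reversed and wrapped
    # around index 0, ready to be transformed by FFTW3.fft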
data/ext/convolver/narray_shared.c
ADDED
@@ -0,0 +1,42 @@
+// ext/convolver/narray_shared.c
+
+#include "narray_shared.h"
+
+// This is copied from na_array.c, with safety checks and temp vars removed
+int na_quick_idxs_to_pos( int rank, int *shape, int *idxs ) {
+  int i, pos = 0;
+  for ( i = rank - 1; i >= 0; i-- ) {
+    pos = pos * shape[i] + idxs[i];
+  }
+  return pos;
+}
+
+// This is inverse of above
+void na_quick_pos_to_idxs( int rank, int *shape, int pos, int *idxs ) {
+  int i;
+  for ( i = 0; i < rank; i++ ) {
+    idxs[ i ] = pos % shape[i];
+    pos /= shape[i];
+  }
+  return;
+}
+
+// used to place kernel data into array for FFTW3 processing
+void fit_backwards_raw( int rank, int *dst_shape, float *dst, int *src_shape, float *src, int *shift_shape ) {
+  int i, j, size, x;
+  int k_idx[16], dst_idx[16];
+
+  size = 1;
+  for ( j = 0; j < rank; j++ ) { size *= src_shape[j]; }
+
+  for ( i = 0; i < size; i++ ) {
+    na_quick_pos_to_idxs( rank, src_shape, i, k_idx );
+    for ( j = 0; j < rank; j++ ) {
+      x = src_shape[j] - shift_shape[j] - k_idx[j] - 1;
+      if ( x < 0 ) x = x + dst_shape[j];
+      dst_idx[j] = x;
+    }
+    dst[ na_quick_idxs_to_pos( rank, dst_shape, dst_idx ) ] = src[i];
+  }
+  return;
+}
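The two quick index helpers convert between a flat element position and per-dimension indices, with the first dimension varying fastest. An equivalent Ruby sketch (these methods are illustrative only, not part of the gem):

    def idxs_to_pos( shape, idxs )
      pos = 0
      ( shape.size - 1 ).downto( 0 ) { |i| pos = pos * shape[i] + idxs[i] }
      pos
    end

    def pos_to_idxs( shape, pos )
      shape.map { |dim| idx = pos % dim; pos /= dim; idx }
    end

    idxs_to_pos( [ 5, 4, 3 ], [ 2, 1, 0 ] )   # => 7
    pos_to_idxs( [ 5, 4, 3 ], 7 )             # => [2, 1, 0]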
data/ext/convolver/narray_shared.h
ADDED
@@ -0,0 +1,22 @@
+// ext/convolver/narray_shared.h
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Declarations of narray helper functions
+//
+
+#ifndef CONVOLVER_NARRAY_SHARED_H
+#define CONVOLVER_NARRAY_SHARED_H
+
+#include <ruby.h>
+#include "narray.h"
+
+// This is copied from na_array.c, with safety checks and temp vars removed
+int na_quick_idxs_to_pos( int rank, int *shape, int *idxs );
+
+// This is inverse of above
+void na_quick_pos_to_idxs( int rank, int *shape, int pos, int *idxs );
+
+void fit_backwards_raw( int rank, int *dst_shape, float *dst, int *src_shape, float *src, int *shift_shape );
+
+#endif
data/lib/convolver.rb
CHANGED
@@ -1,7 +1,48 @@
 require 'narray'
 require "convolver/convolver"
 require "convolver/version"
+require 'fftw3'
 
 module Convolver
+  # Uses FFTW3 library to calculate convolution of an array of floats representing a signal,
+  # with a second array representing a kernel. The two parameters must have the same rank.
+  # The output has same rank, its size in each dimension d is given by
+  #  signal.shape[d] - kernel.shape[d] + 1
+  # @param [NArray] signal must be same size or larger than kernel in each dimension
+  # @param [NArray] kernel must be same size or smaller than signal in each dimension
+  # @return [NArray] result of convolving signal with kernel
+  def self.convolve_fftw3 signal, kernel
+    combined_shape, shift_by, ranges = fft_offsets( signal.shape, kernel.shape )
 
+    mod_a = NArray.sfloat(*combined_shape)
+    mod_a[*shift_by] = signal
+
+    mod_b = NArray.sfloat(*combined_shape)
+
+    Convolver.fit_kernel_backwards( mod_b, kernel )
+
+    afreqs = FFTW3.fft(mod_a)
+    bfreqs = FFTW3.fft(mod_b)
+    cfreqs = afreqs * bfreqs
+
+    (FFTW3.ifft( cfreqs ).real * (1.0/mod_a.size))[*ranges]
+  end
+
+  private
+
+  def self.fft_offsets signal_shape, kernel_shape
+    combined_shape = []
+    shift_by = []
+    ranges = []
+    signal_shape.each_with_index do |signal_size, i|
+      kernel_size = kernel_shape[i]
+
+      combined_shape[i] = signal_size + kernel_size - 1
+      output_size = signal_size - kernel_size + 1
+      output_offset = kernel_size - 1
+      shift_by[i] = kernel_size / 2
+      ranges[i] = (output_offset...(output_offset + output_size))
+    end
+    [ combined_shape, shift_by, ranges ]
+  end
 end
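Since convolve_fftw3 is meant to agree with the direct convolve, a quick check in the style of the specs added below (the sizes are arbitrary; the 1e-9 figure is the tolerance used in spec/helpers.rb):

    require 'convolver'

    signal = NArray.sfloat( 20, 20 ).random
    kernel = NArray.sfloat( 5, 5 ).random

    direct = Convolver.convolve( signal, kernel )
    fftw3  = Convolver.convolve_fftw3( signal, kernel )

    d = direct - fftw3
    ( d * d ).sum / d.size   # mean square error, expected to stay below 1e-9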
data/lib/convolver/version.rb
CHANGED
data/spec/convolve_fftw3_spec.rb
ADDED
@@ -0,0 +1,161 @@
+require 'helpers'
+
+describe Convolver do
+  describe "#convolve_fftw3" do
+
+    it "should work like the example in the README" do
+      a = NArray[ 0.3, 0.4, 0.5 ]
+      b = NArray[ 1.3, -0.5 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ 0.19, 0.27 ]
+    end
+
+    it "should convolve 1D arrays with a variety of signal and kernel lengths" do
+      a = NArray[ 0.3 ]
+      b = NArray[ -0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ -0.21 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2 ]
+      b = NArray[ -0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ -0.21, -0.28, -0.35, -0.14 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2 ]
+      b = NArray[ 1.1, -0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ 0.05, 0.09, 0.41 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2 ]
+      b = NArray[ 1.1, -0.7, -0.2 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ -0.05, 0.05 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2, 0.6 ]
+      b = NArray[ 1.1, -0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ 0.05, 0.09, 0.41, -0.2 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2, 0.6 ]
+      b = NArray[ 1.1, -0.7, 2.1 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ 1.1, 0.51, 1.67 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2, 0.6 ]
+      b = NArray[ 0.6, -0.5, -0.4, 0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ -0.08, 0.33 ]
+    end
+
+    it "should calculate a 2D convolution" do
+      a = NArray[ [ 0.3, 0.4, 0.5 ], [ 0.6, 0.8, 0.2 ], [ 0.9, 1.0, 0.1 ] ]
+      b = NArray[ [ 1.2, -0.5 ], [ 0.5, -1.3 ] ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ [ -0.58, 0.37 ], [ -0.53, 1.23 ] ]
+    end
+
+    it "should calculate a 3D convolution" do
+      # 5x4x3
+      a = NArray[
+        [ [ 1.0, 0.6, 1.1, 0.2, 0.9 ], [ 1.0, 0.7, 0.8, 1.0, 1.0 ], [ 0.2, 0.6, 0.1, 0.2, 0.5 ], [ 0.5, 0.9, 0.2, 0.1, 0.6 ] ],
+        [ [ 0.4, 0.9, 0.4, 0.0, 0.6 ], [ 0.2, 1.1, 0.2, 0.4, 0.1 ], [ 0.4, 0.2, 0.5, 0.8, 0.7 ], [ 0.1, 0.9, 0.7, 0.1, 0.3 ] ],
+        [ [ 0.8, 0.6, 1.0, 0.1, 0.4 ], [ 0.3, 0.8, 0.6, 0.7, 1.1 ], [ 0.9, 1.0, 0.3, 0.4, 0.6 ], [ 0.2, 0.5, 0.4, 0.7, 0.2 ] ]
+      ]
+
+      # 3x3x3
+      b = NArray[
+        [ [ -0.9, 1.2, 0.8 ], [ 0.9, 0.1, -0.5 ], [ 1.1, 0.1, -1.1 ] ],
+        [ [ -0.2, -1.0, 1.4 ], [ -1.4, 0.0, 1.3 ], [ 0.3, 1.0, -0.5 ] ],
+        [ [ 0.6, 0.0, 0.7 ], [ -0.7, 1.1, 1.2 ], [ 1.3, 0.7, 0.0 ] ]
+      ]
+
+      # Should be 3x2x1
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ [ [ 5.51, 3.04, 4.3 ], [ 3.04, 6.31, 3.87 ] ] ]
+    end
+
+    it "should calculate a 4D convolution" do
+      # 3x4x5x3
+      a = NArray[
+        [ [ [ 0.5, 0.4, 0.9 ], [ 0.1, 0.9, 0.8 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ] ],
+        [ [ [ 0.5, 0.4, 0.9 ], [ 0.1, 0.9, 0.8 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ] ],
+        [ [ [ 0.5, 0.4, 0.9 ], [ 0.1, 0.9, 0.8 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ] ] ]
+
+      # 2x3x3x2
+      b = NArray[ [
+          [ [ 1.1, 0.6 ], [ 1.2, 0.6 ], [ 0.8, 0.1 ] ], [ [ -0.4, 0.8 ], [ 0.5, 0.4 ], [ 1.2, 0.2 ] ],
+          [ [ 0.8, 0.2 ], [ 0.5, 0.0 ], [ 1.4, 1.3 ] ] ],
+        [ [ [ 1.1, 0.6 ], [ 1.2, 0.6 ], [ 0.8, 0.1 ] ], [ [ -0.4, 0.8 ], [ 0.5, 0.4 ], [ 1.2, 0.2 ] ],
+          [ [ 0.8, 0.2 ], [ 0.5, 0.0 ], [ 1.4, 1.3 ] ] ] ]
+
+      # Should be 2x2x3x2
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[
+        [ [ [ 8.5, 8.2 ], [ 11.34, 9.68 ] ], [ [ 7.68, 6.56 ], [ 11.24, 7.16 ] ], [ [ 9.14, 6.54 ], [ 12.44, 9.2 ] ] ],
+        [ [ [ 8.5, 8.2 ], [ 11.34, 9.68 ] ], [ [ 7.68, 6.56 ], [ 11.24, 7.16 ] ], [ [ 9.14, 6.54 ], [ 12.44, 9.2 ] ] ]
+      ]
+    end
+
+    describe "compared with #convolve" do
+      it "should produce same results for 1D arrays " do
+        (1..30).each do |signal_length|
+          (1..signal_length).each do |kernel_length|
+            signal = NArray.sfloat(signal_length).random()
+            kernel = NArray.sfloat(kernel_length).random()
+            expect_result = Convolver.convolve( signal, kernel )
+            got_result = Convolver.convolve_fftw3( signal, kernel )
+            got_result.should be_narray_like expect_result
+          end
+        end
+      end
+
+      it "should produce same results for 2D arrays " do
+        (3..10).each do |signal_x|
+          (signal_x-2..signal_x+2).each do |signal_y|
+            (1..signal_x).each do |kernel_x|
+              (1..signal_y).each do |kernel_y|
+                signal = NArray.sfloat(signal_x,signal_y).random()
+                kernel = NArray.sfloat(kernel_x,kernel_y).random()
+                expect_result = Convolver.convolve( signal, kernel )
+                got_result = Convolver.convolve_fftw3( signal, kernel )
+                got_result.should be_narray_like expect_result
+              end
+            end
+          end
+        end
+      end
+
+      it "should produce same results for 3D arrays " do
+        (3..5).each do |signal_x|
+          (signal_x-2..signal_x+2).each do |signal_y|
+            (signal_x-2..signal_x+2).each do |signal_z|
+              (1..signal_x).each do |kernel_x|
+                (1..signal_y).each do |kernel_y|
+                  (1..signal_z).each do |kernel_z|
+                    signal = NArray.sfloat(signal_x,signal_y,signal_z).random()
+                    kernel = NArray.sfloat(kernel_x,kernel_y,kernel_z).random()
+                    expect_result = Convolver.convolve( signal, kernel )
+                    got_result = Convolver.convolve_fftw3( signal, kernel )
+                    got_result.should be_narray_like expect_result
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
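The first entry of the 2-D expectation above can be verified by hand, which also shows that the kernel is applied un-flipped (correlation-style) rather than mirrored:

    # c[0, 0] pairs the kernel with the top-left 2x2 window of the signal:
    0.3 * 1.2 + 0.4 * -0.5 + 0.6 * 0.5 + 0.8 * -1.3
    # => -0.58 (to within float rounding)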
data/spec/helpers.rb
CHANGED
@@ -12,7 +12,7 @@ RSpec::Matchers.define :be_narray_like do |expected_narray|
     else
       d = given - expected_narray
       difference = ( d * d ).sum / d.size
-      if difference > 1e-
+      if difference > 1e-9
         @error = "Numerical difference with mean square error #{difference}"
       end
     end
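Restated outside RSpec, the matcher treats two NArrays as alike when their mean square error is tiny. A sketch using the 1e-9 threshold from the helper (narray_like? itself is hypothetical, not part of the gem):

    def narray_like?( given, expected )
      d = given - expected
      ( ( d * d ).sum / d.size ) <= 1e-9
    end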
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: convolver
 version: !ruby/object:Gem::Version
-  version: 0.0
+  version: 0.1.0
 platform: ruby
 authors:
 - Neil Slater
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-
+date: 2013-10-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: narray
@@ -24,6 +24,20 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: 0.6.0.8
+- !ruby/object:Gem::Dependency
+  name: fftw3
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0.3'
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
@@ -112,10 +126,17 @@ files:
 - benchmarks/convolver_vs_fftw3.rb
 - benchmarks/nn_layer_benchmark.rb
 - convolver.gemspec
+- ext/convolver/cnn_components.c
+- ext/convolver/cnn_components.h
+- ext/convolver/convolve_raw.c
+- ext/convolver/convolve_raw.h
 - ext/convolver/convolver.c
 - ext/convolver/extconf.rb
+- ext/convolver/narray_shared.c
+- ext/convolver/narray_shared.h
 - lib/convolver.rb
 - lib/convolver/version.rb
+- spec/convolve_fftw3_spec.rb
 - spec/convolver_spec.rb
 - spec/helpers.rb
 homepage: http://github.com/neilslater/convolver
@@ -143,6 +164,7 @@ signing_key:
 specification_version: 4
 summary: Convolution for NArray
 test_files:
+- spec/convolve_fftw3_spec.rb
 - spec/convolver_spec.rb
 - spec/helpers.rb
 has_rdoc: