convolver 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -0
- data/README.md +18 -5
- data/benchmarks/convolve_benchmark.rb +2 -2
- data/benchmarks/convolver_vs_fftw3.rb +17 -41
- data/convolver.gemspec +1 -0
- data/ext/convolver/cnn_components.c +52 -0
- data/ext/convolver/cnn_components.h +14 -0
- data/ext/convolver/convolve_raw.c +105 -0
- data/ext/convolver/convolve_raw.h +22 -0
- data/ext/convolver/convolver.c +35 -162
- data/ext/convolver/narray_shared.c +42 -0
- data/ext/convolver/narray_shared.h +22 -0
- data/lib/convolver.rb +41 -0
- data/lib/convolver/version.rb +1 -1
- data/spec/convolve_fftw3_spec.rb +161 -0
- data/spec/helpers.rb +1 -1
- metadata +24 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4d393cf7a6f7cd94485db0aaed706239c92ab0d0
+  data.tar.gz: 0944542524997227558ceb4f5c9b6968d3413e50
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1f263ae8f88d5318f84f9479c2540d25648396d09f5bfce65dcfc13122eaff26207280afa0a459d634408b5a526d6f302ebe750aebdd3106df3c91e7ac8af681
+  data.tar.gz: 35473c019dfb69abf0a0048df1b53f0c12e0512d1ebddc4b4dab4a7ececf7d8d0c35078ad92192d350b7c34bc5c4548a9446a2e5b69b0888c1b0c877445aea8b
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -5,9 +5,9 @@
 Adds a convolve operation to NArray floats. It is around 250 times faster than equivalents
 in pure Ruby.
 
-
-
-
+The gem makes convolution via FFTW3 library available. This is faster for convolutions with
+larger kernels and signals. The relationship is complex, but as a rule of thumb, the kernel
+needs to be around 1000 entries or larger before it is worth switching to FFTW3-based convolves.
 
 ## Planned features
 
@@ -17,6 +17,12 @@ calculating signal convolutions for other types of analysis.
 
 ## Installation
 
+### Dependency: FFTW3
+
+Before you install *convolver*, you should install FFTW3. See http://www.fftw.org/ for details.
+
+### Installing the gem
+
 Add this line to your application's Gemfile:
 
     gem 'convolver'
@@ -41,12 +47,19 @@ Basic convolution:
 * Convolver only works on single-precision floats internally. It will cast NArray types to this, if
 possible, prior to calculating.
 
-* The
-
+* The output is smaller than the input, each dimension is reduced by 1 less than the width of the
+kernel in the same dimension.
 
 * Convolver expects input a and kernel b to have the same rank, and for the kernel to be same size
 or smaller in all dimensions as the input.
 
+FFTW3 convolution:
+
+    a = NArray[0.3,0.4,0.5]
+    b = NArray[1.3, -0.5]
+    c = Convolver.convolve_fftw3( a, b )
+    => NArray.float(2): [ 0.19, 0.27 ]
+
 ## Contributing
 
 1. Fork it
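The README changes above say the output shrinks by one less than the kernel width in each dimension, and give 0.19 and 0.27 for the FFTW3 example. A quick pure-Ruby check of that arithmetic (illustrative only, using plain Arrays rather than NArray, and not part of the gem):

    # Valid-style convolution with no kernel flip, as in the README example.
    a = [0.3, 0.4, 0.5]
    b = [1.3, -0.5]
    # Output length = a.length - b.length + 1 = 2
    c = (0..(a.length - b.length)).map do |i|
      (0...b.length).map { |j| a[i + j] * b[j] }.reduce(:+)
    end
    # c => [0.19, 0.27] up to float rounding, matching both convolve and convolve_fftw3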
data/benchmarks/convolver_vs_fftw3.rb
CHANGED
@@ -1,70 +1,46 @@
 require 'convolver'
-require 'narray'
-require 'fftw3'
 require 'benchmark'
 
-# In Ruby for now, which is slower, but at least gets us ballpark figures (99% of the work is in the C)
-module FFTW3Convolver
-  def self.convolve orig_a, orig_b
-    combined_size = orig_a.size + orig_b.size - 1
-    left_pad_a = ( combined_size - orig_a.size + 1)/2
-    mod_a = NArray.float(combined_size)
-    mod_a[left_pad_a] = orig_a
-
-    mod_b = NArray.float(combined_size)
-    left_select_b = ( orig_b.size + 1 )/2
-    right_select_b = orig_b.size - left_select_b
-    mod_b[0] = orig_b[(0...left_select_b)].reverse
-    mod_b[-right_select_b] = orig_b[-right_select_b..-1].reverse
-
-    afft = FFTW3.fft(mod_a)
-    bfft = FFTW3.fft(mod_b)
-    cfft = afft * bfft
-
-    (FFTW3.ifft( cfft )/combined_size).real[left_pad_a...(left_pad_a+ orig_a.size - orig_b.size + 1)]
-  end
-end
-
 class Convolver2DBenchmark
   attr_reader :image, :kernel
 
   def initialize
     # These show Convolver.convolve as 3x faster than FFTW3
-
-
+    @image = NArray.sfloat(256 * 256).random
+    @kernel = NArray.sfloat(16 * 16).random
 
     # These are roughly even (10% advantage to FFTW3)
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(256 * 256).random
+    # @kernel = NArray.sfloat(32 * 32).random
 
     # These show FFTW3 as 4x faster than Convolver.convolve
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(256 * 256).random
+    # @kernel = NArray.sfloat(64 * 64).random
 
     # These show Convolver.convolve as 200x faster than FFTW3
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(50 * 64 * 64).random
+    # @kernel = NArray.sfloat(50 * 64 * 64).random
 
     # These show FFTW3 as 2x faster than Convolver.convolve
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(128 * 128).random
+    # @kernel = NArray.sfloat(64 * 64).random
 
     # These show FFTW3 and Convolver.convolve roughly equal
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(80 * 80).random
+    # @kernel = NArray.sfloat(64 * 64).random
 
     # These show FFTW3 as 2x faster than Convolver.convolve
-    # @image = NArray.
-    # @kernel = NArray.
+    # @image = NArray.sfloat(2 * 80 * 80).random
+    # @kernel = NArray.sfloat(2 * 64 * 64).random
 
     # These are roughly even - increasing size of image favours FFTW3
-
-
+    #@image = NArray.sfloat(2000 + 80 * 80).random
+    #@kernel = NArray.sfloat(80 * 80).random
  end
 end
 
 Benchmark.bm do |x|
   source = Convolver2DBenchmark.new
   x.report('convolver') { 100.times { Convolver.convolve( source.image, source.kernel ) } }
-  x.report('fftw3') { 100.times {
+  x.report('fftw3') { 100.times { Convolver.convolve_fftw3( source.image, source.kernel ) } }
 end
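The benchmark pairs above, together with the README rule of thumb of roughly 1000 kernel entries, suggest picking the routine from the kernel size. A hypothetical wrapper sketching that choice (not part of the gem; the threshold constant is an assumption to tune per machine):

    # Hypothetical helper: use the FFTW3 path only for large kernels.
    module ConvolverAuto
      KERNEL_SIZE_THRESHOLD = 1000

      def self.convolve( signal, kernel )
        if kernel.size >= KERNEL_SIZE_THRESHOLD
          Convolver.convolve_fftw3( signal, kernel )
        else
          Convolver.convolve( signal, kernel )
        end
      end
    end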
data/convolver.gemspec
CHANGED
data/ext/convolver/cnn_components.c
ADDED
@@ -0,0 +1,52 @@
+// ext/convolver/cnn_components.c
+
+#include <xmmintrin.h>
+#include "cnn_components.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Run a single fully-connected layer, calculating output from input
+//
+// Benchmark: 1024 inputs, 256 outputs. 1000 iterations. 0.56 seconds
+//
+//
+
+void nn_run_layer_raw( int in_size, int out_size,
+    float *in_ptr, float *weights, float *thresholds, float *out_ptr ) {
+  int i, j, in_aligned_size, out_aligned_size, offset;
+  __m128 simd_x, simd_y, simd_t;
+
+  in_aligned_size = 4 * ( in_size/4 );
+  out_aligned_size = 4 * ( out_size/4 );
+
+  // Calculate activation
+  for ( i = 0; i < out_size; i++ ) {
+
+    float t = 0.0;
+    simd_t = _mm_setzero_ps();
+    offset = i * in_size;
+
+    // Use SIMD for all the aligned values in groups of 4
+    for ( j = 0; j < in_aligned_size; j +=4 ) {
+      simd_x = _mm_load_ps( in_ptr + j );
+      // Weights might not align to 16 bytes due to size of layers
+      simd_y = _mm_loadu_ps( weights + (offset + j) );
+      simd_x = _mm_mul_ps( simd_x, simd_y );
+      simd_t = _mm_add_ps( simd_x, simd_t );
+    }
+
+    // Complete any remaining 1,2 or 3 items one at a time
+    for ( j = in_aligned_size; j < in_size; j++ ) {
+      t += in_ptr[ j ] * weights[ offset + j ];
+    }
+
+    out_ptr[i] = simd_t[0] + simd_t[1] + simd_t[2] + simd_t[3] + t;
+  }
+
+  for ( i = 0; i < out_size; i++ ) {
+    out_ptr[i] -= thresholds[i];
+    if ( out_ptr[i] < 0.0 ) { out_ptr[i] = 0.0; }
+  }
+
+  return;
+}
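For reference, what nn_run_layer_raw computes, restated as pure Ruby with plain Arrays (an illustrative sketch only; the C version above gets its speed from the SSE intrinsics):

    # Weighted sum per output neuron, threshold subtracted, then ReLU.
    # weights is laid out row-major: weights[i * in_size + j] connects input j to output i.
    def nn_run_layer_ref( inputs, weights, thresholds )
      in_size = inputs.size
      thresholds.each_index.map do |i|
        sum = 0.0
        in_size.times { |j| sum += inputs[j] * weights[i * in_size + j] }
        x = sum - thresholds[i]
        x < 0.0 ? 0.0 : x
      end
    end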
data/ext/convolver/cnn_components.h
ADDED
@@ -0,0 +1,14 @@
+// ext/convolver/cnn_components.h
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Declarations of narray helper functions
+//
+
+#ifndef CNN_COMPONENTS_H
+#define CNN_COMPONENTS_H
+
+void nn_run_layer_raw( int in_size, int out_size,
+    float *in_ptr, float *weights, float *thresholds, float *out_ptr );
+
+#endif
data/ext/convolver/convolve_raw.c
ADDED
@@ -0,0 +1,105 @@
+// ext/convolver/convolve_raw.c
+
+#include "convolve_raw.h"
+
+inline int size_from_shape( int rank, int *shape ) {
+  int size = 1;
+  int i;
+  for ( i = 0; i < rank; i++ ) { size *= shape[i]; }
+  return size;
+}
+
+// Sets reverse indices
+inline void corner_reset( int rank, int *shape, int *rev_indices ) {
+  int i;
+  for ( i = 0; i < rank; i++ ) { rev_indices[i] = shape[i] - 1; }
+  return;
+}
+
+// Counts indices down, returns number of ranks that reset
+inline int corner_dec( int rank, int *shape, int *rev_indices ) {
+  int i = 0;
+  while ( ! rev_indices[i]-- ) {
+    rev_indices[i] = shape[i] - 1;
+    i++;
+  }
+  return i;
+}
+
+// Generates co-increment steps by rank boundaries crossed, for the outer position as inner position is incremented by 1
+inline void calc_co_increment( int rank, int *outer_shape, int *inner_shape, int *co_increment ) {
+  int i, factor;
+  co_increment[0] = 1; // co-increment is always 1 in lowest rank
+  factor = 1;
+  for ( i = 0; i < rank; i++ ) {
+    co_increment[i+1] = co_increment[i] + factor * ( outer_shape[i] - inner_shape[i] );
+    factor *= outer_shape[i];
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Convolve
+//
+// Benchmark: 640x480 image, 8x8 kernel, 1000 iterations. 12.3 seconds.
+//
+
+void convolve_raw(
+    int in_rank, int *in_shape, float *in_ptr,
+    int kernel_rank, int *kernel_shape, float *kernel_ptr,
+    int out_rank, int *out_shape, float *out_ptr ) {
+  int i, j, in_size, kernel_size, kernel_aligned, out_size, offset;
+  int out_co_incr[LARGEST_RANK], kernel_co_incr[LARGEST_RANK];
+  int ker_q[LARGEST_RANK], out_q[LARGEST_RANK];
+  int *kernel_co_incr_cache;
+
+  in_size = size_from_shape( in_rank, in_shape );
+  kernel_size = size_from_shape( kernel_rank, kernel_shape );
+  kernel_aligned = 4 * (kernel_size/4);
+  out_size = size_from_shape( out_rank, out_shape );
+
+  calc_co_increment( in_rank, in_shape, out_shape, out_co_incr );
+  calc_co_increment( in_rank, in_shape, kernel_shape, kernel_co_incr );
+
+  kernel_co_incr_cache = ALLOC_N( int, kernel_size );
+  kernel_co_incr_cache[0] = 0;
+
+  corner_reset( kernel_rank, kernel_shape, ker_q );
+  for ( i = 1; i < kernel_size; i++ ) {
+    kernel_co_incr_cache[i] = kernel_co_incr_cache[i-1] + kernel_co_incr[ corner_dec( kernel_rank, kernel_shape, ker_q ) ];
+  }
+
+  // For convenience of flow, we set offset to -1 and adjust countdown 1 higher to compensate
+  offset = -1;
+  corner_reset( out_rank, out_shape, out_q );
+  out_q[0]++;
+
+  // Main convolve loop
+  for ( i = 0; i < out_size; i++ ) {
+    __m128 simd_x, simd_y, simd_t;
+    float t = 0.0;
+    simd_t = _mm_setzero_ps();
+
+    offset += out_co_incr[ corner_dec( out_rank, out_shape, out_q ) ];
+
+    // Use SIMD for all the aligned values in groups of 4
+    for ( j = 0; j < kernel_aligned; j +=4 ) {
+      simd_x = _mm_load_ps( kernel_ptr + j );
+      // Yes the backwards alignment is correct
+      simd_y = _mm_set_ps( in_ptr[ offset + kernel_co_incr_cache[j+3] ], in_ptr[ offset + kernel_co_incr_cache[j+2] ],
+          in_ptr[ offset + kernel_co_incr_cache[j+1] ], in_ptr[ offset + kernel_co_incr_cache[j] ] );
+      simd_x = _mm_mul_ps( simd_x, simd_y );
+      simd_t = _mm_add_ps( simd_x, simd_t );
+    }
+
+    // Complete any remaining 1,2 or 3 items one at a time
+    for ( j = kernel_aligned; j < kernel_size; j++ ) {
+      t += in_ptr[ offset + kernel_co_incr_cache[j] ] * kernel_ptr[ j ];
+    }
+
+    out_ptr[i] = simd_t[0] + simd_t[1] + simd_t[2] + simd_t[3] + t;
+  }
+
+  xfree( kernel_co_incr_cache );
+  return;
+}
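The co-increment bookkeeping above is a flat-index way of walking every output position and every kernel position of an N-dimensional "valid" convolution with no kernel flip. A pure-Ruby 2-D equivalent, as an illustrative sketch with nested Arrays (not part of the gem):

    # signal and kernel are arrays of rows; output shrinks by (kernel size - 1) per dimension.
    def convolve2d_ref( signal, kernel )
      kh, kw = kernel.size, kernel[0].size
      (0..(signal.size - kh)).map do |y|
        (0..(signal[0].size - kw)).map do |x|
          sum = 0.0
          kh.times { |ky| kw.times { |kx| sum += signal[y + ky][x + kx] * kernel[ky][kx] } }
          sum
        end
      end
    end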
data/ext/convolver/convolve_raw.h
ADDED
@@ -0,0 +1,22 @@
+// ext/convolver/convolve_raw.h
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Declarations of narray helper functions
+//
+
+#ifndef CONVOLVE_RAW_H
+#define CONVOLVE_RAW_H
+
+#include <ruby.h>
+#include <xmmintrin.h>
+#include "narray_shared.h"
+
+#define LARGEST_RANK 16
+
+void convolve_raw(
+    int in_rank, int *in_shape, float *in_ptr,
+    int kernel_rank, int *kernel_shape, float *kernel_ptr,
+    int out_rank, int *out_shape, float *out_ptr );
+
+#endif
data/ext/convolver/convolver.c
CHANGED
@@ -5,181 +5,53 @@
 #include <stdio.h>
 #include <xmmintrin.h>
 
-#
+#include "narray_shared.h"
+#include "convolve_raw.h"
+#include "cnn_components.h"
 
-
-inline int na_quick_idxs_to_pos( int rank, int *shape, int *idxs ) {
-  int i, pos = 0;
-  for ( i = rank - 1; i >= 0; i-- ) {
-    pos = pos * shape[i] + idxs[i];
-  }
-  return pos;
-}
-
-// This is inverse of above
-inline void na_quick_pos_to_idxs( int rank, int *shape, int pos, int *idxs ) {
-  int i;
-  for ( i = 0; i < rank; i++ ) {
-    idxs[ i ] = pos % shape[i];
-    pos /= shape[i];
-  }
-  return;
-}
+////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
-
-  int i;
-  for ( i = 0; i < rank; i++ ) { size *= shape[i]; }
-  return size;
-}
+// To hold the module object
+VALUE Convolver = Qnil;
 
-
-
-
-
-
-}
+static VALUE narray_fit_backwards( VALUE self, VALUE a, VALUE b ) {
+  struct NARRAY *na_a, *na_b;
+  volatile VALUE val_a, val_b;
+  int target_rank, i;
+  int shift_by[LARGEST_RANK];
 
-
-
-  int i = 0;
-  while ( ! rev_indices[i]-- ) {
-    rev_indices[i] = shape[i] - 1;
-    i++;
-  }
-  return i;
-}
+  val_a = na_cast_object(a, NA_SFLOAT);
+  GetNArray( val_a, na_a );
 
-
-
-  int i, factor;
-  co_increment[0] = 1; // co-increment is always 1 in lowest rank
-  factor = 1;
-  for ( i = 0; i < rank; i++ ) {
-    co_increment[i+1] = co_increment[i] + factor * ( outer_shape[i] - inner_shape[i] );
-    factor *= outer_shape[i];
-  }
-}
+  val_b = na_cast_object(b, NA_SFLOAT);
+  GetNArray( val_b, na_b );
 
-
-
-// Convolve
-//
-// Benchmark: 640x480 image, 8x8 kernel, 1000 iterations. 12.3 seconds.
-//
-
-void convolve_raw(
-    int in_rank, int *in_shape, float *in_ptr,
-    int kernel_rank, int *kernel_shape, float *kernel_ptr,
-    int out_rank, int *out_shape, float *out_ptr ) {
-  int i, j, in_size, kernel_size, kernel_aligned, out_size, offset;
-  int out_co_incr[LARGEST_RANK], kernel_co_incr[LARGEST_RANK];
-  int ker_q[LARGEST_RANK], out_q[LARGEST_RANK];
-  int *kernel_co_incr_cache;
-
-  in_size = size_from_shape( in_rank, in_shape );
-  kernel_size = size_from_shape( kernel_rank, kernel_shape );
-  kernel_aligned = 4 * (kernel_size/4);
-  out_size = size_from_shape( out_rank, out_shape );
-
-  calc_co_increment( in_rank, in_shape, out_shape, out_co_incr );
-  calc_co_increment( in_rank, in_shape, kernel_shape, kernel_co_incr );
-
-  kernel_co_incr_cache = ALLOC_N( int, kernel_size );
-  kernel_co_incr_cache[0] = 0;
-
-  corner_reset( kernel_rank, kernel_shape, ker_q );
-  for ( i = 1; i < kernel_size; i++ ) {
-    kernel_co_incr_cache[i] = kernel_co_incr_cache[i-1] + kernel_co_incr[ corner_dec( kernel_rank, kernel_shape, ker_q ) ];
+  if ( na_a->rank != na_b->rank ) {
+    rb_raise( rb_eArgError, "narray a must have equal rank to narray b (a rank %d, b rank %d)", na_a->rank, na_b->rank );
   }
 
-
-
-  corner_reset( out_rank, out_shape, out_q );
-  out_q[0]++;
-
-  // Main convolve loop
-  for ( i = 0; i < out_size; i++ ) {
-    __m128 simd_x, simd_y, simd_t;
-    float t = 0.0;
-    simd_t = _mm_setzero_ps();
-
-    offset += out_co_incr[ corner_dec( out_rank, out_shape, out_q ) ];
-
-    // Use SIMD for all the aligned values in groups of 4
-    for ( j = 0; j < kernel_aligned; j +=4 ) {
-      simd_x = _mm_load_ps( kernel_ptr + j );
-      // Yes the backwards alignment is correct
-      simd_y = _mm_set_ps( in_ptr[ offset + kernel_co_incr_cache[j+3] ], in_ptr[ offset + kernel_co_incr_cache[j+2] ],
-          in_ptr[ offset + kernel_co_incr_cache[j+1] ], in_ptr[ offset + kernel_co_incr_cache[j] ] );
-      simd_x = _mm_mul_ps( simd_x, simd_y );
-      simd_t = _mm_add_ps( simd_x, simd_t );
-    }
-
-    // Complete any remaining 1,2 or 3 items one at a time
-    for ( j = kernel_aligned; j < kernel_size; j++ ) {
-      t += in_ptr[ offset + kernel_co_incr_cache[j] ] * kernel_ptr[ j ];
-    }
-
-    out_ptr[i] = simd_t[0] + simd_t[1] + simd_t[2] + simd_t[3] + t;
+  if ( na_a->rank > LARGEST_RANK ) {
+    rb_raise( rb_eArgError, "exceeded maximum narray rank for convolve of %d", LARGEST_RANK );
   }
 
-
-  return;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Neural net
-//
-// Benchmark: 1024 inputs, 256 outputs. 1000 iterations. 0.56 seconds
-//
-//
-
-void nn_run_layer_raw( int in_size, int out_size,
-    float *in_ptr, float *weights, float *thresholds, float *out_ptr ) {
-  int i, j, in_aligned_size, out_aligned_size, offset;
-  __m128 simd_x, simd_y, simd_t;
-
-  in_aligned_size = 4 * ( in_size/4 );
-  out_aligned_size = 4 * ( out_size/4 );
-
-  // Calculate activation
-  for ( i = 0; i < out_size; i++ ) {
-
-    float t = 0.0;
-    simd_t = _mm_setzero_ps();
-    offset = i * in_size;
-
-    // Use SIMD for all the aligned values in groups of 4
-    for ( j = 0; j < in_aligned_size; j +=4 ) {
-      simd_x = _mm_load_ps( in_ptr + j );
-      // Weights might not align to 16 bytes due to size of layers
-      simd_y = _mm_loadu_ps( weights + (offset + j) );
-      simd_x = _mm_mul_ps( simd_x, simd_y );
-      simd_t = _mm_add_ps( simd_x, simd_t );
-    }
+  target_rank = na_a->rank;
 
-
-
-
+  for ( i = 0; i < target_rank; i++ ) {
+    if ( ( na_a->shape[i] - na_b->shape[i] ) < 0 ) {
+      rb_raise( rb_eArgError, "no space for backward fit" );
     }
-
-    out_ptr[i] = simd_t[0] + simd_t[1] + simd_t[2] + simd_t[3] + t;
+    shift_by[i] = na_b->shape[i] >> 1;
   }
 
-
-
-
-
+  fit_backwards_raw(
+    target_rank,
+    na_a->shape, (float*) na_a->ptr,
+    na_b->shape, (float*) na_b->ptr,
+    shift_by );
 
-  return;
+  return Qnil;
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// To hold the module object
-VALUE Convolver = Qnil;
 
 /* @overload convolve( signal, kernel )
  * Calculates convolution of an array of floats representing a signal, with a second array representing
@@ -233,10 +105,10 @@ static VALUE narray_convolve( VALUE self, VALUE a, VALUE b ) {
 * Calculates activations of a fully-connected neural network layer. The transfer function after
 * summing weights and applying threshold is a "ReLU", equivalent to
 *     y = x < 0.0 ? 0.0 : x
-* this is less sophisticated than many neural net
-* train.
+* this is less sophisticated than many other neural net functions (such as sigma), but is fast to
+* calculate and to train.
 * @param [NArray] inputs must be rank 1 array of floats
-* @param [NArray] weights must be rank 2 array of floats, with first
+* @param [NArray] weights must be rank 2 array of floats, with first dimension size of inputs, and second dimension size equal to number of outputs
 * @param [NArray] thresholds must be rank 1 array of floats, size equal to number of outputs desired
 * @return [NArray] neuron activations
 */
@@ -266,7 +138,7 @@ static VALUE narray_nn_run_single_layer( VALUE self, VALUE inputs, VALUE weights
   val_thresholds = na_cast_object(thresholds, NA_SFLOAT);
   GetNArray( val_thresholds, na_thresholds );
   if ( na_thresholds->rank != 1 ) {
-    rb_raise( rb_eArgError, "thresholds must be
+    rb_raise( rb_eArgError, "thresholds must be narray of rank 1" );
   }
   if ( na_thresholds->shape[0] != output_size ) {
     rb_raise( rb_eArgError, "thresholds expected size %d, but got %d", output_size, na_thresholds->shape[0] );
@@ -287,4 +159,5 @@ void Init_convolver() {
   Convolver = rb_define_module( "Convolver" );
   rb_define_singleton_method( Convolver, "convolve", narray_convolve, 2 );
   rb_define_singleton_method( Convolver, "nn_run_layer", narray_nn_run_single_layer, 3 );
+  rb_define_singleton_method( Convolver, "fit_kernel_backwards", narray_fit_backwards, 2 );
 }
data/ext/convolver/narray_shared.c
ADDED
@@ -0,0 +1,42 @@
+// ext/convolver/narray_shared.c
+
+#include "narray_shared.h"
+
+// This is copied from na_array.c, with safety checks and temp vars removed
+int na_quick_idxs_to_pos( int rank, int *shape, int *idxs ) {
+  int i, pos = 0;
+  for ( i = rank - 1; i >= 0; i-- ) {
+    pos = pos * shape[i] + idxs[i];
+  }
+  return pos;
+}
+
+// This is inverse of above
+void na_quick_pos_to_idxs( int rank, int *shape, int pos, int *idxs ) {
+  int i;
+  for ( i = 0; i < rank; i++ ) {
+    idxs[ i ] = pos % shape[i];
+    pos /= shape[i];
+  }
+  return;
+}
+
+// used to place kernel data into array for FFTW3 processing
+void fit_backwards_raw( int rank, int *dst_shape, float *dst, int *src_shape, float *src, int *shift_shape ) {
+  int i, j, size, x;
+  int k_idx[16], dst_idx[16];
+
+  size = 1;
+  for ( j = 0; j < rank; j++ ) { size *= src_shape[j]; }
+
+  for ( i = 0; i < size; i++ ) {
+    na_quick_pos_to_idxs( rank, src_shape, i, k_idx );
+    for ( j = 0; j < rank; j++ ) {
+      x = src_shape[j] - shift_shape[j] - k_idx[j] - 1;
+      if ( x < 0 ) x = x + dst_shape[j];
+      dst_idx[j] = x;
+    }
+    dst[ na_quick_idxs_to_pos( rank, dst_shape, dst_idx ) ] = src[i];
+  }
+  return;
+}
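The index arithmetic in fit_backwards_raw is easiest to see in one dimension. An illustrative pure-Ruby trace (plain Arrays, dst pre-zeroed, shift taken as src.size / 2 just as narray_fit_backwards does; not part of the gem):

    def fit_backwards_1d( dst, src, shift )
      src.each_index do |i|
        x = src.size - shift - i - 1
        x += dst.size if x < 0   # wrap negative positions around the end of dst
        dst[x] = src[i]
      end
      dst
    end

    fit_backwards_1d( [0.0] * 4, [1.3, -0.5], 1 )
    # => [1.3, 0.0, 0.0, -0.5] - the kernel reversed and wrapped, which is the
    # layout convolve_fftw3 multiplies against in frequency space.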
data/ext/convolver/narray_shared.h
ADDED
@@ -0,0 +1,22 @@
+// ext/convolver/narray_shared.h
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Declarations of narray helper functions
+//
+
+#ifndef CONVOLVER_NARRAY_SHARED_H
+#define CONVOLVER_NARRAY_SHARED_H
+
+#include <ruby.h>
+#include "narray.h"
+
+// This is copied from na_array.c, with safety checks and temp vars removed
+int na_quick_idxs_to_pos( int rank, int *shape, int *idxs );
+
+// This is inverse of above
+void na_quick_pos_to_idxs( int rank, int *shape, int pos, int *idxs );
+
+void fit_backwards_raw( int rank, int *dst_shape, float *dst, int *src_shape, float *src, int *shift_shape );
+
+#endif
data/lib/convolver.rb
CHANGED
@@ -1,7 +1,48 @@
 require 'narray'
 require "convolver/convolver"
 require "convolver/version"
+require 'fftw3'
 
 module Convolver
+  # Uses FFTW3 library to calculate convolution of an array of floats representing a signal,
+  # with a second array representing a kernel. The two parameters must have the same rank.
+  # The output has same rank, its size in each dimension d is given by
+  #   signal.shape[d] - kernel.shape[d] + 1
+  # @param [NArray] signal must be same size or larger than kernel in each dimension
+  # @param [NArray] kernel must be same size or smaller than signal in each dimension
+  # @return [NArray] result of convolving signal with kernel
+  def self.convolve_fftw3 signal, kernel
+    combined_shape, shift_by, ranges = fft_offsets( signal.shape, kernel.shape )
 
+    mod_a = NArray.sfloat(*combined_shape)
+    mod_a[*shift_by] = signal
+
+    mod_b = NArray.sfloat(*combined_shape)
+
+    Convolver.fit_kernel_backwards( mod_b, kernel )
+
+    afreqs = FFTW3.fft(mod_a)
+    bfreqs = FFTW3.fft(mod_b)
+    cfreqs = afreqs * bfreqs
+
+    (FFTW3.ifft( cfreqs ).real * (1.0/mod_a.size))[*ranges]
+  end
+
+  private
+
+  def self.fft_offsets signal_shape, kernel_shape
+    combined_shape = []
+    shift_by = []
+    ranges = []
+    signal_shape.each_with_index do |signal_size, i|
+      kernel_size = kernel_shape[i]
+
+      combined_shape[i] = signal_size + kernel_size - 1
+      output_size = signal_size - kernel_size + 1
+      output_offset = kernel_size - 1
+      shift_by[i] = kernel_size / 2
+      ranges[i] = (output_offset...(output_offset + output_size))
+    end
+    [ combined_shape, shift_by, ranges ]
+  end
 end
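Tracing fft_offsets for the 1-D README example (signal length 3, kernel length 2) may help: combined_shape is [4], shift_by is [1] and ranges is [1...3]. The signal is zero-padded to [0.0, 0.3, 0.4, 0.5], fit_kernel_backwards lays the kernel out as [1.3, 0.0, 0.0, -0.5], and the circular convolution computed through the FFTs produces 0.19 and 0.27 at positions 1 and 2, which the final range slice returns. A worked check of one of those positions (illustrative arithmetic only, plain Arrays):

    # Circular convolution at index 1 of the two padded length-4 buffers:
    mod_a = [0.0, 0.3, 0.4, 0.5]
    mod_b = [1.3, 0.0, 0.0, -0.5]
    c1 = (0..3).map { |k| mod_a[k] * mod_b[(1 - k) % 4] }.reduce(:+)
    # => 0.19 (0.3 * 1.3 + 0.4 * -0.5), the first entry of the README result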
data/lib/convolver/version.rb
CHANGED
data/spec/convolve_fftw3_spec.rb
ADDED
@@ -0,0 +1,161 @@
+require 'helpers'
+
+describe Convolver do
+  describe "#convolve_fftw3" do
+
+    it "should work like the example in the README" do
+      a = NArray[ 0.3, 0.4, 0.5 ]
+      b = NArray[ 1.3, -0.5 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ 0.19, 0.27 ]
+    end
+
+    it "should convolve 1D arrays with a variety of signal and kernel lengths" do
+      a = NArray[ 0.3 ]
+      b = NArray[ -0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ -0.21 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2 ]
+      b = NArray[ -0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ -0.21, -0.28, -0.35, -0.14 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2 ]
+      b = NArray[ 1.1, -0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ 0.05, 0.09, 0.41 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2 ]
+      b = NArray[ 1.1, -0.7, -0.2 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ -0.05, 0.05 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2, 0.6 ]
+      b = NArray[ 1.1, -0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ 0.05, 0.09, 0.41, -0.2 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2, 0.6 ]
+      b = NArray[ 1.1, -0.7, 2.1 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ 1.1, 0.51, 1.67 ]
+
+      a = NArray[ 0.3, 0.4, 0.5, 0.2, 0.6 ]
+      b = NArray[ 0.6, -0.5, -0.4, 0.7 ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ -0.08, 0.33 ]
+    end
+
+    it "should calculate a 2D convolution" do
+      a = NArray[ [ 0.3, 0.4, 0.5 ], [ 0.6, 0.8, 0.2 ], [ 0.9, 1.0, 0.1 ] ]
+      b = NArray[ [ 1.2, -0.5 ], [ 0.5, -1.3 ] ]
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ [ -0.58, 0.37 ], [ -0.53, 1.23 ] ]
+    end
+
+    it "should calculate a 3D convolution" do
+      # 5x4x3
+      a = NArray[
+        [ [ 1.0, 0.6, 1.1, 0.2, 0.9 ], [ 1.0, 0.7, 0.8, 1.0, 1.0 ], [ 0.2, 0.6, 0.1, 0.2, 0.5 ], [ 0.5, 0.9, 0.2, 0.1, 0.6 ] ],
+        [ [ 0.4, 0.9, 0.4, 0.0, 0.6 ], [ 0.2, 1.1, 0.2, 0.4, 0.1 ], [ 0.4, 0.2, 0.5, 0.8, 0.7 ], [ 0.1, 0.9, 0.7, 0.1, 0.3 ] ],
+        [ [ 0.8, 0.6, 1.0, 0.1, 0.4 ], [ 0.3, 0.8, 0.6, 0.7, 1.1 ], [ 0.9, 1.0, 0.3, 0.4, 0.6 ], [ 0.2, 0.5, 0.4, 0.7, 0.2 ] ]
+      ]
+
+      # 3x3x3
+      b = NArray[
+        [ [ -0.9, 1.2, 0.8 ], [ 0.9, 0.1, -0.5 ], [ 1.1, 0.1, -1.1 ] ],
+        [ [ -0.2, -1.0, 1.4 ], [ -1.4, 0.0, 1.3 ], [ 0.3, 1.0, -0.5 ] ],
+        [ [ 0.6, 0.0, 0.7 ], [ -0.7, 1.1, 1.2 ], [ 1.3, 0.7, 0.0 ] ]
+      ]
+
+      # Should be 3x2x1
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[ [ [ 5.51, 3.04, 4.3 ], [ 3.04, 6.31, 3.87 ] ] ]
+    end
+
+    it "should calculate a 4D convolution" do
+      # 3x4x5x3
+      a = NArray[
+        [ [ [ 0.5, 0.4, 0.9 ], [ 0.1, 0.9, 0.8 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ] ],
+        [ [ [ 0.5, 0.4, 0.9 ], [ 0.1, 0.9, 0.8 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ] ],
+        [ [ [ 0.5, 0.4, 0.9 ], [ 0.1, 0.9, 0.8 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ],
+          [ [ 0.0, 0.4, 0.0 ], [ 0.2, 0.3, 0.8 ], [ 0.6, 0.3, 0.2 ], [ 0.7, 0.4, 0.3 ] ],
+          [ [ 0.3, 0.3, 0.1 ], [ 0.6, 0.9, 0.4 ], [ 0.4, 0.0, 0.1 ], [ 0.8, 0.3, 0.4 ] ] ] ]
+
+      # 2x3x3x2
+      b = NArray[ [
+        [ [ 1.1, 0.6 ], [ 1.2, 0.6 ], [ 0.8, 0.1 ] ], [ [ -0.4, 0.8 ], [ 0.5, 0.4 ], [ 1.2, 0.2 ] ],
+        [ [ 0.8, 0.2 ], [ 0.5, 0.0 ], [ 1.4, 1.3 ] ] ],
+        [ [ [ 1.1, 0.6 ], [ 1.2, 0.6 ], [ 0.8, 0.1 ] ], [ [ -0.4, 0.8 ], [ 0.5, 0.4 ], [ 1.2, 0.2 ] ],
+        [ [ 0.8, 0.2 ], [ 0.5, 0.0 ], [ 1.4, 1.3 ] ] ] ]
+
+      # Should be 2x2x3x2
+      c = Convolver.convolve_fftw3( a, b )
+      c.should be_narray_like NArray[
+        [ [ [ 8.5, 8.2 ], [ 11.34, 9.68 ] ], [ [ 7.68, 6.56 ], [ 11.24, 7.16 ] ], [ [ 9.14, 6.54 ], [ 12.44, 9.2 ] ] ],
+        [ [ [ 8.5, 8.2 ], [ 11.34, 9.68 ] ], [ [ 7.68, 6.56 ], [ 11.24, 7.16 ] ], [ [ 9.14, 6.54 ], [ 12.44, 9.2 ] ] ]
+      ]
+    end
+
+    describe "compared with #convolve" do
+      it "should produce same results for 1D arrays " do
+        (1..30).each do |signal_length|
+          (1..signal_length).each do |kernel_length|
+            signal = NArray.sfloat(signal_length).random()
+            kernel = NArray.sfloat(kernel_length).random()
+            expect_result = Convolver.convolve( signal, kernel )
+            got_result = Convolver.convolve_fftw3( signal, kernel )
+            got_result.should be_narray_like expect_result
+          end
+        end
+      end
+
+      it "should produce same results for 2D arrays " do
+        (3..10).each do |signal_x|
+          (signal_x-2..signal_x+2).each do |signal_y|
+            (1..signal_x).each do |kernel_x|
+              (1..signal_y).each do |kernel_y|
+                signal = NArray.sfloat(signal_x,signal_y).random()
+                kernel = NArray.sfloat(kernel_x,kernel_y).random()
+                expect_result = Convolver.convolve( signal, kernel )
+                got_result = Convolver.convolve_fftw3( signal, kernel )
+                got_result.should be_narray_like expect_result
+              end
+            end
+          end
+        end
+      end
+
+      it "should produce same results for 3D arrays " do
+        (3..5).each do |signal_x|
+          (signal_x-2..signal_x+2).each do |signal_y|
+            (signal_x-2..signal_x+2).each do |signal_z|
+              (1..signal_x).each do |kernel_x|
+                (1..signal_y).each do |kernel_y|
+                  (1..signal_z).each do |kernel_z|
+                    signal = NArray.sfloat(signal_x,signal_y,signal_z).random()
+                    kernel = NArray.sfloat(kernel_x,kernel_y,kernel_z).random()
+                    expect_result = Convolver.convolve( signal, kernel )
+                    got_result = Convolver.convolve_fftw3( signal, kernel )
+                    got_result.should be_narray_like expect_result
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
data/spec/helpers.rb
CHANGED
@@ -12,7 +12,7 @@ RSpec::Matchers.define :be_narray_like do |expected_narray|
     else
       d = given - expected_narray
       difference = ( d * d ).sum / d.size
-      if difference > 1e-
+      if difference > 1e-9
         @error = "Numerical difference with mean square error #{difference}"
       end
     end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: convolver
 version: !ruby/object:Gem::Version
-  version: 0.0
+  version: 0.1.0
 platform: ruby
 authors:
 - Neil Slater
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-
+date: 2013-10-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: narray
@@ -24,6 +24,20 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: 0.6.0.8
+- !ruby/object:Gem::Dependency
+  name: fftw3
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0.3'
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
@@ -112,10 +126,17 @@ files:
 - benchmarks/convolver_vs_fftw3.rb
 - benchmarks/nn_layer_benchmark.rb
 - convolver.gemspec
+- ext/convolver/cnn_components.c
+- ext/convolver/cnn_components.h
+- ext/convolver/convolve_raw.c
+- ext/convolver/convolve_raw.h
 - ext/convolver/convolver.c
 - ext/convolver/extconf.rb
+- ext/convolver/narray_shared.c
+- ext/convolver/narray_shared.h
 - lib/convolver.rb
 - lib/convolver/version.rb
+- spec/convolve_fftw3_spec.rb
 - spec/convolver_spec.rb
 - spec/helpers.rb
 homepage: http://github.com/neilslater/convolver
@@ -143,6 +164,7 @@ signing_key:
 specification_version: 4
 summary: Convolution for NArray
 test_files:
+- spec/convolve_fftw3_spec.rb
 - spec/convolver_spec.rb
 - spec/helpers.rb
 has_rdoc: