svm_helper 0.1.1 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +1 -0
  3. data/Guardfile +1 -1
  4. data/lib/svm_helper.rb +1 -2
  5. data/lib/svm_helper/parallel_helper.rb +24 -0
  6. data/lib/svm_helper/preprocessors.rb +1 -0
  7. data/lib/svm_helper/preprocessors/simple.rb +31 -22
  8. data/lib/svm_helper/preprocessors/stemming.rb +31 -0
  9. data/lib/svm_helper/selectors.rb +4 -2
  10. data/lib/svm_helper/selectors/bi_normal_seperation.rb +86 -0
  11. data/lib/svm_helper/selectors/bns_ig.rb +50 -0
  12. data/lib/svm_helper/selectors/calc.rb +71 -0
  13. data/lib/svm_helper/selectors/information_gain.rb +49 -0
  14. data/lib/svm_helper/selectors/simple.rb +80 -40
  15. data/lib/svm_helper/stopwords/fr +124 -0
  16. data/lib/svm_helper/version.rb +1 -1
  17. data/spec/factories.rb +4 -3
  18. data/spec/support/selector_spec.rb +2 -4
  19. data/spec/svm_helper/parallel_helper_spec.rb +17 -0
  20. data/spec/{preprocessors → svm_helper/preprocessors}/id_mapping_spec.rb +0 -0
  21. data/spec/{preprocessors → svm_helper/preprocessors}/simple_spec.rb +17 -6
  22. data/spec/svm_helper/preprocessors/stemming_spec.rb +11 -0
  23. data/spec/svm_helper/selectors/bi_normal_seperation_spec.rb +35 -0
  24. data/spec/svm_helper/selectors/bns_ig_spec.rb +5 -0
  25. data/spec/svm_helper/selectors/calc_spec.rb +42 -0
  26. data/spec/svm_helper/selectors/information_gain_spec.rb +5 -0
  27. data/spec/{selectors → svm_helper/selectors}/n_gram_spec.rb +2 -4
  28. data/spec/{selectors → svm_helper/selectors}/simple_spec.rb +15 -4
  29. data/spec/{selectors → svm_helper/selectors}/with_binary_encoding_spec.rb +2 -3
  30. data/svm_helper.gemspec +1 -1
  31. metadata +39 -32
  32. data/lib/svm_helper/selectors/n_gram.rb +0 -31
  33. data/lib/svm_helper/selectors/with_binary_encoding.rb +0 -41
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 377f21c5f2bb4431166019336b71ad3892bc35ac
4
+ data.tar.gz: 02973ce1db9e6720bbe216649b533e7f5b9d35c9
5
+ SHA512:
6
+ metadata.gz: 818e5bdb6fbfb12e3ca7a0a2f19a1dae46c63646ddfb79eccba9cdc3ba5906d13e004c5bea5cef24099ea8c75a04d14a619e13fa9cfe351c9777439d056da2cc
7
+ data.tar.gz: 176044f5c9662e590855152576dee2d4f00da1a7cf123001ed9cbce5eca1624571c90494302390e8860e75ba2f83e158ab986a509b6a58f263f6ef225dfcd0c8
data/Gemfile CHANGED
@@ -21,4 +21,5 @@ group :test do
21
21
  gem 'rake'
22
22
  gem 'mocha', require: 'mocha/api'
23
23
  gem 'factory_girl', '~> 4.0'
24
+ gem 'parallel', require: false
24
25
  end
data/Guardfile CHANGED
@@ -1,7 +1,7 @@
1
1
  guard 'rspec', cli: "--color --format p", all_after_pass: false do
2
2
  # guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
3
3
  watch(%r{^spec/.+_spec\.rb$})
4
- watch(%r{^lib/svm_helper/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
4
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
5
5
  watch('spec/spec_helper.rb') { 'spec' }
6
6
  watch('spec/factories.rb') { 'spec' }
7
7
  watch(%r{^spec/factories/(.+)\.rb}) { 'spec' }
@@ -1,8 +1,7 @@
1
1
  require "svm_helper/version"
2
- require 'parallel'
3
2
 
3
+ require "svm_helper/parallel_helper"
4
4
  require "svm_helper/preprocessed_data"
5
5
  require "svm_helper/feature_vector"
6
6
  require "svm_helper/preprocessors"
7
7
  require "svm_helper/selectors"
8
-
@@ -0,0 +1,24 @@
1
+ module ParallelHelper
2
+ THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
3
+ def p_map_with_index data, &block
4
+ if parallel? && RUBY_PLATFORM == 'java'
5
+ Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
6
+ elsif parallel?
7
+ Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
8
+ else
9
+ data.map.with_index {|e,i| yield e,i }
10
+ end
11
+ end
12
+ def p_map data, &block
13
+ if parallel? && RUBY_PLATFORM == 'java'
14
+ Parallel.map(data, in_threads: THREAD_COUNT ){|e| yield e }
15
+ elsif parallel?
16
+ Parallel.map(data, in_processes: THREAD_COUNT ){|e| yield e }
17
+ else
18
+ data.map {|e| yield e }
19
+ end
20
+ end
21
+ def parallel?
22
+ defined?(Parallel) == 'constant' && @parallel
23
+ end
24
+ end
@@ -1,2 +1,3 @@
1
1
  require_relative 'preprocessors/simple'
2
+ require_relative 'preprocessors/stemming'
2
3
  require_relative 'preprocessors/id_mapping'
@@ -6,7 +6,7 @@ module Preprocessor
6
6
  # @author Andreas Eger
7
7
  #
8
8
  class Simple
9
- THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
9
+ include ::ParallelHelper
10
10
  # filters most gender stuff
11
11
  GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
12
12
  # filters most weird symbols
@@ -25,8 +25,16 @@ module Preprocessor
25
25
  # filter for used job tokens
26
26
  CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
27
27
 
28
+ # stopword file
29
+ #TODO use File.expand_path
30
+ STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
31
+ attr_accessor :language
32
+
33
+
28
34
  def initialize args={}
35
+ @language = args.fetch(:language){'en'}
29
36
  @parallel = args.fetch(:parallel){false}
37
+ @stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split
30
38
  end
31
39
 
32
40
  def label
@@ -48,12 +56,20 @@ module Preprocessor
48
56
  # @return [Array<PreprocessedData>] list of processed job data - or single job data
49
57
  def process jobs
50
58
  if jobs.is_a? Array
51
- process_jobs jobs
59
+ p_map(jobs) {|job| process_job job }
52
60
  else
53
61
  process_job jobs
54
62
  end
55
63
  end
56
64
 
65
+ #
66
+ # loads a txt file with stop words
67
+ # @param location String folder with stopword lists
68
+ #
69
+ # @return [Array<String>] Array of stopwords
70
+ def strip_stopwords(text)
71
+ (text.split - @stopwords).delete_if { |e| e.size <= 2 }
72
+ end
57
73
 
58
74
  #
59
75
  # converts string into a cleaner version
@@ -75,29 +91,22 @@ module Preprocessor
75
91
  #
76
92
  # @return [String] clean and lowercase version of input
77
93
  def clean_description desc
78
- desc.gsub(XML_TAG_FILTER,' ')
79
- .gsub(EMAIL_FILTER,'')
80
- .gsub(URL_FILTER,'')
81
- .gsub(GENDER_FILTER,'')
82
- .gsub(NEW_LINES,'')
83
- .gsub(SYMBOL_FILTER,' ')
84
- .gsub(WHITESPACE,' ')
85
- .gsub(WORDS_IN_BRACKETS, '\1')
86
- .gsub(CODE_TOKEN_FILTER,'')
87
- .downcase
88
- .strip
94
+ strip_stopwords(
95
+ desc.gsub(XML_TAG_FILTER,' ')
96
+ .gsub(EMAIL_FILTER,'')
97
+ .gsub(URL_FILTER,'')
98
+ .gsub(GENDER_FILTER,'')
99
+ .gsub(NEW_LINES,'')
100
+ .gsub(SYMBOL_FILTER,' ')
101
+ .gsub(WHITESPACE,' ')
102
+ .gsub(WORDS_IN_BRACKETS, '\1')
103
+ .gsub(CODE_TOKEN_FILTER,'')
104
+ .downcase
105
+ .strip
106
+ )
89
107
  end
90
108
 
91
109
  private
92
- def process_jobs jobs
93
- if @parallel && RUBY_PLATFORM == 'java'
94
- Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
95
- elsif @parallel
96
- Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
97
- else
98
- jobs.map {|job| process_job job }
99
- end
100
- end
101
110
 
102
111
  def process_job job
103
112
  PreprocessedData.new(
@@ -0,0 +1,31 @@
1
+ require_relative 'simple'
2
+ require 'lingua/stemmer'
3
+ module Preprocessor
4
+ #
5
+ # Preprocessor Base Class
6
+ #
7
+ # @author Andreas Eger
8
+ #
9
+ class Stemming < Simple
10
+
11
+ def initialize(args={})
12
+ super
13
+ @stemmer = Lingua::Stemmer.new(language: @language)
14
+ end
15
+ def label
16
+ "with_stemming"
17
+ end
18
+
19
+ def clean_description desc
20
+ super.map{|w| @stemmer.stem(w) }
21
+ end
22
+ private
23
+ def process_job job
24
+ PreprocessedData.new(
25
+ data: [clean_title(job[:title]), clean_description(job[:description])],
26
+ id: job[:id],
27
+ label: job[:label]
28
+ )
29
+ end
30
+ end
31
+ end
@@ -1,3 +1,5 @@
1
1
  require_relative 'selectors/simple'
2
- require_relative 'selectors/n_gram'
3
- require_relative 'selectors/with_binary_encoding'
2
+ require_relative 'selectors/calc'
3
+ require_relative 'selectors/bi_normal_seperation'
4
+ require_relative 'selectors/information_gain'
5
+ require_relative 'selectors/bns_ig'
@@ -0,0 +1,86 @@
1
+ require_relative 'simple'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class BiNormalSeperation < Selector::Simple
8
+ include BNS
9
+
10
+ def label
11
+ "BiNormalSeperation"
12
+ end
13
+
14
+ def initialize classification, args={}
15
+ super
16
+ @word_selection = args.fetch(:word_selection){ :grams1_2 }
17
+ end
18
+ #
19
+ # generates a list of feature vectors and their labels from preprocessed data
20
+ # @param data_set [Array<PreprocessedData>] list of preprocessed data
21
+ # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
22
+ # @param dictionary_size [Integer] Size of a dictionary to create if non exists
23
+ #
24
+ # @return [Array<FeatureVector>] list of feature vectors and labels
25
+ def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
26
+ words_and_label_per_data = extract_words data_set, true
27
+ generate_global_dictionary words_and_label_per_data, dictionary_size
28
+
29
+ words_per_data = words_and_label_per_data.map(&:features)
30
+ p_map_with_index(words_per_data) do |words,index|
31
+ word_set = words.uniq
32
+ make_vector word_set, data_set[index]
33
+ end
34
+ end
35
+
36
+ #
37
+ # generates a list of words used as dictionary
38
+ # @param all_words (see #extract_words)
39
+ # @param size dictionary size
40
+ #
41
+ # @return [Array<String>] list of words
42
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
43
+ return unless global_dictionary.empty?
44
+
45
+ label_counts = [0,0]
46
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
47
+ label = bag.label ? 1 : 0
48
+ label_counts[label] += 1
49
+ # only count a feature once per bag
50
+ bag.features.uniq.each do |word|
51
+ unless accumulator.has_key?(word)
52
+ accumulator[word] = [0,0]
53
+ end
54
+ accumulator[word][label] += 1
55
+ end
56
+ accumulator
57
+ end
58
+ neg, pos = label_counts
59
+ words = p_map(features) do |word, counts|
60
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
61
+ bns = bi_normal_seperation(pos, neg, *counts)
62
+ [word, bns.abs]
63
+ end
64
+ @global_dictionary = words.compact
65
+ .sort_by{|e| e[1]}
66
+ .last(size)
67
+ .map{|e| e[0] }
68
+ end
69
+
70
+ def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
71
+ words_per_data = extract_words data_set, true
72
+ generate_global_dictionary words_per_data, dictionary_size
73
+ end
74
+ #
75
+ # extracts the words of all provided data entries
76
+ # @param data_set [Array<PreprocessedData>] list of preprocessed data
77
+ # @param keep_label
78
+ #
79
+ # @return [Array<OpenStruct<Array<String>,Boolean>>] list of words per data entry
80
+ def extract_words data_set, keep_label=false
81
+ data_set.map do |data|
82
+ extract_words_from_data data, keep_label
83
+ end
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,50 @@
1
+ require_relative 'bi_normal_seperation'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class BNS_IG < Selector::BiNormalSeperation
8
+ include IG
9
+
10
+ def label
11
+ "BiNormalSeperation_InformationGain"
12
+ end
13
+
14
+ #
15
+ # generates a list of words used as dictionary
16
+ # @param all_words (see #extract_words)
17
+ # @param size dictionary size
18
+ #
19
+ # @return [Array<String>] list of words
20
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
21
+ return unless global_dictionary.empty?
22
+
23
+ label_counts = [0,0]
24
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
25
+ label = bag.label ? 1 : 0
26
+ label_counts[label] += 1
27
+ # only count a feature once per bag
28
+ bag.features.uniq.each do |word|
29
+ unless accumulator.has_key?(word)
30
+ accumulator[word] = [0,0]
31
+ end
32
+ accumulator[word][label] += 1
33
+ end
34
+ accumulator
35
+ end
36
+ neg, pos = label_counts
37
+ words = p_map(features) do |word, counts|
38
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
39
+ bns = bi_normal_seperation(pos, neg, *counts)
40
+ ig = information_gain(pos, neg, *counts)
41
+ # use geometric mean of BNS and IG
42
+ [word, Math.sqrt(bns.abs * ig.abs)]
43
+ end
44
+ @global_dictionary = words.compact
45
+ .sort_by{|e| e[1]}
46
+ .last(size)
47
+ .map{|e| e[0] }
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,71 @@
1
+ module Selector
2
+ module IG
3
+ def information_gain(pos, neg, tp, fp)
4
+ fn = neg - fp
5
+ tn = pos - tp
6
+ p_word = (tp + fp).quo(pos + neg)
7
+
8
+ e(pos, neg) - (p_word * e(tp, fp) + (1 - p_word) * e(fn, tn))
9
+ end
10
+ def e(x,y)
11
+ -xlx(x.quo(x+y)) -xlx(y.quo(x+y))
12
+ end
13
+ def xlx(x)
14
+ x * Math.log2(x)
15
+ end
16
+ end
17
+ module BNS
18
+ SQR2 = Math.sqrt(2)
19
+ SQR2PI = Math.sqrt(2.0*Math::PI)
20
+
21
+ def bi_normal_seperation pos, neg, tp, fp
22
+ false_prositive_rate = fp.quo(neg)
23
+ true_prositive_rate = tp.quo(pos)
24
+ bns = cdf_inverse(true_prositive_rate) - cdf_inverse(false_prositive_rate)
25
+ end
26
+ # standard normal cumulative distribution function
27
+ def cdf(z)
28
+ 0.5 * (1.0 + Math.erf( z.quo(SQR2) ) )
29
+ end
30
+
31
+ # inverse standard normal cumulative distribution function
32
+ # http://home.online.no/~pjacklam/notes/invnorm
33
+
34
+ # Coefficients in rational approximations.
35
+ A = [0, -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00]
36
+ B = [0, -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01]
37
+ C = [0, -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00]
38
+ D = [0, 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00]
39
+ # Define break-points.
40
+ P_LOW = 0.02425
41
+ P_HIGH = 1.0 - P_LOW
42
+
43
+ def cdf_inverse(p)
44
+ return 0.0 if p < 0 || p > 1 || p == 0.5
45
+ x = 0.0
46
+
47
+ if 0.0 < p && p < P_LOW
48
+ # Rational approximation for lower region.
49
+ q = Math.sqrt(-2.0*Math.log(p))
50
+ x = (((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
51
+ ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
52
+ elsif P_LOW <= p && p <= P_HIGH
53
+ # Rational approximation for central region.
54
+ q = p - 0.5
55
+ r = q*q
56
+ x = (((((A[1]*r+A[2])*r+A[3])*r+A[4])*r+A[5])*r+A[6])*q /
57
+ (((((B[1]*r+B[2])*r+B[3])*r+B[4])*r+B[5])*r+1.0)
58
+ elsif P_HIGH < p && p < 1.0
59
+ # Rational approximation for upper region.
60
+ q = Math.sqrt(-2.0*Math.log(1.0-p))
61
+ x = -(((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
62
+ ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
63
+ end
64
+ if 0 < p && p < 1
65
+ u = cdf(p) * SQR2PI * Math.exp((x**2.0)/2.0)
66
+ x = x - u/(1.0 + x*u/2.0)
67
+ end
68
+ x
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,49 @@
1
+ require_relative 'bi_normal_seperation'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class InformationGain < Selector::BiNormalSeperation
8
+ include IG
9
+
10
+ def label
11
+ "InformationGain"
12
+ end
13
+
14
+ #
15
+ # generates a list of words used as dictionary
16
+ # @param all_words (see #extract_words)
17
+ # @param size dictionary size
18
+ #
19
+ # @return [Array<String>] list of words
20
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
21
+ return unless global_dictionary.empty?
22
+
23
+ label_counts = [0,0]
24
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
25
+ label = bag.label ? 1 : 0
26
+ label_counts[label] += 1
27
+ # only count a feature once per bag
28
+ bag.features.uniq.each do |word|
29
+ unless accumulator.has_key?(word)
30
+ accumulator[word] = [0,0]
31
+ end
32
+ accumulator[word][label] += 1
33
+ end
34
+ accumulator
35
+ end
36
+ neg, pos = label_counts
37
+ words = p_map(features) do |word, counts|
38
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
39
+ tp, fp = counts
40
+ ig = information_gain(pos, neg, tp, fp)
41
+ [word, ig.abs]
42
+ end
43
+ @global_dictionary = words.compact
44
+ .sort_by{|e| e[1]}
45
+ .last(size)
46
+ .map{|e| e[0] }
47
+ end
48
+ end
49
+ end
@@ -5,31 +5,20 @@ module Selector
5
5
  # @author Andreas Eger
6
6
  #
7
7
  class Simple
8
- THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
9
- # stopword file
10
- #TODO use File.expand_path
11
- STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
8
+ include ::ParallelHelper
12
9
  # default dictionary size
13
10
  DEFAULT_DICTIONARY_SIZE = 800
14
11
 
15
- CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
16
- { function: Pjpp::Function.count,
17
- industry: Pjpp::Industry.count,
18
- career_level: Pjpp::CareerLevel.count }
19
- else
20
- { function: 19, # 1..19
21
- industry: 632, # 1..14370 but not all ids used
22
- career_level: 8 } # 1..8
23
- end
24
-
25
-
26
-
27
12
  attr_accessor :global_dictionary
28
-
13
+ attr_reader :classification_encoding,
14
+ :gram_size,
15
+ :word_selection
29
16
  def initialize classification, args={}
30
17
  @classification = classification
31
18
  @global_dictionary = args.fetch(:global_dictionary) {[]}
32
- @language = args.fetch(:language){'en'}
19
+ @classification_encoding = args.fetch(:classification_encoding){:bitmap}
20
+ @word_selection = args.fetch(:word_selection){ :single }
21
+ @gram_size = args.fetch(:gram_size) { 1 }
33
22
  @parallel = args.fetch(:parallel){false}
34
23
  end
35
24
 
@@ -48,7 +37,7 @@ module Selector
48
37
  words_per_data = extract_words data_set
49
38
  generate_global_dictionary words_per_data, dictionary_size
50
39
 
51
- make_vectors(words_per_data) do |words,index|
40
+ p_map_with_index(words_per_data) do |words,index|
52
41
  word_set = words.uniq
53
42
  make_vector word_set, data_set[index]
54
43
  end
@@ -66,15 +55,6 @@ module Selector
66
55
  make_vector word_set, data, dictionary
67
56
  end
68
57
 
69
- #
70
- # loads a txt file with stop words
71
- # @param location String folder with stopword lists
72
- #
73
- # @return [Array<String>] Array of stopwords
74
- def stopwords(location=STOPWORD_LOCATION)
75
- @stopwords ||= IO.read(File.join(location,@language)).split
76
- end
77
-
78
58
  #
79
59
  # generates a list of words used as dictionary
80
60
  # @param all_words (see #extract_words)
@@ -90,6 +70,10 @@ module Selector
90
70
  @global_dictionary = words.last(size).map(&:first).reverse
91
71
  end
92
72
 
73
+ def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
74
+ words_per_data = extract_words data_set
75
+ generate_global_dictionary words_per_data, dictionary_size
76
+ end
93
77
  #
94
78
  # extracts the words of all provided data entries
95
79
  # @param data_set [Array<PreprocessedData>] list of preprocessed data
@@ -107,7 +91,46 @@ module Selector
107
91
  #
108
92
  # @return [Array<String>] list of words
109
93
  def extract_words_from_data data
110
- (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
94
+ words = (data.data.flat_map(&:split) - stopwords)
95
+ .delete_if { |e| e.size <= 2 }
96
+ if gram_size > 1
97
+ words = words.each_cons(@gram_size).map{|e| e.join " " }
98
+ end
99
+ words
100
+ end
101
+
102
+ #
103
+ # fetches all words and two word phrases from one data entry, removes stopwords and very short words
104
+ # @param data [PreprocessedData] preprocessed data entry
105
+ # @param keep_label
106
+ #
107
+ # @return [OpenStruct<Array<String>,Boolean>] list of words
108
+ def extract_words_from_data data, keep_label=false
109
+ # assume the first token is the title and preserve it
110
+ title, *words = data.data.flatten
111
+ features = case word_selection
112
+ when :grams
113
+ words.each_cons(@gram_size).map{|e| e.join " " }
114
+ when :grams1_2
115
+ words + words.each_cons(2).map{|e| e.join " " }
116
+ when :grams1_2_3
117
+ words +
118
+ words.each_cons(2).map{|e| e.join " " } +
119
+ words.each_cons(3).map{|e| e.join " " }
120
+ when :grams1_2_3_4
121
+ words +
122
+ words.each_cons(2).map{|e| e.join " " } +
123
+ words.each_cons(3).map{|e| e.join " " } +
124
+ words.each_cons(4).map{|e| e.join " " }
125
+ else
126
+ words
127
+ end
128
+ features.unshift(title)
129
+ return features unless keep_label
130
+ OpenStruct.new(
131
+ features: features,
132
+ label: data.label
133
+ )
111
134
  end
112
135
 
113
136
  def reset classification
@@ -135,23 +158,40 @@ module Selector
135
158
  )
136
159
  end
137
160
 
138
- def make_vectors data, &block
139
- if @parallel && RUBY_PLATFORM == 'java'
140
- Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
141
- elsif @parallel
142
- Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
143
- else
144
- data.map.with_index {|e,i| yield e,i }
145
- end
146
- end
161
+ BITMAP_ARRAY_SIZES= if defined?(Pjpp) == 'constant'
162
+ { function: Pjpp::Function.count,
163
+ industry: Pjpp::Industry.count,
164
+ career_level: Pjpp::CareerLevel.count }
165
+ else
166
+ { function: 19, # 1..19
167
+ industry: 632, # 1..14370 but not all ids used
168
+ career_level: 8 } # 1..8
169
+ end
147
170
 
171
+ BINARY_ARRAY_SIZES = {
172
+ function: 8, # max id 255, currently 19
173
+ industry: 16, # max id 65535, currently 14370
174
+ career_level: 4 } # max id 15, currently 8
148
175
  #
149
176
  # creates the classification specific part of the feature vector
150
177
  # @param ids [Hash] hash with classification ids
151
178
  #
152
179
  # @return [Array<Integer>] list of size=count(classifcation_ids) with only one not zero item
153
180
  def classification_array(id)
154
- Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0}
181
+ case @classification_encoding
182
+ when :binary
183
+ number_to_binary_array(id, BINARY_ARRAY_SIZES[@classification])
184
+ else # :bitmap
185
+ Array.new(BITMAP_ARRAY_SIZES[@classification]){|n| n==(id-1) ? 1 : 0}
186
+ end
187
+ end
188
+
189
+ def number_to_binary_array(number, size=8)
190
+ a=[]
191
+ (size-1).downto(0) do |i|
192
+ a<<number[i]
193
+ end
194
+ a
155
195
  end
156
196
  end
157
- end
197
+ end
@@ -0,0 +1,124 @@
1
+ alors
2
+ au
3
+ aucuns
4
+ aussi
5
+ autre
6
+ avant
7
+ avec
8
+ avoir
9
+ bon
10
+ car
11
+ ce
12
+ cela
13
+ ces
14
+ ceux
15
+ chaque
16
+ ci
17
+ comme
18
+ comment
19
+ dans
20
+ des
21
+ du
22
+ dedans
23
+ dehors
24
+ depuis
25
+ deux
26
+ devrait
27
+ doit
28
+ donc
29
+ dos
30
+ droite
31
+ début
32
+ elle
33
+ elles
34
+ en
35
+ encore
36
+ essai
37
+ est
38
+ et
39
+ eu
40
+ fait
41
+ faites
42
+ fois
43
+ font
44
+ force
45
+ haut
46
+ hors
47
+ ici
48
+ il
49
+ ils
50
+ je juste
51
+ la
52
+ le
53
+ les
54
+ leur
55
+
56
+ ma
57
+ maintenant
58
+ mais
59
+ mes
60
+ mine
61
+ moins
62
+ mon
63
+ mot
64
+ même
65
+ ni
66
+ nommés
67
+ notre
68
+ nous
69
+ nouveaux
70
+ ou
71
+
72
+ par
73
+ parce
74
+ parole
75
+ pas
76
+ personnes
77
+ peut
78
+ peu
79
+ pièce
80
+ plupart
81
+ pour
82
+ pourquoi
83
+ quand
84
+ que
85
+ quel
86
+ quelle
87
+ quelles
88
+ quels
89
+ qui
90
+ sa
91
+ sans
92
+ ses
93
+ seulement
94
+ si
95
+ sien
96
+ son
97
+ sont
98
+ sous
99
+ soyez sujet
100
+ sur
101
+ ta
102
+ tandis
103
+ tellement
104
+ tels
105
+ tes
106
+ ton
107
+ tous
108
+ tout
109
+ trop
110
+ très
111
+ tu
112
+ valeur
113
+ voie
114
+ voient
115
+ vont
116
+ votre
117
+ vous
118
+ vu
119
+ ça
120
+ étaient
121
+ état
122
+ étions
123
+ été
124
+ être
@@ -1,3 +1,3 @@
1
1
  module SvmHelper
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.1"
3
3
  end
@@ -18,14 +18,15 @@ FactoryGirl.define do
18
18
 
19
19
 
20
20
  factory :data, class: PreprocessedData do
21
- data ["haus fooo garten baaz pferd fooo"]
21
+ data %w(haus fooo garten baaz pferd fooo)
22
22
  id 7
23
23
  label true
24
24
  end
25
25
  factory :data_w_short_words, parent: :data do
26
- data ["auto foo pferd bz gooo fooo 2"]
26
+ data %w(auto pferd gooo fooo)
27
+ label false
27
28
  end
28
29
  factory :data_w_multiple_sections, parent: :data do
29
- data ["meeh foo auto","bz baaz fooo 2"]
30
+ data [%w(meeh auto),%w(baaz fooo)]
30
31
  end
31
32
  end
@@ -13,9 +13,7 @@ shared_examples_for 'a selector' do
13
13
  [0,1].should include(e)
14
14
  end
15
15
  end
16
- it "should be able to process multiple data entries at once" do
17
- selector.generate_vectors([data]).each do |e|
18
- e.should == selector.generate_vector(data)
19
- end
16
+ it "should respond to generate_vectors" do
17
+ selector.should respond_to(:generate_vectors)
20
18
  end
21
19
  end
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+ require 'parallel'
3
+
4
+ include ParallelHelper
5
+ describe ParallelHelper do
6
+ let(:data) { (1..20).to_a }
7
+ context "parallel map" do
8
+ it "should return as a normal map" do
9
+ p_map(data){|e| e**2 }.should == data.map{|e| e**2 }
10
+ end
11
+ end
12
+ context "parallel map with index" do
13
+ it "should return as a normal map with index" do
14
+ p_map_with_index(data){|e,i| e*i }.should == data.map.with_index{|e,i| e*i }
15
+ end
16
+ end
17
+ end
@@ -41,6 +41,7 @@ describe Preprocessor::Simple do
41
41
  end
42
42
  end
43
43
 
44
+
44
45
  context "#clean_title" do
45
46
  it "should be downcased" do
46
47
  job = FactoryGirl.build(:job_title_downcasing)
@@ -75,31 +76,41 @@ describe Preprocessor::Simple do
75
76
  FactoryGirl.build(:job_description_w_code_token),
76
77
  FactoryGirl.build(:job_description_w_gender) ]
77
78
  }
79
+ it "should call strip_stopwords" do
80
+ simple.expects(:strip_stopwords)
81
+ simple.clean_description(jobs[0][:description])
82
+ end
78
83
  it "should remove html/xml tags" do
79
- desc = simple.clean_description(jobs[0][:description])
84
+ desc = simple.clean_description(jobs[0][:description]).join ' '
80
85
  desc.should_not match(/<(.*?)>/)
81
86
  end
82
87
  it "should remove new lines" do
83
- desc = simple.clean_description(jobs[0][:description])
88
+ desc = simple.clean_description(jobs[0][:description]).join ' '
84
89
  desc.should_not match(/\r\n|\n|\r/)
85
90
  end
86
91
  it "should remove all special characters" do
87
- desc = simple.clean_description(jobs[2][:description])
92
+ desc = simple.clean_description(jobs[2][:description]).join ' '
88
93
  desc.should_not match(/[^a-z öäü]/i)
89
94
  end
90
95
  it "should remove gender tokens" do
91
- desc = simple.clean_description(jobs[3][:description])
96
+ desc = simple.clean_description(jobs[3][:description]).join ' '
92
97
  desc.should_not match(%r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)})
93
98
  end
94
99
  it "should remove job code token" do
95
- desc = simple.clean_description(jobs[4][:description])
100
+ desc = simple.clean_description(jobs[4][:description]).join ' '
96
101
  desc.should_not match(/\[.*\]|\(.*\)|\{.*\}|\d+\w+/)
97
102
  end
98
103
  it "should be downcased" do
99
- desc = simple.clean_description(jobs[2][:description])
104
+ desc = simple.clean_description(jobs[2][:description]).join ' '
100
105
  desc.should_not match(/[^a-z öäü]/)
101
106
  end
102
107
  end
108
+
109
+ context "strip_stopwords" do
110
+ it "should remove words like 'and' from the text" do
111
+ simple.strip_stopwords("Dogs and cats").should == %w(Dogs cats)
112
+ end
113
+ end
103
114
  context "parallel" do
104
115
  let(:parallel) { Preprocessor::Simple.new(parallel: true) }
105
116
 
@@ -0,0 +1,11 @@
1
+ require 'spec_helper'
2
+
3
+ describe Preprocessor::Stemming do
4
+ it_behaves_like 'a preprocessor'
5
+ let(:preprocessor) { Preprocessor::Stemming.new }
6
+ let(:job) { FactoryGirl.build(:job) }
7
+ let(:jobs) { [job] }
8
+ it "should reduce words to their stem" do
9
+ preprocessor.clean_description("developer engineering").should == %w(develop engin)
10
+ end
11
+ end
@@ -0,0 +1,35 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::BiNormalSeperation do
4
+ it_behaves_like 'a selector'
5
+
6
+ let(:bns) { Selector::BiNormalSeperation.new(:function) }
7
+ context "#extract_words_from_data" do
8
+ it "should generate a list of words from the data" do
9
+ words = bns.extract_words_from_data(FactoryGirl.build(:data))
10
+ words.should have(10).things
11
+ end
12
+ it "should remove words with 3 characters or less" do
13
+ words = bns.extract_words_from_data(FactoryGirl.build(:data_w_short_words))
14
+ words.should have(6).things
15
+ end
16
+ it "should process multiple sections in the data" do
17
+ words = bns.extract_words_from_data(FactoryGirl.build(:data_w_multiple_sections))
18
+ words.should have(6).things
19
+ end
20
+ end
21
+ context "#generate_global_dictionary" do
22
+ let(:data) { [FactoryGirl.build_list(:data,1),
23
+ FactoryGirl.build_list(:data_w_short_words,4),
24
+ FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
25
+ let(:words_per_data) { bns.extract_words(data,true) }
26
+ it "should return a list of n words" do
27
+ bns.generate_global_dictionary(words_per_data,2)
28
+ bns.global_dictionary.should have(2).things
29
+ end
30
+ it "should return a list of the n most used words in the data array" do
31
+ bns.generate_global_dictionary(words_per_data,3)
32
+ bns.global_dictionary.should eq(%w(fooo auto pferd))
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,5 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::BNS_IG do
4
+ it_behaves_like 'a selector'
5
+ end
@@ -0,0 +1,42 @@
1
+ require 'spec_helper'
2
+
3
+ # just some very basic test to make sure these functions do not fail
4
+ describe "Calc" do
5
+ include Selector::IG
6
+ include Selector::BNS
7
+ let(:test_data){ [
8
+ [34, 23, 28, 17],
9
+ [31, 17, 23, 12],
10
+ [44, 39, 41, 36],
11
+ [44, 23, 41, 23],
12
+ [44, 39, 0, 36],
13
+ [44, 39, 41, 0],
14
+ [62, 81, 15, 73]
15
+ ]}
16
+
17
+ context Selector::IG do
18
+ it "should not fail" do
19
+ test_data.each do |data|
20
+ ->{information_gain(*data)}.should_not raise_error
21
+ end
22
+ end
23
+ it "should return some values" do
24
+ test_data.each do |data|
25
+ information_gain(*data).should be_a(Numeric)
26
+ end
27
+ end
28
+ end
29
+
30
+ context Selector::BNS do
31
+ it "should not fail" do
32
+ test_data.each do |data|
33
+ ->{bi_normal_seperation(*data)}.should_not raise_error
34
+ end
35
+ end
36
+ it "should return some values" do
37
+ test_data.each do |data|
38
+ bi_normal_seperation(*data).should be_a(Numeric)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,5 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::InformationGain do
4
+ it_behaves_like 'a selector'
5
+ end
@@ -1,9 +1,7 @@
1
1
  require "spec_helper"
2
2
 
3
- describe Selector::NGram do
4
- it_behaves_like 'a selector'
5
-
6
- let(:ngram) { Selector::NGram.new(:function, gram_size: 3) }
3
+ describe "n-grams" do
4
+ let(:ngram) { Selector::Simple.new(:function, word_selection: :grams, gram_size: 3) }
7
5
  context "#extract_words_from_data" do
8
6
  it "should generate a list of words from the data" do
9
7
  words = ngram.extract_words_from_data(FactoryGirl.build(:data))
@@ -7,9 +7,6 @@ describe Selector::Simple do
7
7
  it "should have select_feature_vector implemented" do
8
8
  expect { simple.generate_vectors([]) }.to_not raise_error
9
9
  end
10
- context "#stopwords" do
11
- it "simply loads them from a file"
12
- end
13
10
  context "#extract_words_from_data" do
14
11
  it "should generate a list of words from the data" do
15
12
  words = simple.extract_words_from_data(FactoryGirl.build(:data))
@@ -50,6 +47,19 @@ describe Selector::Simple do
50
47
  simple.global_dictionary.should eq(%w(fooo auto baaz))
51
48
  end
52
49
  end
50
+ context "#build_dictionary" do
51
+ let(:data) { [FactoryGirl.build_list(:data,1),
52
+ FactoryGirl.build_list(:data_w_short_words,2),
53
+ FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
54
+ it "should return a list of n words" do
55
+ simple.build_dictionary(data,2)
56
+ simple.global_dictionary.should have(2).things
57
+ end
58
+ it "should return a list of the n most used words in the data array" do
59
+ simple.build_dictionary(data,3)
60
+ simple.global_dictionary.should eq(%w(fooo auto baaz))
61
+ end
62
+ end
53
63
  context "#generate_vector" do
54
64
  let(:dictionary) { %w(auto pferd haus hase garten) }
55
65
  let(:data) { FactoryGirl.build(:data) }
@@ -109,6 +119,7 @@ describe Selector::Simple do
109
119
  context "parallel" do
110
120
  let(:parallel) { Selector::Simple.new(:function, parallel: true) }
111
121
  before(:each) do
122
+ require 'parallel'
112
123
  simple.stubs(:global_dictionary).returns(dictionary)
113
124
  parallel.stubs(:global_dictionary).returns(dictionary)
114
125
  end
@@ -119,4 +130,4 @@ describe Selector::Simple do
119
130
  end
120
131
  end
121
132
  end
122
- end
133
+ end
@@ -1,8 +1,7 @@
1
1
  require "spec_helper"
2
2
 
3
- describe Selector::WithBinaryEncoding do
4
- it_behaves_like 'a selector'
5
- let(:simple) { Selector::WithBinaryEncoding.new(:career_level) }
3
+ describe "binary encoded classification" do
4
+ let(:simple) { Selector::Simple.new(:career_level, classification_encoding: :binary) }
6
5
 
7
6
  let(:dictionary) { %w(auto pferd haus hase garten) }
8
7
  let(:data) { FactoryGirl.build(:data) }
@@ -17,5 +17,5 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
- gem.add_dependency('parallel', '~> 0.6.2')
20
+ gem.add_dependency "ruby-stemmer"
21
21
  end
metadata CHANGED
@@ -1,32 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: svm_helper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
5
- prerelease:
4
+ version: 0.2.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Andreas Eger
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-03-15 00:00:00.000000000 Z
11
+ date: 2013-04-25 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
- name: parallel
14
+ name: ruby-stemmer
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - '>='
20
18
  - !ruby/object:Gem::Version
21
- version: 0.6.2
19
+ version: '0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ~>
24
+ - - '>='
28
25
  - !ruby/object:Gem::Version
29
- version: 0.6.2
26
+ version: '0'
30
27
  description: Shared helper classes for usage in context of SVM at experteer
31
28
  email:
32
29
  - dev@eger-andreas.de
@@ -48,16 +45,21 @@ files:
48
45
  - lib/svm_helper.rb
49
46
  - lib/svm_helper/feature_vector.rb
50
47
  - lib/svm_helper/interface_helper.rb
48
+ - lib/svm_helper/parallel_helper.rb
51
49
  - lib/svm_helper/preprocessed_data.rb
52
50
  - lib/svm_helper/preprocessors.rb
53
51
  - lib/svm_helper/preprocessors/id_mapping.rb
54
52
  - lib/svm_helper/preprocessors/simple.rb
53
+ - lib/svm_helper/preprocessors/stemming.rb
55
54
  - lib/svm_helper/selectors.rb
56
- - lib/svm_helper/selectors/n_gram.rb
55
+ - lib/svm_helper/selectors/bi_normal_seperation.rb
56
+ - lib/svm_helper/selectors/bns_ig.rb
57
+ - lib/svm_helper/selectors/calc.rb
58
+ - lib/svm_helper/selectors/information_gain.rb
57
59
  - lib/svm_helper/selectors/simple.rb
58
- - lib/svm_helper/selectors/with_binary_encoding.rb
59
60
  - lib/svm_helper/stopwords/de
60
61
  - lib/svm_helper/stopwords/en
62
+ - lib/svm_helper/stopwords/fr
61
63
  - lib/svm_helper/version.rb
62
64
  - spec/factories.rb
63
65
  - spec/factories/jobs/tmp.html
@@ -65,44 +67,43 @@ files:
65
67
  - spec/factories/jobs/tmp3.html
66
68
  - spec/factories/jobs_with_description.rb
67
69
  - spec/factories/jobs_with_title.rb
68
- - spec/preprocessors/id_mapping_spec.rb
69
- - spec/preprocessors/simple_spec.rb
70
- - spec/selectors/n_gram_spec.rb
71
- - spec/selectors/simple_spec.rb
72
- - spec/selectors/with_binary_encoding_spec.rb
73
70
  - spec/spec_helper.rb
74
71
  - spec/support/preprocessor_spec.rb
75
72
  - spec/support/selector_spec.rb
73
+ - spec/svm_helper/parallel_helper_spec.rb
74
+ - spec/svm_helper/preprocessors/id_mapping_spec.rb
75
+ - spec/svm_helper/preprocessors/simple_spec.rb
76
+ - spec/svm_helper/preprocessors/stemming_spec.rb
77
+ - spec/svm_helper/selectors/bi_normal_seperation_spec.rb
78
+ - spec/svm_helper/selectors/bns_ig_spec.rb
79
+ - spec/svm_helper/selectors/calc_spec.rb
80
+ - spec/svm_helper/selectors/information_gain_spec.rb
81
+ - spec/svm_helper/selectors/n_gram_spec.rb
82
+ - spec/svm_helper/selectors/simple_spec.rb
83
+ - spec/svm_helper/selectors/with_binary_encoding_spec.rb
76
84
  - svm_helper.gemspec
77
85
  homepage: https://github.com/sch1zo/svm_helper
78
86
  licenses: []
87
+ metadata: {}
79
88
  post_install_message:
80
89
  rdoc_options: []
81
90
  require_paths:
82
91
  - lib
83
92
  required_ruby_version: !ruby/object:Gem::Requirement
84
- none: false
85
93
  requirements:
86
94
  - - '>='
87
95
  - !ruby/object:Gem::Version
88
96
  version: '0'
89
- segments:
90
- - 0
91
- hash: 2037039748537332986
92
97
  required_rubygems_version: !ruby/object:Gem::Requirement
93
- none: false
94
98
  requirements:
95
99
  - - '>='
96
100
  - !ruby/object:Gem::Version
97
101
  version: '0'
98
- segments:
99
- - 0
100
- hash: 2037039748537332986
101
102
  requirements: []
102
103
  rubyforge_project:
103
- rubygems_version: 1.8.25
104
+ rubygems_version: 2.0.0.rc.2
104
105
  signing_key:
105
- specification_version: 3
106
+ specification_version: 4
106
107
  summary: Preprocessor and Selector classes to generate FeatureVectors from Job data
107
108
  test_files:
108
109
  - spec/factories.rb
@@ -111,12 +112,18 @@ test_files:
111
112
  - spec/factories/jobs/tmp3.html
112
113
  - spec/factories/jobs_with_description.rb
113
114
  - spec/factories/jobs_with_title.rb
114
- - spec/preprocessors/id_mapping_spec.rb
115
- - spec/preprocessors/simple_spec.rb
116
- - spec/selectors/n_gram_spec.rb
117
- - spec/selectors/simple_spec.rb
118
- - spec/selectors/with_binary_encoding_spec.rb
119
115
  - spec/spec_helper.rb
120
116
  - spec/support/preprocessor_spec.rb
121
117
  - spec/support/selector_spec.rb
118
+ - spec/svm_helper/parallel_helper_spec.rb
119
+ - spec/svm_helper/preprocessors/id_mapping_spec.rb
120
+ - spec/svm_helper/preprocessors/simple_spec.rb
121
+ - spec/svm_helper/preprocessors/stemming_spec.rb
122
+ - spec/svm_helper/selectors/bi_normal_seperation_spec.rb
123
+ - spec/svm_helper/selectors/bns_ig_spec.rb
124
+ - spec/svm_helper/selectors/calc_spec.rb
125
+ - spec/svm_helper/selectors/information_gain_spec.rb
126
+ - spec/svm_helper/selectors/n_gram_spec.rb
127
+ - spec/svm_helper/selectors/simple_spec.rb
128
+ - spec/svm_helper/selectors/with_binary_encoding_spec.rb
122
129
  has_rdoc:
@@ -1,31 +0,0 @@
1
- require_relative 'simple'
2
- module Selector
3
- #
4
- # Selector which uses a n-gram dictionary to generate feature vectors
5
- #
6
- # @author Andreas Eger
7
- #
8
- class NGram < Selector::Simple
9
- attr_reader :gram_size
10
-
11
- def initialize classification, args={}
12
- super
13
- @gram_size = args.fetch(:gram_size) { 2 }
14
- end
15
-
16
- def label
17
- "ngram"
18
- end
19
- #
20
- # fetches all words snippets from one data entry, removes stopwords and very short words
21
- # @param data [PreprocessedData]
22
- # @param gram_size [Integer] gram size
23
- #
24
- # @return [Array<String>]
25
- def extract_words_from_data data, gram_size=@gram_size
26
- (data.data.flat_map(&:split) - stopwords)
27
- .delete_if { |e| e.size <= 3 }
28
- .each_cons(gram_size).map{|e| e.join " " }
29
- end
30
- end
31
- end
@@ -1,41 +0,0 @@
1
- require_relative 'simple'
2
- module Selector
3
- #
4
- # Selector which uses a n-gram dictionary to generate feature vectors
5
- #
6
- # @author Andreas Eger
7
- #
8
- class WithBinaryEncoding < Selector::Simple
9
-
10
- CLASSIFICATIONS_SIZE = {
11
- function: 8, # max id 255, currently 19
12
- industry: 16, # max id 65535, currently 14370
13
- career_level: 4 } # max id 15, currently 8
14
-
15
- def initialize *args
16
- super
17
- end
18
-
19
- def label
20
- "simple-WithBinaryEncoding"
21
- end
22
-
23
- private
24
- #
25
- # creates the classification specific part of the feature vector
26
- # @param ids [Hash] hash with classification ids
27
- #
28
- # @return [Array<Integer>] binary encoded classification id
29
- def classification_array(id)
30
- number_to_binary_array(id, CLASSIFICATIONS_SIZE[@classification])
31
- end
32
-
33
- def number_to_binary_array(number, size=8)
34
- a=[]
35
- (size-1).downto(0) do |i|
36
- a<<number[i]
37
- end
38
- a
39
- end
40
- end
41
- end