RubyGems - svm_helper - Versions diffs - 0.1.1 → 0.2.1 - Mend

svm_helper 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +7 -0
data/Gemfile +1 -0
data/Guardfile +1 -1
data/lib/svm_helper.rb +1 -2
data/lib/svm_helper/parallel_helper.rb +24 -0
data/lib/svm_helper/preprocessors.rb +1 -0
data/lib/svm_helper/preprocessors/simple.rb +31 -22
data/lib/svm_helper/preprocessors/stemming.rb +31 -0
data/lib/svm_helper/selectors.rb +4 -2
data/lib/svm_helper/selectors/bi_normal_seperation.rb +86 -0
data/lib/svm_helper/selectors/bns_ig.rb +50 -0
data/lib/svm_helper/selectors/calc.rb +71 -0
data/lib/svm_helper/selectors/information_gain.rb +49 -0
data/lib/svm_helper/selectors/simple.rb +80 -40
data/lib/svm_helper/stopwords/fr +124 -0
data/lib/svm_helper/version.rb +1 -1
data/spec/factories.rb +4 -3
data/spec/support/selector_spec.rb +2 -4
data/spec/svm_helper/parallel_helper_spec.rb +17 -0
data/spec/{preprocessors → svm_helper/preprocessors}/id_mapping_spec.rb +0 -0
data/spec/{preprocessors → svm_helper/preprocessors}/simple_spec.rb +17 -6
data/spec/svm_helper/preprocessors/stemming_spec.rb +11 -0
data/spec/svm_helper/selectors/bi_normal_seperation_spec.rb +35 -0
data/spec/svm_helper/selectors/bns_ig_spec.rb +5 -0
data/spec/svm_helper/selectors/calc_spec.rb +42 -0
data/spec/svm_helper/selectors/information_gain_spec.rb +5 -0
data/spec/{selectors → svm_helper/selectors}/n_gram_spec.rb +2 -4
data/spec/{selectors → svm_helper/selectors}/simple_spec.rb +15 -4
data/spec/{selectors → svm_helper/selectors}/with_binary_encoding_spec.rb +2 -3
data/svm_helper.gemspec +1 -1
metadata +39 -32
data/lib/svm_helper/selectors/n_gram.rb +0 -31
data/lib/svm_helper/selectors/with_binary_encoding.rb +0 -41

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 377f21c5f2bb4431166019336b71ad3892bc35ac
+  data.tar.gz: 02973ce1db9e6720bbe216649b533e7f5b9d35c9
+SHA512:
+  metadata.gz: 818e5bdb6fbfb12e3ca7a0a2f19a1dae46c63646ddfb79eccba9cdc3ba5906d13e004c5bea5cef24099ea8c75a04d14a619e13fa9cfe351c9777439d056da2cc
+  data.tar.gz: 176044f5c9662e590855152576dee2d4f00da1a7cf123001ed9cbce5eca1624571c90494302390e8860e75ba2f83e158ab986a509b6a58f263f6ef225dfcd0c8

data/Gemfile CHANGED

@@ -21,4 +21,5 @@ group :test do
   gem 'rake'
   gem 'mocha', require: 'mocha/api'
   gem 'factory_girl', '~> 4.0'
+  gem 'parallel', require: false
 end

data/Guardfile CHANGED

@@ -1,7 +1,7 @@
 guard 'rspec', cli: "--color --format p", all_after_pass: false do
 # guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
   watch(%r{^spec/.+_spec\.rb$})
-  watch(%r{^lib/svm_helper/(.+)\.rb$})               { |m| "spec/#{m[1]}_spec.rb" }
+  watch(%r{^lib/(.+)\.rb$})               { |m| "spec/#{m[1]}_spec.rb" }
   watch('spec/spec_helper.rb')            { 'spec' }
   watch('spec/factories.rb')              { 'spec' }
   watch(%r{^spec/factories/(.+)\.rb})     { 'spec' }

data/lib/svm_helper.rb CHANGED

@@ -1,8 +1,7 @@
 require "svm_helper/version"
-require 'parallel'
+require "svm_helper/parallel_helper"
 require "svm_helper/preprocessed_data"
 require "svm_helper/feature_vector"
 require "svm_helper/preprocessors"
 require "svm_helper/selectors"

data/lib/svm_helper/parallel_helper.rb ADDED

@@ -0,0 +1,24 @@
+module ParallelHelper
+  THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
+  def p_map_with_index data, &block
+    if parallel? && RUBY_PLATFORM == 'java'
+      Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
+    elsif parallel?
+      Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
+    else
+      data.map.with_index {|e,i| yield e,i }
+    end
+  end
+  def p_map data, &block
+    if parallel? && RUBY_PLATFORM == 'java'
+      Parallel.map(data, in_threads: THREAD_COUNT ){|e| yield e }
+    elsif parallel?
+      Parallel.map(data, in_processes: THREAD_COUNT ){|e| yield e }
+    else
+      data.map {|e| yield e }
+    end
+  end
+  def parallel?
+    defined?(Parallel) == 'constant' && @parallel
+  end
+end

data/lib/svm_helper/preprocessors.rb CHANGED

@@ -1,2 +1,3 @@
 require_relative 'preprocessors/simple'
+require_relative 'preprocessors/stemming'
 require_relative 'preprocessors/id_mapping'

data/lib/svm_helper/preprocessors/simple.rb CHANGED

@@ -6,7 +6,7 @@ module Preprocessor
   # @author Andreas Eger
   #
   class Simple
-    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
+    include ::ParallelHelper
     # filters most gender stuff
     GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
     # filters most weird symbols
@@ -25,8 +25,16 @@ module Preprocessor
     # filter for used job tokens
     CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
+    # stopword file
+    #TODO use File.expand_path
+    STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
+    attr_accessor :language
     def initialize args={}
+      @language = args.fetch(:language){'en'}
       @parallel = args.fetch(:parallel){false}
+      @stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split
     end
     def label
@@ -48,12 +56,20 @@ module Preprocessor
     # @return [Array<PreprocessedData>] list of processed job data - or single job data
     def process jobs
       if jobs.is_a? Array
-        process_jobs jobs
+        p_map(jobs) {|job| process_job job }
       else
         process_job jobs
       end
     end
+    #
+    # removes stopwords and words of two characters or fewer from a text
+    # @param  text [String] whitespace-separated text
+    #
+    # @return [Array<String>] remaining words
+    def strip_stopwords(text)
+      (text.split - @stopwords).delete_if { |e| e.size <= 2 }
+    end
     #
     # converts string into a cleaner version
@@ -75,29 +91,22 @@ module Preprocessor
     #
     # @return [String] clean and lowercase version of input
     def clean_description desc
-      desc.gsub(XML_TAG_FILTER,' ')
-          .gsub(EMAIL_FILTER,'')
-          .gsub(URL_FILTER,'')
-          .gsub(GENDER_FILTER,'')
-          .gsub(NEW_LINES,'')
-          .gsub(SYMBOL_FILTER,' ')
-          .gsub(WHITESPACE,' ')
-          .gsub(WORDS_IN_BRACKETS, '\1')
-          .gsub(CODE_TOKEN_FILTER,'')
-          .downcase
-          .strip
+      strip_stopwords(
+        desc.gsub(XML_TAG_FILTER,' ')
+            .gsub(EMAIL_FILTER,'')
+            .gsub(URL_FILTER,'')
+            .gsub(GENDER_FILTER,'')
+            .gsub(NEW_LINES,'')
+            .gsub(SYMBOL_FILTER,' ')
+            .gsub(WHITESPACE,' ')
+            .gsub(WORDS_IN_BRACKETS, '\1')
+            .gsub(CODE_TOKEN_FILTER,'')
+            .downcase
+            .strip
+        )
     end
     private
-    def process_jobs jobs
-      if @parallel && RUBY_PLATFORM == 'java'
-        Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
-      elsif @parallel
-        Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
-      else
-        jobs.map {|job| process_job job }
-      end
-    end
     def process_job job
       PreprocessedData.new(

data/lib/svm_helper/preprocessors/stemming.rb ADDED

@@ -0,0 +1,31 @@
+require_relative 'simple'
+require 'lingua/stemmer'
+module Preprocessor
+  #
+  # Preprocessor that additionally stems the words of the cleaned description
+  #
+  # @author Andreas Eger
+  #
+  class Stemming < Simple
+    def initialize(args={})
+      super
+      @stemmer = Lingua::Stemmer.new(language: @language)
+    end
+    def label
+      "with_stemming"
+    end
+    def clean_description desc
+      super.map{|w| @stemmer.stem(w) }
+    end
+    private
+    def process_job job
+      PreprocessedData.new(
+        data: [clean_title(job[:title]), clean_description(job[:description])],
+        id: job[:id],
+        label: job[:label]
+      )
+    end
+  end
+end

data/lib/svm_helper/selectors.rb CHANGED

@@ -1,3 +1,5 @@
 require_relative 'selectors/simple'
-require_relative 'selectors/n_gram'
-require_relative 'selectors/with_binary_encoding'
+require_relative 'selectors/calc'
+require_relative 'selectors/bi_normal_seperation'
+require_relative 'selectors/information_gain'
+require_relative 'selectors/bns_ig'

data/lib/svm_helper/selectors/bi_normal_seperation.rb ADDED

@@ -0,0 +1,86 @@
+require_relative 'simple'
+module Selector
+  #
+  # Feature Selection for Text Classification - HP Labs
+  # http://www.google.com/patents/US20040059697
+  #
+  class BiNormalSeperation < Selector::Simple
+    include BNS
+    def label
+      "BiNormalSeperation"
+    end
+    def initialize classification, args={}
+      super
+      @word_selection = args.fetch(:word_selection){ :grams1_2 }
+    end
+    #
+    # generates a list of feature vectors and their labels from preprocessed data
+    # @param  data_set [Array<PreprocessedData>] list of preprocessed data
+    # @param  classification [Symbol] in `:industry`, `:function`, `:career_level`
+    # @param  dictionary_size [Integer] Size of a dictionary to create if none exists
+    #
+    # @return [Array<FeatureVector>] list of feature vectors and labels
+    def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
+      words_and_label_per_data = extract_words data_set, true
+      generate_global_dictionary words_and_label_per_data, dictionary_size
+      words_per_data = words_and_label_per_data.map(&:features)
+      p_map_with_index(words_per_data) do |words,index|
+        word_set = words.uniq
+        make_vector word_set, data_set[index]
+      end
+    end
+    #
+    # generates a list of words used as dictionary
+    # @param  all_words (see #extract_words)
+    # @param  size dictionary size
+    #
+    # @return [Array<String>] list of words
+    def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
+      return unless global_dictionary.empty?
+      label_counts = [0,0]
+      features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
+        label = bag.label ? 1 : 0
+        label_counts[label] += 1
+        # only count a feature once per bag
+        bag.features.uniq.each do |word|
+          unless accumulator.has_key?(word)
+            accumulator[word] = [0,0]
+          end
+          accumulator[word][label] += 1
+        end
+        accumulator
+      end
+      neg, pos = label_counts
+      words = p_map(features) do |word, counts|
+                next if counts.any? { |e| e==0 } # skip words only appearing in one class
+                bns = bi_normal_seperation(pos, neg, *counts)
+                [word, bns.abs]
+              end
+      @global_dictionary = words.compact
+                                .sort_by{|e| e[1]}
+                                .last(size)
+                                .map{|e| e[0] }
+    end
+    def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
+      words_per_data = extract_words data_set, true
+      generate_global_dictionary words_per_data, dictionary_size
+    end
+    #
+    # extracts the words of all provided data entries
+    # @param  data_set [Array<PreprocessedData>] list of preprocessed data
+    # @param  keep_label
+    #
+    # @return [Array<OpenStruct<Array<String>,Boolean>>] list of words per data entry
+    def extract_words data_set, keep_label=false
+      data_set.map do |data|
+        extract_words_from_data data, keep_label
+      end
+    end
+  end
+end

data/lib/svm_helper/selectors/bns_ig.rb ADDED

@@ -0,0 +1,50 @@
+require_relative 'bi_normal_seperation'
+module Selector
+  #
+  # Feature Selection for Text Classification - HP Labs
+  # http://www.google.com/patents/US20040059697
+  #
+  class BNS_IG < Selector::BiNormalSeperation
+    include IG
+    def label
+      "BiNormalSeperation_InformationGain"
+    end
+    #
+    # generates a list of words used as dictionary
+    # @param  all_words (see #extract_words)
+    # @param  size dictionary size
+    #
+    # @return [Array<String>] list of words
+    def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
+      return unless global_dictionary.empty?
+      label_counts = [0,0]
+      features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
+        label = bag.label ? 1 : 0
+        label_counts[label] += 1
+        # only count a feature once per bag
+        bag.features.uniq.each do |word|
+          unless accumulator.has_key?(word)
+            accumulator[word] = [0,0]
+          end
+          accumulator[word][label] += 1
+        end
+        accumulator
+      end
+      neg, pos = label_counts
+      words = p_map(features) do |word, counts|
+                next if counts.any? { |e| e==0 } # skip words only appearing in one class
+                bns = bi_normal_seperation(pos, neg, *counts)
+                ig = information_gain(pos, neg, *counts)
+                # use geometric mean of BNS and IG
+                [word, Math.sqrt(bns.abs * ig.abs)]
+              end
+      @global_dictionary = words.compact
+                                .sort_by{|e| e[1]}
+                                .last(size)
+                                .map{|e| e[0] }
+    end
+  end
+end

data/lib/svm_helper/selectors/calc.rb ADDED

@@ -0,0 +1,71 @@
+module Selector
+  module IG
+    def information_gain(pos, neg, tp, fp)
+      fn = neg - fp
+      tn = pos - tp
+      p_word = (tp + fp).quo(pos + neg)
+      e(pos, neg) - (p_word * e(tp, fp) + (1 - p_word) * e(fn, tn))
+    end
+    def e(x,y)
+      -xlx(x.quo(x+y)) -xlx(y.quo(x+y))
+    end
+    def xlx(x)
+      x * Math.log2(x)
+    end
+  end
+  module BNS
+    SQR2 = Math.sqrt(2)
+    SQR2PI = Math.sqrt(2.0*Math::PI)
+    def bi_normal_seperation pos, neg, tp, fp
+      false_prositive_rate = fp.quo(neg)
+      true_prositive_rate = tp.quo(pos)
+      bns = cdf_inverse(true_prositive_rate) - cdf_inverse(false_prositive_rate)
+    end
+    # standard normal cumulative distribution function
+    def cdf(z)
+      0.5 * (1.0 + Math.erf( z.quo(SQR2) ) )
+    end
+    # inverse standard normal cumulative distribution function
+    # http://home.online.no/~pjacklam/notes/invnorm
+    # Coefficients in rational approximations.
+    A = [0, -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00]
+    B = [0, -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01]
+    C = [0, -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00]
+    D = [0, 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00]
+    # Define break-points.
+    P_LOW  = 0.02425
+    P_HIGH = 1.0 - P_LOW
+    def cdf_inverse(p)
+      return 0.0 if p < 0 || p > 1 || p == 0.5
+      x = 0.0
+      if 0.0 < p && p < P_LOW
+        # Rational approximation for lower region.
+        q = Math.sqrt(-2.0*Math.log(p))
+        x = (((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
+            ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
+      elsif P_LOW <= p && p <= P_HIGH
+        # Rational approximation for central region.
+        q = p - 0.5
+        r = q*q
+        x = (((((A[1]*r+A[2])*r+A[3])*r+A[4])*r+A[5])*r+A[6])*q /
+            (((((B[1]*r+B[2])*r+B[3])*r+B[4])*r+B[5])*r+1.0)
+      elsif P_HIGH < p && p < 1.0
+        # Rational approximation for upper region.
+        q = Math.sqrt(-2.0*Math.log(1.0-p))
+        x = -(((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
+             ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
+      end
+      if 0 < p && p < 1
+        u = cdf(p) * SQR2PI * Math.exp((x**2.0)/2.0)
+        x = x - u/(1.0 + x*u/2.0)
+      end
+      x
+    end
+  end
+end

data/lib/svm_helper/selectors/information_gain.rb ADDED

@@ -0,0 +1,49 @@
+require_relative 'bi_normal_seperation'
+module Selector
+  #
+  # Feature Selection for Text Classification - HP Labs
+  # http://www.google.com/patents/US20040059697
+  #
+  class InformationGain < Selector::BiNormalSeperation
+    include IG
+    def label
+      "InformationGain"
+    end
+    #
+    # generates a list of words used as dictionary
+    # @param  all_words (see #extract_words)
+    # @param  size dictionary size
+    #
+    # @return [Array<String>] list of words
+    def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
+      return unless global_dictionary.empty?
+      label_counts = [0,0]
+      features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
+        label = bag.label ? 1 : 0
+        label_counts[label] += 1
+        # only count a feature once per bag
+        bag.features.uniq.each do |word|
+          unless accumulator.has_key?(word)
+            accumulator[word] = [0,0]
+          end
+          accumulator[word][label] += 1
+        end
+        accumulator
+      end
+      neg, pos = label_counts
+      words = p_map(features) do |word, counts|
+                next if counts.any? { |e| e==0 } # skip words only appearing in one class
+                tp, fp = counts
+                ig = information_gain(pos, neg, tp, fp)
+                [word, ig.abs]
+              end
+      @global_dictionary = words.compact
+                                .sort_by{|e| e[1]}
+                                .last(size)
+                                .map{|e| e[0] }
+    end
+  end
+end

data/lib/svm_helper/selectors/simple.rb CHANGED

@@ -5,31 +5,20 @@ module Selector
   # @author Andreas Eger
   #
   class Simple
-    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
-    # stopword file
-    #TODO use File.expand_path
-    STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
+    include ::ParallelHelper
     # default dictionary size
     DEFAULT_DICTIONARY_SIZE = 800
-    CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
-                            { function: Pjpp::Function.count,
-                              industry: Pjpp::Industry.count,
-                              career_level: Pjpp::CareerLevel.count }
-                          else
-                            { function: 19,       # 1..19
-                              industry: 632,      # 1..14370 but not all ids used
-                              career_level: 8 }   # 1..8
-                          end
     attr_accessor :global_dictionary
+    attr_reader :classification_encoding,
+                :gram_size,
+                :word_selection
     def initialize classification, args={}
       @classification = classification
       @global_dictionary = args.fetch(:global_dictionary) {[]}
-      @language = args.fetch(:language){'en'}
+      @classification_encoding = args.fetch(:classification_encoding){:bitmap}
+      @word_selection = args.fetch(:word_selection){ :single }
+      @gram_size = args.fetch(:gram_size) { 1 }
       @parallel = args.fetch(:parallel){false}
     end
@@ -48,7 +37,7 @@ module Selector
       words_per_data = extract_words data_set
       generate_global_dictionary words_per_data, dictionary_size
-      make_vectors(words_per_data) do |words,index|
+      p_map_with_index(words_per_data) do |words,index|
         word_set = words.uniq
         make_vector word_set, data_set[index]
       end
@@ -66,15 +55,6 @@ module Selector
       make_vector word_set, data, dictionary
     end
-    #
-    # loads a txt file with stop words
-    # @param  location String folder with stopword lists
-    #
-    # @return [Array<String>] Array of stopwords
-    def stopwords(location=STOPWORD_LOCATION)
-      @stopwords ||= IO.read(File.join(location,@language)).split
-    end
     #
     # generates a list of words used as dictionary
     # @param  all_words (see #extract_words)
@@ -90,6 +70,10 @@ module Selector
       @global_dictionary = words.last(size).map(&:first).reverse
     end
+    def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
+      words_per_data = extract_words data_set
+      generate_global_dictionary words_per_data, dictionary_size
+    end
     #
     # extracts the words of all provided data entries
     # @param  data_set [Array<PreprocessedData>] list of preprocessed data
@@ -107,7 +91,46 @@ module Selector
     #
     # @return [Array<String>] list of words
     def extract_words_from_data data
-      (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
+      words = (data.data.flat_map(&:split) - stopwords)
+                  .delete_if { |e| e.size <= 2 }
+      if gram_size > 1
+        words = words.each_cons(@gram_size).map{|e| e.join " " }
+      end
+      words
+    end
+    #
+    # fetches the words and, depending on word_selection, the n-gram phrases of one data entry
+    # @param  data [PreprocessedData] preprocessed data entry
+    # @param  keep_label
+    #
+    # @return [OpenStruct<Array<String>,Boolean>] list of words
+    def extract_words_from_data data, keep_label=false
+      # assume the first token is the title and preserve it
+      title, *words = data.data.flatten
+      features =  case word_selection
+                  when :grams
+                    words.each_cons(@gram_size).map{|e| e.join " " }
+                  when :grams1_2
+                    words + words.each_cons(2).map{|e| e.join " " }
+                  when :grams1_2_3
+                    words +
+                      words.each_cons(2).map{|e| e.join " " } +
+                      words.each_cons(3).map{|e| e.join " " }
+                  when :grams1_2_3_4
+                    words +
+                      words.each_cons(2).map{|e| e.join " " } +
+                      words.each_cons(3).map{|e| e.join " " } +
+                      words.each_cons(4).map{|e| e.join " " }
+                  else
+                    words
+                  end
+      features.unshift(title)
+      return features unless keep_label
+      OpenStruct.new(
+        features: features,
+        label: data.label
+      )
     end
     def reset classification
@@ -135,23 +158,40 @@ module Selector
       )
     end
-    def make_vectors data, &block
-      if @parallel && RUBY_PLATFORM == 'java'
-        Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
-      elsif @parallel
-        Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
-      else
-        data.map.with_index {|e,i| yield e,i }
-      end
-    end
+    BITMAP_ARRAY_SIZES= if defined?(Pjpp) == 'constant'
+                            { function: Pjpp::Function.count,
+                              industry: Pjpp::Industry.count,
+                              career_level: Pjpp::CareerLevel.count }
+                          else
+                            { function: 19,       # 1..19
+                              industry: 632,      # 1..14370 but not all ids used
+                              career_level: 8 }   # 1..8
+                          end
+    BINARY_ARRAY_SIZES = {
+            function: 8,        # max id 255, currently 19
+            industry: 16,       # max id 65535, currently 14370
+            career_level: 4 }   # max id 15, currently 8
     #
     # creates the classification specific part of the feature vector
     # @param  ids [Hash] hash with classification ids
     #
     # @return [Array<Integer>] list of size=count(classification_ids) with only one not zero item
     def classification_array(id)
-      Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0}
+      case @classification_encoding
+      when :binary
+        number_to_binary_array(id, BINARY_ARRAY_SIZES[@classification])
+      else # :bitmap
+        Array.new(BITMAP_ARRAY_SIZES[@classification]){|n| n==(id-1) ? 1 : 0}
+      end
+    end
+    def number_to_binary_array(number, size=8)
+      a=[]
+      (size-1).downto(0) do |i|
+        a<<number[i]
+      end
+      a
     end
   end
-end
+end

data/lib/svm_helper/stopwords/fr ADDED

@@ -0,0 +1,124 @@
+alors
+au
+aucuns
+aussi
+autre
+avant
+avec
+avoir
+bon
+car
+ce
+cela
+ces
+ceux
+chaque
+ci
+comme
+comment
+dans
+des
+du
+dedans
+dehors
+depuis
+deux
+devrait
+doit
+donc
+dos
+droite
+début
+elle
+elles
+en
+encore
+essai
+est
+et
+eu
+fait
+faites
+fois
+font
+force
+haut
+hors
+ici
+il
+ils
+je  juste
+la
+le
+les
+leur
+là
+ma
+maintenant
+mais
+mes
+mine
+moins
+mon
+mot
+même
+ni
+nommés
+notre
+nous
+nouveaux
+ou
+où
+par
+parce
+parole
+pas
+personnes
+peut
+peu
+pièce
+plupart
+pour
+pourquoi
+quand
+que
+quel
+quelle
+quelles
+quels
+qui
+sa
+sans
+ses
+seulement
+si
+sien
+son
+sont
+sous
+soyez sujet
+sur
+ta
+tandis
+tellement
+tels
+tes
+ton
+tous
+tout
+trop
+très
+tu
+valeur
+voie
+voient
+vont
+votre
+vous
+vu
+ça
+étaient
+état
+étions
+été
+être

data/lib/svm_helper/version.rb CHANGED

@@ -1,3 +1,3 @@
 module SvmHelper
-  VERSION = "0.1.1"
+  VERSION = "0.2.1"
 end

data/spec/factories.rb CHANGED

@@ -18,14 +18,15 @@ FactoryGirl.define do
   factory :data, class: PreprocessedData do
-    data ["haus fooo garten baaz pferd fooo"]
+    data %w(haus fooo garten baaz pferd fooo)
     id 7
     label true
   end
   factory :data_w_short_words, parent: :data do
-    data ["auto foo pferd bz gooo fooo 2"]
+    data %w(auto pferd gooo fooo)
+    label false
   end
   factory :data_w_multiple_sections, parent: :data do
-    data ["meeh foo auto","bz baaz fooo 2"]
+    data [%w(meeh auto),%w(baaz fooo)]
   end
 end

data/spec/support/selector_spec.rb CHANGED

@@ -13,9 +13,7 @@ shared_examples_for 'a selector' do
       [0,1].should include(e)
     end
   end
-  it "should be able to process multiple data entries at once" do
-    selector.generate_vectors([data]).each do |e|
-      e.should == selector.generate_vector(data)
-    end
+  it "should respond to generate_vectors" do
+    selector.should respond_to(:generate_vectors)
   end
 end

data/spec/svm_helper/parallel_helper_spec.rb ADDED

@@ -0,0 +1,17 @@
+require 'spec_helper'
+require 'parallel'
+include ParallelHelper
+describe ParallelHelper do
+  let(:data) { (1..20).to_a }
+  context "parallel map" do
+    it "should return as a normal map" do
+      p_map(data){|e| e**2 }.should == data.map{|e| e**2 }
+    end
+  end
+  context "parallel map with index" do
+    it "should return as a normal map with index" do
+      p_map_with_index(data){|e,i| e*i }.should == data.map.with_index{|e,i| e*i }
+    end
+  end
+end

data/spec/{preprocessors → svm_helper/preprocessors}/id_mapping_spec.rb RENAMED

File without changes

data/spec/{preprocessors → svm_helper/preprocessors}/simple_spec.rb RENAMED

@@ -41,6 +41,7 @@ describe Preprocessor::Simple do
     end
   end
   context "#clean_title" do
     it "should be downcased" do
       job = FactoryGirl.build(:job_title_downcasing)
@@ -75,31 +76,41 @@ describe Preprocessor::Simple do
         FactoryGirl.build(:job_description_w_code_token),
         FactoryGirl.build(:job_description_w_gender) ]
     }
+    it "should call strip_stopwords" do
+      simple.expects(:strip_stopwords)
+      simple.clean_description(jobs[0][:description])
+    end
     it "should remove html/xml tags" do
-      desc = simple.clean_description(jobs[0][:description])
+      desc = simple.clean_description(jobs[0][:description]).join ' '
       desc.should_not match(/<(.*?)>/)
     end
     it "should remove new lines" do
-      desc = simple.clean_description(jobs[0][:description])
+      desc = simple.clean_description(jobs[0][:description]).join ' '
       desc.should_not match(/\r\n|\n|\r/)
     end
     it "should remove all special characters" do
-      desc = simple.clean_description(jobs[2][:description])
+      desc = simple.clean_description(jobs[2][:description]).join ' '
       desc.should_not match(/[^a-z öäü]/i)
     end
     it "should remove gender tokens" do
-      desc = simple.clean_description(jobs[3][:description])
+      desc = simple.clean_description(jobs[3][:description]).join ' '
       desc.should_not match(%r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)})
     end
     it "should remove job code token" do
-      desc = simple.clean_description(jobs[4][:description])
+      desc = simple.clean_description(jobs[4][:description]).join ' '
       desc.should_not match(/\[.*\]|\(.*\)|\{.*\}|\d+\w+/)
     end
     it "should be downcased" do
-      desc = simple.clean_description(jobs[2][:description])
+      desc = simple.clean_description(jobs[2][:description]).join ' '
       desc.should_not match(/[^a-z öäü]/)
     end
   end
+  context "strip_stopwords" do
+    it "should remove words like 'and' from the text" do
+      simple.strip_stopwords("Dogs and cats").should == %w(Dogs cats)
+    end
+  end
   context "parallel" do
       let(:parallel) { Preprocessor::Simple.new(parallel: true) }

data/spec/svm_helper/preprocessors/stemming_spec.rb ADDED

@@ -0,0 +1,11 @@
+require 'spec_helper'
+describe Preprocessor::Stemming do
+  it_behaves_like 'a preprocessor'
+  let(:preprocessor) { Preprocessor::Stemming.new }
+  let(:job) { FactoryGirl.build(:job) }
+  let(:jobs) { [job] }
+  it "should reduce words to their stem" do
+    preprocessor.clean_description("developer engineering").should == %w(develop engin)
+  end
+end

data/spec/svm_helper/selectors/bi_normal_seperation_spec.rb ADDED

@@ -0,0 +1,35 @@
+require "spec_helper"
+describe Selector::BiNormalSeperation do
+  it_behaves_like 'a selector'
+  let(:bns) { Selector::BiNormalSeperation.new(:function) }
+  context "#extract_words_from_data" do
+    it "should generate a list of words from the data" do
+      words = bns.extract_words_from_data(FactoryGirl.build(:data))
+      words.should have(10).things
+    end
+    it "should remove words with 3 characters or less" do
+      words = bns.extract_words_from_data(FactoryGirl.build(:data_w_short_words))
+      words.should have(6).things
+    end
+    it "should process multiple sections in the data" do
+      words = bns.extract_words_from_data(FactoryGirl.build(:data_w_multiple_sections))
+      words.should have(6).things
+    end
+  end
+  context "#generate_global_dictionary" do
+    let(:data) { [FactoryGirl.build_list(:data,1),
+                  FactoryGirl.build_list(:data_w_short_words,4),
+                  FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
+    let(:words_per_data) { bns.extract_words(data,true) }
+    it "should return a list of n words" do
+      bns.generate_global_dictionary(words_per_data,2)
+      bns.global_dictionary.should have(2).things
+    end
+    it "should return a list of the n most used words in the data array" do
+      bns.generate_global_dictionary(words_per_data,3)
+      bns.global_dictionary.should eq(%w(fooo auto pferd))
+    end
+  end
+end

data/spec/svm_helper/selectors/bns_ig_spec.rb ADDED

@@ -0,0 +1,5 @@
+require "spec_helper"
+describe Selector::BNS_IG do
+  it_behaves_like 'a selector'
+end

data/spec/svm_helper/selectors/calc_spec.rb ADDED

@@ -0,0 +1,42 @@
+require 'spec_helper'
+# just some very basic tests to make sure these functions do not fail
+describe "Calc" do
+  include Selector::IG
+  include Selector::BNS
+  let(:test_data){ [
+    [34, 23, 28, 17],
+    [31, 17, 23, 12],
+    [44, 39, 41, 36],
+    [44, 23, 41, 23],
+    [44, 39, 0, 36],
+    [44, 39, 41, 0],
+    [62, 81, 15, 73]
+  ]}
+  context Selector::IG do
+    it "should not fail" do
+      test_data.each do |data|
+        ->{information_gain(*data)}.should_not raise_error
+      end
+    end
+    it "should return some values" do
+      test_data.each do |data|
+        information_gain(*data).should be_a(Numeric)
+      end
+    end
+  end
+  context Selector::BNS do
+    it "should not fail" do
+      test_data.each do |data|
+        ->{bi_normal_seperation(*data)}.should_not raise_error
+      end
+    end
+    it "should return some values" do
+      test_data.each do |data|
+        bi_normal_seperation(*data).should be_a(Numeric)
+      end
+    end
+  end
+end

data/spec/svm_helper/selectors/information_gain_spec.rb ADDED

@@ -0,0 +1,5 @@
+require "spec_helper"
+describe Selector::InformationGain do
+  it_behaves_like 'a selector'
+end

data/spec/{selectors → svm_helper/selectors}/n_gram_spec.rb RENAMED

@@ -1,9 +1,7 @@
 require "spec_helper"
-describe Selector::NGram do
-  it_behaves_like 'a selector'
-  let(:ngram) { Selector::NGram.new(:function, gram_size: 3) }
+describe "n-grams" do
+  let(:ngram) { Selector::Simple.new(:function, word_selection: :grams, gram_size: 3) }
   context "#extract_words_from_data" do
     it "should generate a list of words from the data" do
       words = ngram.extract_words_from_data(FactoryGirl.build(:data))

data/spec/{selectors → svm_helper/selectors}/simple_spec.rb RENAMED

@@ -7,9 +7,6 @@ describe Selector::Simple do
   it "should have select_feature_vector implemented" do
     expect { simple.generate_vectors([]) }.to_not raise_error
   end
-  context "#stopwords" do
-    it "simply loads them from a file"
-  end
   context "#extract_words_from_data" do
     it "should generate a list of words from the data" do
       words = simple.extract_words_from_data(FactoryGirl.build(:data))
@@ -50,6 +47,19 @@ describe Selector::Simple do
       simple.global_dictionary.should eq(%w(fooo auto baaz))
     end
   end
+  context "#build_dictionary" do
+    let(:data) { [FactoryGirl.build_list(:data,1),
+                  FactoryGirl.build_list(:data_w_short_words,2),
+                  FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
+    it "should return a list of n words" do
+      simple.build_dictionary(data,2)
+      simple.global_dictionary.should have(2).things
+    end
+    it "should return a list of the n most used words in the data array" do
+      simple.build_dictionary(data,3)
+      simple.global_dictionary.should eq(%w(fooo auto baaz))
+    end
+  end
   context "#generate_vector" do
     let(:dictionary) { %w(auto pferd haus hase garten) }
     let(:data) { FactoryGirl.build(:data) }
@@ -109,6 +119,7 @@ describe Selector::Simple do
     context "parallel" do
       let(:parallel) { Selector::Simple.new(:function, parallel: true) }
       before(:each) do
+        require 'parallel'
         simple.stubs(:global_dictionary).returns(dictionary)
         parallel.stubs(:global_dictionary).returns(dictionary)
       end
@@ -119,4 +130,4 @@ describe Selector::Simple do
       end
     end
   end
-end
+end

data/spec/{selectors → svm_helper/selectors}/with_binary_encoding_spec.rb RENAMED

@@ -1,8 +1,7 @@
 require "spec_helper"
-describe Selector::WithBinaryEncoding do
-  it_behaves_like 'a selector'
-  let(:simple) { Selector::WithBinaryEncoding.new(:career_level) }
+describe "binary encoded classification" do
+  let(:simple) { Selector::Simple.new(:career_level, classification_encoding: :binary) }
   let(:dictionary) { %w(auto pferd haus hase garten) }
   let(:data) { FactoryGirl.build(:data) }

data/svm_helper.gemspec CHANGED

@@ -17,5 +17,5 @@ Gem::Specification.new do |gem|
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.add_dependency('parallel', '~> 0.6.2')
+  gem.add_dependency "ruby-stemmer"
 end

metadata CHANGED

@@ -1,32 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: svm_helper
 version: !ruby/object:Gem::Version
-  version: 0.1.1
-  prerelease:
+  version: 0.2.1
 platform: ruby
 authors:
 - Andreas Eger
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-03-15 00:00:00.000000000 Z
+date: 2013-04-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: parallel
+  name: ruby-stemmer
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ~>
+    - - '>='
       - !ruby/object:Gem::Version
-        version: 0.6.2
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ~>
+    - - '>='
       - !ruby/object:Gem::Version
-        version: 0.6.2
+        version: '0'
 description: Shared helper classes for usage in context of SVM at experteer
 email:
 - dev@eger-andreas.de
@@ -48,16 +45,21 @@ files:
 - lib/svm_helper.rb
 - lib/svm_helper/feature_vector.rb
 - lib/svm_helper/interface_helper.rb
+- lib/svm_helper/parallel_helper.rb
 - lib/svm_helper/preprocessed_data.rb
 - lib/svm_helper/preprocessors.rb
 - lib/svm_helper/preprocessors/id_mapping.rb
 - lib/svm_helper/preprocessors/simple.rb
+- lib/svm_helper/preprocessors/stemming.rb
 - lib/svm_helper/selectors.rb
-- lib/svm_helper/selectors/n_gram.rb
+- lib/svm_helper/selectors/bi_normal_seperation.rb
+- lib/svm_helper/selectors/bns_ig.rb
+- lib/svm_helper/selectors/calc.rb
+- lib/svm_helper/selectors/information_gain.rb
 - lib/svm_helper/selectors/simple.rb
-- lib/svm_helper/selectors/with_binary_encoding.rb
 - lib/svm_helper/stopwords/de
 - lib/svm_helper/stopwords/en
+- lib/svm_helper/stopwords/fr
 - lib/svm_helper/version.rb
 - spec/factories.rb
 - spec/factories/jobs/tmp.html
@@ -65,44 +67,43 @@ files:
 - spec/factories/jobs/tmp3.html
 - spec/factories/jobs_with_description.rb
 - spec/factories/jobs_with_title.rb
-- spec/preprocessors/id_mapping_spec.rb
-- spec/preprocessors/simple_spec.rb
-- spec/selectors/n_gram_spec.rb
-- spec/selectors/simple_spec.rb
-- spec/selectors/with_binary_encoding_spec.rb
 - spec/spec_helper.rb
 - spec/support/preprocessor_spec.rb
 - spec/support/selector_spec.rb
+- spec/svm_helper/parallel_helper_spec.rb
+- spec/svm_helper/preprocessors/id_mapping_spec.rb
+- spec/svm_helper/preprocessors/simple_spec.rb
+- spec/svm_helper/preprocessors/stemming_spec.rb
+- spec/svm_helper/selectors/bi_normal_seperation_spec.rb
+- spec/svm_helper/selectors/bns_ig_spec.rb
+- spec/svm_helper/selectors/calc_spec.rb
+- spec/svm_helper/selectors/information_gain_spec.rb
+- spec/svm_helper/selectors/n_gram_spec.rb
+- spec/svm_helper/selectors/simple_spec.rb
+- spec/svm_helper/selectors/with_binary_encoding_spec.rb
 - svm_helper.gemspec
 homepage: https://github.com/sch1zo/svm_helper
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - '>='
     - !ruby/object:Gem::Version
       version: '0'
-      segments:
-      - 0
-      hash: 2037039748537332986
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - '>='
     - !ruby/object:Gem::Version
       version: '0'
-      segments:
-      - 0
-      hash: 2037039748537332986
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.25
+rubygems_version: 2.0.0.rc.2
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: Preprocessor and Selector classes to generate FeatureVectors from Job data
 test_files:
 - spec/factories.rb
@@ -111,12 +112,18 @@ test_files:
 - spec/factories/jobs/tmp3.html
 - spec/factories/jobs_with_description.rb
 - spec/factories/jobs_with_title.rb
-- spec/preprocessors/id_mapping_spec.rb
-- spec/preprocessors/simple_spec.rb
-- spec/selectors/n_gram_spec.rb
-- spec/selectors/simple_spec.rb
-- spec/selectors/with_binary_encoding_spec.rb
 - spec/spec_helper.rb
 - spec/support/preprocessor_spec.rb
 - spec/support/selector_spec.rb
+- spec/svm_helper/parallel_helper_spec.rb
+- spec/svm_helper/preprocessors/id_mapping_spec.rb
+- spec/svm_helper/preprocessors/simple_spec.rb
+- spec/svm_helper/preprocessors/stemming_spec.rb
+- spec/svm_helper/selectors/bi_normal_seperation_spec.rb
+- spec/svm_helper/selectors/bns_ig_spec.rb
+- spec/svm_helper/selectors/calc_spec.rb
+- spec/svm_helper/selectors/information_gain_spec.rb
+- spec/svm_helper/selectors/n_gram_spec.rb
+- spec/svm_helper/selectors/simple_spec.rb
+- spec/svm_helper/selectors/with_binary_encoding_spec.rb
 has_rdoc:

data/lib/svm_helper/selectors/n_gram.rb DELETED

@@ -1,31 +0,0 @@
-require_relative 'simple'
-module Selector
-  #
-  # Selector which uses an n-gram dictionary to generate feature vectors
-  #
-  # @author Andreas Eger
-  #
-  class NGram < Selector::Simple
-    attr_reader :gram_size
-    def initialize classification, args={}
-      super
-      @gram_size = args.fetch(:gram_size) { 2 }
-    end
-    def label
-      "ngram"
-    end
-    #
-    # fetches all word snippets from one data entry, removes stopwords and very short words
-    # @param  data [PreprocessedData]
-    # @param  gram_size [Integer] gram size
-    #
-    # @return [Array<String>]
-    def extract_words_from_data data, gram_size=@gram_size
-      (data.data.flat_map(&:split) - stopwords)
-                .delete_if { |e| e.size <= 3 }
-                .each_cons(gram_size).map{|e| e.join " " }
-    end
-  end
-end

data/lib/svm_helper/selectors/with_binary_encoding.rb DELETED

@@ -1,41 +0,0 @@
-require_relative 'simple'
-module Selector
-  #
-  # Selector which encodes the classification id as a binary array in the feature vectors
-  #
-  # @author Andreas Eger
-  #
-  class WithBinaryEncoding < Selector::Simple
-    CLASSIFICATIONS_SIZE = {
-          function: 8,        # max id 255, currently 19
-          industry: 16,       # max id 65535, currently 14370
-          career_level: 4 }   # max id 15, currently 8
-    def initialize *args
-      super
-    end
-    def label
-      "simple-WithBinaryEncoding"
-    end
-    private
-    #
-    # creates the classification specific part of the feature vector
-    # @param  id [Integer] classification id
-    #
-    # @return [Array<Integer>] binary encoded classification id
-    def classification_array(id)
-      number_to_binary_array(id, CLASSIFICATIONS_SIZE[@classification])
-    end
-    def number_to_binary_array(number, size=8)
-      a=[]
-      (size-1).downto(0) do |i|
-        a<<number[i]
-      end
-      a
-    end
-  end
-end