RubyGems - svm_helper - Versions diffs - 0.1.0 - Mend

svm_helper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

checksums.yaml +7 -0
data/.gitignore +22 -0
data/.rspec +3 -0
data/.ruby-version +1 -0
data/.travis.yml +9 -0
data/.versions.conf +4 -0
data/.yardopts +3 -0
data/Gemfile +24 -0
data/Guardfile +17 -0
data/LICENSE.txt +22 -0
data/README.md +41 -0
data/Rakefile +7 -0
data/lib/svm_helper.rb +8 -0
data/lib/svm_helper/feature_vector.rb +17 -0
data/lib/svm_helper/interface_helper.rb +57 -0
data/lib/svm_helper/preprocessed_data.rb +17 -0
data/lib/svm_helper/preprocessors.rb +2 -0
data/lib/svm_helper/preprocessors/simple.rb +111 -0
data/lib/svm_helper/preprocessors/with_industry_map.rb +40 -0
data/lib/svm_helper/selectors.rb +3 -0
data/lib/svm_helper/selectors/n_gram.rb +31 -0
data/lib/svm_helper/selectors/simple.rb +163 -0
data/lib/svm_helper/selectors/with_binary_encoding.rb +42 -0
data/lib/svm_helper/stopwords/de +127 -0
data/lib/svm_helper/stopwords/en +119 -0
data/lib/svm_helper/version.rb +3 -0
data/spec/factories.rb +35 -0
data/spec/factories/jobs/tmp.html +42 -0
data/spec/factories/jobs/tmp2.html +20 -0
data/spec/factories/jobs/tmp3.html +34 -0
data/spec/factories/jobs_with_description.rb +20 -0
data/spec/factories/jobs_with_title.rb +72 -0
data/spec/preprocessors/simple_spec.rb +138 -0
data/spec/preprocessors/with_industry_map_spec.rb +16 -0
data/spec/selectors/n_gram_spec.rb +21 -0
data/spec/selectors/simple_spec.rb +121 -0
data/spec/selectors/with_binary_encoding_spec.rb +39 -0
data/spec/spec_helper.rb +14 -0
data/spec/support/preprocessor_spec.rb +21 -0
data/spec/support/selector_spec.rb +21 -0
data/svm_helper.gemspec +21 -0
metadata +112 -0

data/lib/svm_helper/selectors.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require_relative 'selectors/simple'
+require_relative 'selectors/n_gram'
+require_relative 'selectors/with_binary_encoding'

data/lib/svm_helper/selectors/n_gram.rb ADDED Viewed

@@ -0,0 +1,31 @@
+require_relative 'simple'
+module Selector
+  #
+  # Selector which uses a n-gram dictionary to generate feature vectors
+  #
+  # @author Andreas Eger
+  #
+  class NGram < Selector::Simple
+    attr_reader :gram_size
+    def initialize args={}
+      super
+      @gram_size = args.fetch(:gram_size) { 2 }
+    end
+    def label
+      "ngram"
+    end
+    #
+    # fetches all words snippets from one data entry, removes stopwords and very short words
+    # @param  data [PreprocessedData]
+    # @param  gram_size [Integer] gram size
+    #
+    # @return [Array<String>]
+    def extract_words_from_data data, gram_size=@gram_size
+      (data.data.flat_map(&:split) - stopwords)
+                .delete_if { |e| e.size <= 3 }
+                .each_cons(gram_size).map{|e| e.join " " }
+    end
+  end
+end

data/lib/svm_helper/selectors/simple.rb ADDED Viewed

@@ -0,0 +1,163 @@
+module Selector
+  #
+  # Selector which uses a simple dictionary to generate feature vectors
+  #
+  # @author Andreas Eger
+  #
+  class Simple
+    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
+    # stopword file
+    #TODO use File.expand_path
+    STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
+    # default dictionary size
+    DEFAULT_DICTIONARY_SIZE = 800
+    CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
+                            { function: Pjpp::Function.count,
+                              industry: Pjpp::Industry.count,
+                              career_level: Pjpp::CareerLevel.count }
+                          else
+                            { function: 19,       # 1..19
+                              industry: 632,      # 1..14370 but not all ids used
+                              career_level: 8 }   # 1..8
+                          end
+    attr_accessor :global_dictionary
+    def initialize args={}
+      @global_dictionary = args.fetch(:global_dictionary) {[]}
+      @language = args.fetch(:language){'en'}
+      @parallel = args.fetch(:parallel){false}
+    end
+    def label
+      "simple"
+    end
+    #
+    # generates a list of feature vetors and their labels from preprocessed data
+    # @param  data_set [Array<PreprocessedData>] list of preprocessed data
+    # @param  classification [Symbol] in `:industry`, `:function`, `:career_level`
+    # @param  dictionary_size [Integer] Size of a dictionary to create if non exists
+    #
+    # @return [Array<FeatureVector>] list of feature vectors and labels
+    def generate_vectors data_set, classification=:function, dictionary_size=DEFAULT_DICTIONARY_SIZE
+      words_per_data = extract_words data_set
+      generate_global_dictionary words_per_data, dictionary_size
+      make_vectors(words_per_data) do |words,index|
+        word_set = words.uniq
+        make_vector word_set, data_set[index], classification
+      end
+    end
+    #
+    # generates a feature vector with its label
+    # @param  data [PreprocessedData]
+    # @param  classification [Symbol] in `:industry`, `:function`, `:career_level`
+    # @param  dictionary [Array] dictionary to use for this selection
+    #
+    # @return [FeatureVector]
+    def generate_vector data, classification=:function, dictionary=global_dictionary
+      word_set = Set.new extract_words_from_data(data)
+      make_vector word_set, data, classification, dictionary
+    end
+    #
+    # loads a txt file with stop words
+    # @param  location String folder with stopword lists
+    #
+    # @return [Array<String>] Array of stopwords
+    def stopwords(location=STOPWORD_LOCATION)
+      @stopwords ||= IO.read(File.join(location,@language)).split
+    end
+    #
+    # generates a list of words used as dictionary
+    # @param  all_words (see #extract_words)
+    # @param  size dictionary size
+    #
+    # @return [Array<String>] list of words
+    def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
+      return unless global_dictionary.empty?
+      words = all_words.flatten.group_by{|e| e}.values
+               .sort_by{|e| e.size}
+               .map{|e| [e[0],e.size]}
+      @global_dictionary = words.last(size).map(&:first).reverse
+    end
+    #
+    # extracts the words of all provided data entries
+    # @param  data_set [Array<PreprocessedData>] list of preprocessed data
+    #
+    # @return [Array<Array<String>>] list of words per data entry
+    def extract_words data_set
+      data_set.map do |data|
+        extract_words_from_data data
+      end
+    end
+    #
+    # fetches all words from one data entry, removes stopwords and very short words
+    # @param  data [PreprocessedData] preprocessed data entry
+    #
+    # @return [Array<String>] list of words
+    def extract_words_from_data data
+      (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
+    end
+    def reset
+      @global_dictionary = []
+    end
+    private
+    #
+    # creates a feature vector for the given words, classification and dictionary
+    # also adds the label
+    # @param  words [Array<String>] list of words
+    # @param  data [PreprocessedData]
+    # @param  classification [Symbol] in `:industry`, `:function`, `:career_level`
+    # @param  dictionary
+    #
+    # @return [FeatureVector]
+    def make_vector words, data, classification, dictionary=global_dictionary
+      FeatureVector.new(
+        word_data: dictionary.map{|dic_word|
+                words.include?(dic_word) ? 1 : 0
+              },
+        classification_arrays: {
+          function: classification_array(data.ids, :function),
+          industry: classification_array(data.ids, :industry),
+          career_level: classification_array(data.ids, :career_level) },
+        labels: {
+          function: data.labels[:function] ? 1 : 0,
+          industry: data.labels[:industry] ? 1 : 0,
+          career_level: data.labels[:career_level] ? 1 : 0 }
+      ).tap{|e| e.send("#{classification}!")}
+    end
+    def make_vectors data, &block
+      if @parallel && RUBY_PLATFORM == 'java'
+        Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
+      elsif @parallel
+        Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
+      else
+        data.map.with_index {|e,i| yield e,i }
+      end
+    end
+    #
+    # creates the classification specific part of the feature vector
+    # @param  ids [Hash] hash with classification ids
+    #
+    # @return [Array<Integer>] list of size=count(classifcation_ids) with only one not zero item
+    def classification_array(ids, classification)
+      id = ids[classification]
+      Array.new(CLASSIFICATIONS_SIZE[classification]){|n| n==(id-1) ? 1 : 0}
+    end
+  end
+end

data/lib/svm_helper/selectors/with_binary_encoding.rb ADDED Viewed

@@ -0,0 +1,42 @@
+require_relative 'simple'
+module Selector
+  #
+  # Selector which uses a n-gram dictionary to generate feature vectors
+  #
+  # @author Andreas Eger
+  #
+  class WithBinaryEncoding < Selector::Simple
+    CLASSIFICATIONS_SIZE = {
+          function: 8,        # max id 255, currently 19
+          industry: 16,       # max id 65535, currently 14370
+          career_level: 4 }   # max id 15, currently 8
+    def initialize args={}
+      super
+    end
+    def label
+      "simple-WithBinaryEncoding"
+    end
+    private
+    #
+    # creates the classification specific part of the feature vector
+    # @param  ids [Hash] hash with classification ids
+    #
+    # @return [Array<Integer>] binary encoded classification id
+    def classification_array(ids, classification)
+      id = ids[classification]
+      number_to_binary_array(id, CLASSIFICATIONS_SIZE[classification])
+    end
+    def number_to_binary_array(number, size=8)
+      a=[]
+      (size-1).downto(0) do |i|
+        a<<number[i]
+      end
+      a
+    end
+  end
+end

data/lib/svm_helper/stopwords/de ADDED Viewed

@@ -0,0 +1,127 @@
+aber
+als
+am
+an
+auch
+auf
+aus
+bei
+bin
+bis
+bist
+da
+dadurch
+daher
+darum
+das
+daß
+dass
+dein
+deine
+dem
+den
+der
+des
+dessen
+deshalb
+die
+dies
+dieser
+dieses
+doch
+dort
+du
+durch
+ein
+eine
+einem
+einen
+einer
+eines
+er
+es
+euer
+eure
+für
+hatte
+hatten
+hattest
+hattet
+hier  hinter
+ich
+ihr
+ihre
+im
+in
+ist
+ja
+jede
+jedem
+jeden
+jeder
+jedes
+jener
+jenes
+jetzt
+kann
+kannst
+können
+könnt
+machen
+mein
+meine
+mit
+muß
+mußt
+musst
+müssen
+müßt
+nach
+nachdem
+nein
+nicht
+nun
+oder
+seid
+sein
+seine
+sich
+sie
+sind
+soll
+sollen
+sollst
+sollt
+sonst
+soweit
+sowie
+und
+unser unsere
+unter
+vom
+von
+vor
+wann
+warum
+was
+weiter
+weitere
+wenn
+wer
+werde
+werden
+werdet
+weshalb
+wie
+wieder
+wieso
+wir
+wird
+wirst
+wo
+woher
+wohin
+zu
+zum
+zur
+über

data/lib/svm_helper/stopwords/en ADDED Viewed

@@ -0,0 +1,119 @@
+a
+able
+about
+across
+after
+all
+almost
+also
+am
+among
+an
+and
+any
+are
+as
+at
+be
+because
+been
+but
+by
+can
+cannot
+could
+dear
+did
+do
+does
+either
+else
+ever
+every
+for
+from
+get
+got
+had
+has
+have
+he
+her
+hers
+him
+his
+how
+however
+i
+if
+in
+into
+is
+it
+its
+just
+least
+let
+like
+likely
+may
+me
+might
+most
+must
+my
+neither
+no
+nor
+not
+of
+off
+often
+on
+only
+or
+other
+our
+own
+rather
+said
+say
+says
+she
+should
+since
+so
+some
+than
+that
+the
+their
+them
+then
+there
+these
+they
+this
+tis
+to
+too
+twas
+us
+wants
+was
+we
+were
+what
+when
+where
+which
+while
+who
+whom
+why
+will
+with
+would
+yet
+you
+your