svm_helper 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +7 -0
  2. data/Gemfile +1 -0
  3. data/Guardfile +1 -1
  4. data/lib/svm_helper.rb +1 -2
  5. data/lib/svm_helper/parallel_helper.rb +24 -0
  6. data/lib/svm_helper/preprocessors.rb +1 -0
  7. data/lib/svm_helper/preprocessors/simple.rb +31 -22
  8. data/lib/svm_helper/preprocessors/stemming.rb +31 -0
  9. data/lib/svm_helper/selectors.rb +4 -2
  10. data/lib/svm_helper/selectors/bi_normal_seperation.rb +86 -0
  11. data/lib/svm_helper/selectors/bns_ig.rb +50 -0
  12. data/lib/svm_helper/selectors/calc.rb +71 -0
  13. data/lib/svm_helper/selectors/information_gain.rb +49 -0
  14. data/lib/svm_helper/selectors/simple.rb +80 -40
  15. data/lib/svm_helper/stopwords/fr +124 -0
  16. data/lib/svm_helper/version.rb +1 -1
  17. data/spec/factories.rb +4 -3
  18. data/spec/support/selector_spec.rb +2 -4
  19. data/spec/svm_helper/parallel_helper_spec.rb +17 -0
  20. data/spec/{preprocessors → svm_helper/preprocessors}/id_mapping_spec.rb +0 -0
  21. data/spec/{preprocessors → svm_helper/preprocessors}/simple_spec.rb +17 -6
  22. data/spec/svm_helper/preprocessors/stemming_spec.rb +11 -0
  23. data/spec/svm_helper/selectors/bi_normal_seperation_spec.rb +35 -0
  24. data/spec/svm_helper/selectors/bns_ig_spec.rb +5 -0
  25. data/spec/svm_helper/selectors/calc_spec.rb +42 -0
  26. data/spec/svm_helper/selectors/information_gain_spec.rb +5 -0
  27. data/spec/{selectors → svm_helper/selectors}/n_gram_spec.rb +2 -4
  28. data/spec/{selectors → svm_helper/selectors}/simple_spec.rb +15 -4
  29. data/spec/{selectors → svm_helper/selectors}/with_binary_encoding_spec.rb +2 -3
  30. data/svm_helper.gemspec +1 -1
  31. metadata +39 -32
  32. data/lib/svm_helper/selectors/n_gram.rb +0 -31
  33. data/lib/svm_helper/selectors/with_binary_encoding.rb +0 -41
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 377f21c5f2bb4431166019336b71ad3892bc35ac
4
+ data.tar.gz: 02973ce1db9e6720bbe216649b533e7f5b9d35c9
5
+ SHA512:
6
+ metadata.gz: 818e5bdb6fbfb12e3ca7a0a2f19a1dae46c63646ddfb79eccba9cdc3ba5906d13e004c5bea5cef24099ea8c75a04d14a619e13fa9cfe351c9777439d056da2cc
7
+ data.tar.gz: 176044f5c9662e590855152576dee2d4f00da1a7cf123001ed9cbce5eca1624571c90494302390e8860e75ba2f83e158ab986a509b6a58f263f6ef225dfcd0c8
data/Gemfile CHANGED
@@ -21,4 +21,5 @@ group :test do
21
21
  gem 'rake'
22
22
  gem 'mocha', require: 'mocha/api'
23
23
  gem 'factory_girl', '~> 4.0'
24
+ gem 'parallel', require: false
24
25
  end
data/Guardfile CHANGED
@@ -1,7 +1,7 @@
1
1
  guard 'rspec', cli: "--color --format p", all_after_pass: false do
2
2
  # guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
3
3
  watch(%r{^spec/.+_spec\.rb$})
4
- watch(%r{^lib/svm_helper/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
4
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
5
5
  watch('spec/spec_helper.rb') { 'spec' }
6
6
  watch('spec/factories.rb') { 'spec' }
7
7
  watch(%r{^spec/factories/(.+)\.rb}) { 'spec' }
@@ -1,8 +1,7 @@
1
1
  require "svm_helper/version"
2
- require 'parallel'
3
2
 
3
+ require "svm_helper/parallel_helper"
4
4
  require "svm_helper/preprocessed_data"
5
5
  require "svm_helper/feature_vector"
6
6
  require "svm_helper/preprocessors"
7
7
  require "svm_helper/selectors"
8
-
@@ -0,0 +1,24 @@
1
+ module ParallelHelper
2
+ THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
3
+ def p_map_with_index data, &block
4
+ if parallel? && RUBY_PLATFORM == 'java'
5
+ Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
6
+ elsif parallel?
7
+ Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
8
+ else
9
+ data.map.with_index {|e,i| yield e,i }
10
+ end
11
+ end
12
+ def p_map data, &block
13
+ if parallel? && RUBY_PLATFORM == 'java'
14
+ Parallel.map(data, in_threads: THREAD_COUNT ){|e| yield e }
15
+ elsif parallel?
16
+ Parallel.map(data, in_processes: THREAD_COUNT ){|e| yield e }
17
+ else
18
+ data.map {|e| yield e }
19
+ end
20
+ end
21
+ def parallel?
22
+ defined?(Parallel) == 'constant' && @parallel
23
+ end
24
+ end
@@ -1,2 +1,3 @@
1
1
  require_relative 'preprocessors/simple'
2
+ require_relative 'preprocessors/stemming'
2
3
  require_relative 'preprocessors/id_mapping'
@@ -6,7 +6,7 @@ module Preprocessor
6
6
  # @author Andreas Eger
7
7
  #
8
8
  class Simple
9
- THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
9
+ include ::ParallelHelper
10
10
  # filters most gender stuff
11
11
  GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
12
12
  # filters most wierd symbols
@@ -25,8 +25,16 @@ module Preprocessor
25
25
  # filter for used job tokens
26
26
  CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
27
27
 
28
+ # stopword file
29
+ #TODO use File.expand_path
30
+ STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
31
+ attr_accessor :language
32
+
33
+
28
34
  def initialize args={}
35
+ @language = args.fetch(:language){'en'}
29
36
  @parallel = args.fetch(:parallel){false}
37
+ @stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split
30
38
  end
31
39
 
32
40
  def label
@@ -48,12 +56,20 @@ module Preprocessor
48
56
  # @return [Array<PreprocessedData>] list of processed job data - or singe job data
49
57
  def process jobs
50
58
  if jobs.is_a? Array
51
- process_jobs jobs
59
+ p_map(jobs) {|job| process_job job }
52
60
  else
53
61
  process_job jobs
54
62
  end
55
63
  end
56
64
 
65
+ #
66
+ # loads a txt file with stop words
67
+ # @param location String folder with stopword lists
68
+ #
69
+ # @return [Array<String>] Array of stopwords
70
+ def strip_stopwords(text)
71
+ (text.split - @stopwords).delete_if { |e| e.size <= 2 }
72
+ end
57
73
 
58
74
  #
59
75
  # converts string into a cleaner version
@@ -75,29 +91,22 @@ module Preprocessor
75
91
  #
76
92
  # @return [String] clean and lowercase version of input
77
93
  def clean_description desc
78
- desc.gsub(XML_TAG_FILTER,' ')
79
- .gsub(EMAIL_FILTER,'')
80
- .gsub(URL_FILTER,'')
81
- .gsub(GENDER_FILTER,'')
82
- .gsub(NEW_LINES,'')
83
- .gsub(SYMBOL_FILTER,' ')
84
- .gsub(WHITESPACE,' ')
85
- .gsub(WORDS_IN_BRACKETS, '\1')
86
- .gsub(CODE_TOKEN_FILTER,'')
87
- .downcase
88
- .strip
94
+ strip_stopwords(
95
+ desc.gsub(XML_TAG_FILTER,' ')
96
+ .gsub(EMAIL_FILTER,'')
97
+ .gsub(URL_FILTER,'')
98
+ .gsub(GENDER_FILTER,'')
99
+ .gsub(NEW_LINES,'')
100
+ .gsub(SYMBOL_FILTER,' ')
101
+ .gsub(WHITESPACE,' ')
102
+ .gsub(WORDS_IN_BRACKETS, '\1')
103
+ .gsub(CODE_TOKEN_FILTER,'')
104
+ .downcase
105
+ .strip
106
+ )
89
107
  end
90
108
 
91
109
  private
92
- def process_jobs jobs
93
- if @parallel && RUBY_PLATFORM == 'java'
94
- Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
95
- elsif @parallel
96
- Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
97
- else
98
- jobs.map {|job| process_job job }
99
- end
100
- end
101
110
 
102
111
  def process_job job
103
112
  PreprocessedData.new(
@@ -0,0 +1,31 @@
1
+ require_relative 'simple'
2
+ require 'lingua/stemmer'
3
+ module Preprocessor
4
+ #
5
+ # Preprocessor Base Class
6
+ #
7
+ # @author Andreas Eger
8
+ #
9
+ class Stemming < Simple
10
+
11
+ def initialize(args={})
12
+ super
13
+ @stemmer = Lingua::Stemmer.new(language: @language)
14
+ end
15
+ def label
16
+ "with_stemming"
17
+ end
18
+
19
+ def clean_description desc
20
+ super.map{|w| @stemmer.stem(w) }
21
+ end
22
+ private
23
+ def process_job job
24
+ PreprocessedData.new(
25
+ data: [clean_title(job[:title]), clean_description(job[:description])],
26
+ id: job[:id],
27
+ label: job[:label]
28
+ )
29
+ end
30
+ end
31
+ end
@@ -1,3 +1,5 @@
1
1
  require_relative 'selectors/simple'
2
- require_relative 'selectors/n_gram'
3
- require_relative 'selectors/with_binary_encoding'
2
+ require_relative 'selectors/calc'
3
+ require_relative 'selectors/bi_normal_seperation'
4
+ require_relative 'selectors/information_gain'
5
+ require_relative 'selectors/bns_ig'
@@ -0,0 +1,86 @@
1
+ require_relative 'simple'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class BiNormalSeperation < Selector::Simple
8
+ include BNS
9
+
10
+ def label
11
+ "BiNormalSeperation"
12
+ end
13
+
14
+ def initialize classification, args={}
15
+ super
16
+ @word_selection = args.fetch(:word_selection){ :grams1_2 }
17
+ end
18
+ #
19
+ # generates a list of feature vetors and their labels from preprocessed data
20
+ # @param data_set [Array<PreprocessedData>] list of preprocessed data
21
+ # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
22
+ # @param dictionary_size [Integer] Size of a dictionary to create if non exists
23
+ #
24
+ # @return [Array<FeatureVector>] list of feature vectors and labels
25
+ def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
26
+ words_and_label_per_data = extract_words data_set, true
27
+ generate_global_dictionary words_and_label_per_data, dictionary_size
28
+
29
+ words_per_data = words_and_label_per_data.map(&:features)
30
+ p_map_with_index(words_per_data) do |words,index|
31
+ word_set = words.uniq
32
+ make_vector word_set, data_set[index]
33
+ end
34
+ end
35
+
36
+ #
37
+ # generates a list of words used as dictionary
38
+ # @param all_words (see #extract_words)
39
+ # @param size dictionary size
40
+ #
41
+ # @return [Array<String>] list of words
42
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
43
+ return unless global_dictionary.empty?
44
+
45
+ label_counts = [0,0]
46
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
47
+ label = bag.label ? 1 : 0
48
+ label_counts[label] += 1
49
+ # only count a feature once per bag
50
+ bag.features.uniq.each do |word|
51
+ unless accumulator.has_key?(word)
52
+ accumulator[word] = [0,0]
53
+ end
54
+ accumulator[word][label] += 1
55
+ end
56
+ accumulator
57
+ end
58
+ neg, pos = label_counts
59
+ words = p_map(features) do |word, counts|
60
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
61
+ bns = bi_normal_seperation(pos, neg, *counts)
62
+ [word, bns.abs]
63
+ end
64
+ @global_dictionary = words.compact
65
+ .sort_by{|e| e[1]}
66
+ .last(size)
67
+ .map{|e| e[0] }
68
+ end
69
+
70
+ def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
71
+ words_per_data = extract_words data_set, true
72
+ generate_global_dictionary words_per_data, dictionary_size
73
+ end
74
+ #
75
+ # extracts the words of all provided data entries
76
+ # @param data_set [Array<PreprocessedData>] list of preprocessed data
77
+ # @param keep_label
78
+ #
79
+ # @return [Array<OpenStruct<Array<String>,Boolean>>] list of words per data entry
80
+ def extract_words data_set, keep_label=false
81
+ data_set.map do |data|
82
+ extract_words_from_data data, keep_label
83
+ end
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,50 @@
1
+ require_relative 'bi_normal_seperation'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class BNS_IG < Selector::BiNormalSeperation
8
+ include IG
9
+
10
+ def label
11
+ "BiNormalSeperation_InformationGain"
12
+ end
13
+
14
+ #
15
+ # generates a list of words used as dictionary
16
+ # @param all_words (see #extract_words)
17
+ # @param size dictionary size
18
+ #
19
+ # @return [Array<String>] list of words
20
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
21
+ return unless global_dictionary.empty?
22
+
23
+ label_counts = [0,0]
24
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
25
+ label = bag.label ? 1 : 0
26
+ label_counts[label] += 1
27
+ # only count a feature once per bag
28
+ bag.features.uniq.each do |word|
29
+ unless accumulator.has_key?(word)
30
+ accumulator[word] = [0,0]
31
+ end
32
+ accumulator[word][label] += 1
33
+ end
34
+ accumulator
35
+ end
36
+ neg, pos = label_counts
37
+ words = p_map(features) do |word, counts|
38
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
39
+ bns = bi_normal_seperation(pos, neg, *counts)
40
+ ig = information_gain(pos, neg, *counts)
41
+ # use geometric mean of BNS and IG
42
+ [word, Math.sqrt(bns.abs * ig.abs)]
43
+ end
44
+ @global_dictionary = words.compact
45
+ .sort_by{|e| e[1]}
46
+ .last(size)
47
+ .map{|e| e[0] }
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,71 @@
1
+ module Selector
2
+ module IG
3
+ def information_gain(pos, neg, tp, fp)
4
+ fn = neg - fp
5
+ tn = pos - tp
6
+ p_word = (tp + fp).quo(pos + neg)
7
+
8
+ e(pos, neg) - (p_word * e(tp, fp) + (1 - p_word) * e(fn, tn))
9
+ end
10
+ def e(x,y)
11
+ -xlx(x.quo(x+y)) -xlx(y.quo(x+y))
12
+ end
13
+ def xlx(x)
14
+ x * Math.log2(x)
15
+ end
16
+ end
17
+ module BNS
18
+ SQR2 = Math.sqrt(2)
19
+ SQR2PI = Math.sqrt(2.0*Math::PI)
20
+
21
+ def bi_normal_seperation pos, neg, tp, fp
22
+ false_prositive_rate = fp.quo(neg)
23
+ true_prositive_rate = tp.quo(pos)
24
+ bns = cdf_inverse(true_prositive_rate) - cdf_inverse(false_prositive_rate)
25
+ end
26
+ # standard normal cumulative distribution function
27
+ def cdf(z)
28
+ 0.5 * (1.0 + Math.erf( z.quo(SQR2) ) )
29
+ end
30
+
31
+ # inverse standard normal cumulative distribution function
32
+ # http://home.online.no/~pjacklam/notes/invnorm
33
+
34
+ # Coefficients in rational approximations.
35
+ A = [0, -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00]
36
+ B = [0, -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01]
37
+ C = [0, -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00]
38
+ D = [0, 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00]
39
+ # Define break-points.
40
+ P_LOW = 0.02425
41
+ P_HIGH = 1.0 - P_LOW
42
+
43
+ def cdf_inverse(p)
44
+ return 0.0 if p < 0 || p > 1 || p == 0.5
45
+ x = 0.0
46
+
47
+ if 0.0 < p && p < P_LOW
48
+ # Rational approximation for lower region.
49
+ q = Math.sqrt(-2.0*Math.log(p))
50
+ x = (((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
51
+ ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
52
+ elsif P_LOW <= p && p <= P_HIGH
53
+ # Rational approximation for central region.
54
+ q = p - 0.5
55
+ r = q*q
56
+ x = (((((A[1]*r+A[2])*r+A[3])*r+A[4])*r+A[5])*r+A[6])*q /
57
+ (((((B[1]*r+B[2])*r+B[3])*r+B[4])*r+B[5])*r+1.0)
58
+ elsif P_HIGH < p && p < 1.0
59
+ # Rational approximation for upper region.
60
+ q = Math.sqrt(-2.0*Math.log(1.0-p))
61
+ x = -(((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
62
+ ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
63
+ end
64
+ if 0 < p && p < 1
65
+ u = cdf(p) * SQR2PI * Math.exp((x**2.0)/2.0)
66
+ x = x - u/(1.0 + x*u/2.0)
67
+ end
68
+ x
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,49 @@
1
+ require_relative 'bi_normal_seperation'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class InformationGain < Selector::BiNormalSeperation
8
+ include IG
9
+
10
+ def label
11
+ "InformationGain"
12
+ end
13
+
14
+ #
15
+ # generates a list of words used as dictionary
16
+ # @param all_words (see #extract_words)
17
+ # @param size dictionary size
18
+ #
19
+ # @return [Array<String>] list of words
20
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
21
+ return unless global_dictionary.empty?
22
+
23
+ label_counts = [0,0]
24
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
25
+ label = bag.label ? 1 : 0
26
+ label_counts[label] += 1
27
+ # only count a feature once per bag
28
+ bag.features.uniq.each do |word|
29
+ unless accumulator.has_key?(word)
30
+ accumulator[word] = [0,0]
31
+ end
32
+ accumulator[word][label] += 1
33
+ end
34
+ accumulator
35
+ end
36
+ neg, pos = label_counts
37
+ words = p_map(features) do |word, counts|
38
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
39
+ tp, fp = counts
40
+ ig = information_gain(pos, neg, tp, fp)
41
+ [word, ig.abs]
42
+ end
43
+ @global_dictionary = words.compact
44
+ .sort_by{|e| e[1]}
45
+ .last(size)
46
+ .map{|e| e[0] }
47
+ end
48
+ end
49
+ end
@@ -5,31 +5,20 @@ module Selector
5
5
  # @author Andreas Eger
6
6
  #
7
7
  class Simple
8
- THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
9
- # stopword file
10
- #TODO use File.expand_path
11
- STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
8
+ include ::ParallelHelper
12
9
  # default dictionary size
13
10
  DEFAULT_DICTIONARY_SIZE = 800
14
11
 
15
- CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
16
- { function: Pjpp::Function.count,
17
- industry: Pjpp::Industry.count,
18
- career_level: Pjpp::CareerLevel.count }
19
- else
20
- { function: 19, # 1..19
21
- industry: 632, # 1..14370 but not all ids used
22
- career_level: 8 } # 1..8
23
- end
24
-
25
-
26
-
27
12
  attr_accessor :global_dictionary
28
-
13
+ attr_reader :classification_encoding,
14
+ :gram_size,
15
+ :word_selection
29
16
  def initialize classification, args={}
30
17
  @classification = classification
31
18
  @global_dictionary = args.fetch(:global_dictionary) {[]}
32
- @language = args.fetch(:language){'en'}
19
+ @classification_encoding = args.fetch(:classification_encoding){:bitmap}
20
+ @word_selection = args.fetch(:word_selection){ :single }
21
+ @gram_size = args.fetch(:gram_size) { 1 }
33
22
  @parallel = args.fetch(:parallel){false}
34
23
  end
35
24
 
@@ -48,7 +37,7 @@ module Selector
48
37
  words_per_data = extract_words data_set
49
38
  generate_global_dictionary words_per_data, dictionary_size
50
39
 
51
- make_vectors(words_per_data) do |words,index|
40
+ p_map_with_index(words_per_data) do |words,index|
52
41
  word_set = words.uniq
53
42
  make_vector word_set, data_set[index]
54
43
  end
@@ -66,15 +55,6 @@ module Selector
66
55
  make_vector word_set, data, dictionary
67
56
  end
68
57
 
69
- #
70
- # loads a txt file with stop words
71
- # @param location String folder with stopword lists
72
- #
73
- # @return [Array<String>] Array of stopwords
74
- def stopwords(location=STOPWORD_LOCATION)
75
- @stopwords ||= IO.read(File.join(location,@language)).split
76
- end
77
-
78
58
  #
79
59
  # generates a list of words used as dictionary
80
60
  # @param all_words (see #extract_words)
@@ -90,6 +70,10 @@ module Selector
90
70
  @global_dictionary = words.last(size).map(&:first).reverse
91
71
  end
92
72
 
73
+ def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
74
+ words_per_data = extract_words data_set
75
+ generate_global_dictionary words_per_data, dictionary_size
76
+ end
93
77
  #
94
78
  # extracts the words of all provided data entries
95
79
  # @param data_set [Array<PreprocessedData>] list of preprocessed data
@@ -107,7 +91,46 @@ module Selector
107
91
  #
108
92
  # @return [Array<String>] list of words
109
93
  def extract_words_from_data data
110
- (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
94
+ words = (data.data.flat_map(&:split) - stopwords)
95
+ .delete_if { |e| e.size <= 2 }
96
+ if gram_size > 1
97
+ words = words.each_cons(@gram_size).map{|e| e.join " " }
98
+ end
99
+ words
100
+ end
101
+
102
+ #
103
+ # fetches all words and two word phrases from one data entry, removes stopwords and very short words
104
+ # @param data [PreprocessedData] preprocessed data entry
105
+ # @param keep_label
106
+ #
107
+ # @return [OpenStruct<Array<String>,Boolean>] list of words
108
+ def extract_words_from_data data, keep_label=false
109
+ # assume the first token is the title an preserve it
110
+ title, *words = data.data.flatten
111
+ features = case word_selection
112
+ when :grams
113
+ words.each_cons(@gram_size).map{|e| e.join " " }
114
+ when :grams1_2
115
+ words + words.each_cons(2).map{|e| e.join " " }
116
+ when :grams1_2_3
117
+ words +
118
+ words.each_cons(2).map{|e| e.join " " } +
119
+ words.each_cons(3).map{|e| e.join " " }
120
+ when :grams1_2_3_4
121
+ words +
122
+ words.each_cons(2).map{|e| e.join " " } +
123
+ words.each_cons(3).map{|e| e.join " " } +
124
+ words.each_cons(4).map{|e| e.join " " }
125
+ else
126
+ words
127
+ end
128
+ features.unshift(title)
129
+ return features unless keep_label
130
+ OpenStruct.new(
131
+ features: features,
132
+ label: data.label
133
+ )
111
134
  end
112
135
 
113
136
  def reset classification
@@ -135,23 +158,40 @@ module Selector
135
158
  )
136
159
  end
137
160
 
138
- def make_vectors data, &block
139
- if @parallel && RUBY_PLATFORM == 'java'
140
- Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
141
- elsif @parallel
142
- Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
143
- else
144
- data.map.with_index {|e,i| yield e,i }
145
- end
146
- end
161
+ BITMAP_ARRAY_SIZES= if defined?(Pjpp) == 'constant'
162
+ { function: Pjpp::Function.count,
163
+ industry: Pjpp::Industry.count,
164
+ career_level: Pjpp::CareerLevel.count }
165
+ else
166
+ { function: 19, # 1..19
167
+ industry: 632, # 1..14370 but not all ids used
168
+ career_level: 8 } # 1..8
169
+ end
147
170
 
171
+ BINARY_ARRAY_SIZES = {
172
+ function: 8, # max id 255, currently 19
173
+ industry: 16, # max id 65535, currently 14370
174
+ career_level: 4 } # max id 15, currently 8
148
175
  #
149
176
  # creates the classification specific part of the feature vector
150
177
  # @param ids [Hash] hash with classification ids
151
178
  #
152
179
  # @return [Array<Integer>] list of size=count(classifcation_ids) with only one not zero item
153
180
  def classification_array(id)
154
- Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0}
181
+ case @classification_encoding
182
+ when :binary
183
+ number_to_binary_array(id, BINARY_ARRAY_SIZES[@classification])
184
+ else # :bitmap
185
+ Array.new(BITMAP_ARRAY_SIZES[@classification]){|n| n==(id-1) ? 1 : 0}
186
+ end
187
+ end
188
+
189
+ def number_to_binary_array(number, size=8)
190
+ a=[]
191
+ (size-1).downto(0) do |i|
192
+ a<<number[i]
193
+ end
194
+ a
155
195
  end
156
196
  end
157
- end
197
+ end
@@ -0,0 +1,124 @@
1
+ alors
2
+ au
3
+ aucuns
4
+ aussi
5
+ autre
6
+ avant
7
+ avec
8
+ avoir
9
+ bon
10
+ car
11
+ ce
12
+ cela
13
+ ces
14
+ ceux
15
+ chaque
16
+ ci
17
+ comme
18
+ comment
19
+ dans
20
+ des
21
+ du
22
+ dedans
23
+ dehors
24
+ depuis
25
+ deux
26
+ devrait
27
+ doit
28
+ donc
29
+ dos
30
+ droite
31
+ début
32
+ elle
33
+ elles
34
+ en
35
+ encore
36
+ essai
37
+ est
38
+ et
39
+ eu
40
+ fait
41
+ faites
42
+ fois
43
+ font
44
+ force
45
+ haut
46
+ hors
47
+ ici
48
+ il
49
+ ils
50
+ je juste
51
+ la
52
+ le
53
+ les
54
+ leur
55
+
56
+ ma
57
+ maintenant
58
+ mais
59
+ mes
60
+ mine
61
+ moins
62
+ mon
63
+ mot
64
+ même
65
+ ni
66
+ nommés
67
+ notre
68
+ nous
69
+ nouveaux
70
+ ou
71
+
72
+ par
73
+ parce
74
+ parole
75
+ pas
76
+ personnes
77
+ peut
78
+ peu
79
+ pièce
80
+ plupart
81
+ pour
82
+ pourquoi
83
+ quand
84
+ que
85
+ quel
86
+ quelle
87
+ quelles
88
+ quels
89
+ qui
90
+ sa
91
+ sans
92
+ ses
93
+ seulement
94
+ si
95
+ sien
96
+ son
97
+ sont
98
+ sous
99
+ soyez sujet
100
+ sur
101
+ ta
102
+ tandis
103
+ tellement
104
+ tels
105
+ tes
106
+ ton
107
+ tous
108
+ tout
109
+ trop
110
+ très
111
+ tu
112
+ valeur
113
+ voie
114
+ voient
115
+ vont
116
+ votre
117
+ vous
118
+ vu
119
+ ça
120
+ étaient
121
+ état
122
+ étions
123
+ été
124
+ être
@@ -1,3 +1,3 @@
1
1
  module SvmHelper
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.1"
3
3
  end
@@ -18,14 +18,15 @@ FactoryGirl.define do
18
18
 
19
19
 
20
20
  factory :data, class: PreprocessedData do
21
- data ["haus fooo garten baaz pferd fooo"]
21
+ data %w(haus fooo garten baaz pferd fooo)
22
22
  id 7
23
23
  label true
24
24
  end
25
25
  factory :data_w_short_words, parent: :data do
26
- data ["auto foo pferd bz gooo fooo 2"]
26
+ data %w(auto pferd gooo fooo)
27
+ label false
27
28
  end
28
29
  factory :data_w_multiple_sections, parent: :data do
29
- data ["meeh foo auto","bz baaz fooo 2"]
30
+ data [%w(meeh auto),%w(baaz fooo)]
30
31
  end
31
32
  end
@@ -13,9 +13,7 @@ shared_examples_for 'a selector' do
13
13
  [0,1].should include(e)
14
14
  end
15
15
  end
16
- it "should be able to process multiple data entries at once" do
17
- selector.generate_vectors([data]).each do |e|
18
- e.should == selector.generate_vector(data)
19
- end
16
+ it "should respond to generate_vectors" do
17
+ selector.should respond_to(:generate_vectors)
20
18
  end
21
19
  end
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+ require 'parallel'
3
+
4
+ include ParallelHelper
5
+ describe ParallelHelper do
6
+ let(:data) { (1..20).to_a }
7
+ context "parallel map" do
8
+ it "should return as a normal map" do
9
+ p_map(data){|e| e**2 }.should == data.map{|e| e**2 }
10
+ end
11
+ end
12
+ context "parallel map with index" do
13
+ it "should return as a normal map with index" do
14
+ p_map_with_index(data){|e,i| e*i }.should == data.map.with_index{|e,i| e*i }
15
+ end
16
+ end
17
+ end
@@ -41,6 +41,7 @@ describe Preprocessor::Simple do
41
41
  end
42
42
  end
43
43
 
44
+
44
45
  context "#clean_title" do
45
46
  it "should be downcased" do
46
47
  job = FactoryGirl.build(:job_title_downcasing)
@@ -75,31 +76,41 @@ describe Preprocessor::Simple do
75
76
  FactoryGirl.build(:job_description_w_code_token),
76
77
  FactoryGirl.build(:job_description_w_gender) ]
77
78
  }
79
+ it "should call strip_stopwords" do
80
+ simple.expects(:strip_stopwords)
81
+ simple.clean_description(jobs[0][:description])
82
+ end
78
83
  it "should remove html/xml tags" do
79
- desc = simple.clean_description(jobs[0][:description])
84
+ desc = simple.clean_description(jobs[0][:description]).join ' '
80
85
  desc.should_not match(/<(.*?)>/)
81
86
  end
82
87
  it "should remove new lines" do
83
- desc = simple.clean_description(jobs[0][:description])
88
+ desc = simple.clean_description(jobs[0][:description]).join ' '
84
89
  desc.should_not match(/\r\n|\n|\r/)
85
90
  end
86
91
  it "should remove all special characters" do
87
- desc = simple.clean_description(jobs[2][:description])
92
+ desc = simple.clean_description(jobs[2][:description]).join ' '
88
93
  desc.should_not match(/[^a-z öäü]/i)
89
94
  end
90
95
  it "should remove gender tokens" do
91
- desc = simple.clean_description(jobs[3][:description])
96
+ desc = simple.clean_description(jobs[3][:description]).join ' '
92
97
  desc.should_not match(%r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)})
93
98
  end
94
99
  it "should remove job code token" do
95
- desc = simple.clean_description(jobs[4][:description])
100
+ desc = simple.clean_description(jobs[4][:description]).join ' '
96
101
  desc.should_not match(/\[.*\]|\(.*\)|\{.*\}|\d+\w+/)
97
102
  end
98
103
  it "should be downcased" do
99
- desc = simple.clean_description(jobs[2][:description])
104
+ desc = simple.clean_description(jobs[2][:description]).join ' '
100
105
  desc.should_not match(/[^a-z öäü]/)
101
106
  end
102
107
  end
108
+
109
+ context "strip_stopwords" do
110
+ it "should remove words like 'and' from the text" do
111
+ simple.strip_stopwords("Dogs and cats").should == %w(Dogs cats)
112
+ end
113
+ end
103
114
  context "parallel" do
104
115
  let(:parallel) { Preprocessor::Simple.new(parallel: true) }
105
116
 
@@ -0,0 +1,11 @@
1
+ require 'spec_helper'
2
+
3
+ describe Preprocessor::Stemming do
4
+ it_behaves_like 'a preprocessor'
5
+ let(:preprocessor) { Preprocessor::Stemming.new }
6
+ let(:job) { FactoryGirl.build(:job) }
7
+ let(:jobs) { [job] }
8
+ it "should reduce words to their stem" do
9
+ preprocessor.clean_description("developer engineering").should == %w(develop engin)
10
+ end
11
+ end
@@ -0,0 +1,35 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::BiNormalSeperation do
4
+ it_behaves_like 'a selector'
5
+
6
+ let(:bns) { Selector::BiNormalSeperation.new(:function) }
7
+ context "#extract_words_from_data" do
8
+ it "should generate a list of words from the data" do
9
+ words = bns.extract_words_from_data(FactoryGirl.build(:data))
10
+ words.should have(10).things
11
+ end
12
+ it "should remove words with 3 characters or less" do
13
+ words = bns.extract_words_from_data(FactoryGirl.build(:data_w_short_words))
14
+ words.should have(6).things
15
+ end
16
+ it "should process multiple sections in the data" do
17
+ words = bns.extract_words_from_data(FactoryGirl.build(:data_w_multiple_sections))
18
+ words.should have(6).things
19
+ end
20
+ end
21
+ context "#generate_global_dictionary" do
22
+ let(:data) { [FactoryGirl.build_list(:data,1),
23
+ FactoryGirl.build_list(:data_w_short_words,4),
24
+ FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
25
+ let(:words_per_data) { bns.extract_words(data,true) }
26
+ it "should return a list of n words" do
27
+ bns.generate_global_dictionary(words_per_data,2)
28
+ bns.global_dictionary.should have(2).things
29
+ end
30
+ it "should return a list of the n most used words in the data array" do
31
+ bns.generate_global_dictionary(words_per_data,3)
32
+ bns.global_dictionary.should eq(%w(fooo auto pferd))
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,5 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::BNS_IG do
4
+ it_behaves_like 'a selector'
5
+ end
@@ -0,0 +1,42 @@
1
+ require 'spec_helper'
2
+
3
+ # just some very basic test to make sure these functions do not fail
4
+ describe "Calc" do
5
+ include Selector::IG
6
+ include Selector::BNS
7
+ let(:test_data){ [
8
+ [34, 23, 28, 17],
9
+ [31, 17, 23, 12],
10
+ [44, 39, 41, 36],
11
+ [44, 23, 41, 23],
12
+ [44, 39, 0, 36],
13
+ [44, 39, 41, 0],
14
+ [62, 81, 15, 73]
15
+ ]}
16
+
17
+ context Selector::IG do
18
+ it "should not fail" do
19
+ test_data.each do |data|
20
+ ->{information_gain(*data)}.should_not raise_error
21
+ end
22
+ end
23
+ it "should return some values" do
24
+ test_data.each do |data|
25
+ information_gain(*data).should be_a(Numeric)
26
+ end
27
+ end
28
+ end
29
+
30
+ context Selector::BNS do
31
+ it "should not fail" do
32
+ test_data.each do |data|
33
+ ->{bi_normal_seperation(*data)}.should_not raise_error
34
+ end
35
+ end
36
+ it "should return some values" do
37
+ test_data.each do |data|
38
+ bi_normal_seperation(*data).should be_a(Numeric)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,5 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::InformationGain do
4
+ it_behaves_like 'a selector'
5
+ end
@@ -1,9 +1,7 @@
1
1
  require "spec_helper"
2
2
 
3
- describe Selector::NGram do
4
- it_behaves_like 'a selector'
5
-
6
- let(:ngram) { Selector::NGram.new(:function, gram_size: 3) }
3
+ describe "n-grams" do
4
+ let(:ngram) { Selector::Simple.new(:function, word_selection: :grams, gram_size: 3) }
7
5
  context "#extract_words_from_data" do
8
6
  it "should generate a list of words from the data" do
9
7
  words = ngram.extract_words_from_data(FactoryGirl.build(:data))
@@ -7,9 +7,6 @@ describe Selector::Simple do
7
7
  it "should have select_feature_vector implemented" do
8
8
  expect { simple.generate_vectors([]) }.to_not raise_error
9
9
  end
10
- context "#stopwords" do
11
- it "simply loads them from a file"
12
- end
13
10
  context "#extract_words_from_data" do
14
11
  it "should generate a list of words from the data" do
15
12
  words = simple.extract_words_from_data(FactoryGirl.build(:data))
@@ -50,6 +47,19 @@ describe Selector::Simple do
50
47
  simple.global_dictionary.should eq(%w(fooo auto baaz))
51
48
  end
52
49
  end
50
+ context "#build_dictionary" do
51
+ let(:data) { [FactoryGirl.build_list(:data,1),
52
+ FactoryGirl.build_list(:data_w_short_words,2),
53
+ FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
54
+ it "should return a list of n words" do
55
+ simple.build_dictionary(data,2)
56
+ simple.global_dictionary.should have(2).things
57
+ end
58
+ it "should return a list of the n most used words in the data array" do
59
+ simple.build_dictionary(data,3)
60
+ simple.global_dictionary.should eq(%w(fooo auto baaz))
61
+ end
62
+ end
53
63
  context "#generate_vector" do
54
64
  let(:dictionary) { %w(auto pferd haus hase garten) }
55
65
  let(:data) { FactoryGirl.build(:data) }
@@ -109,6 +119,7 @@ describe Selector::Simple do
109
119
  context "parallel" do
110
120
  let(:parallel) { Selector::Simple.new(:function, parallel: true) }
111
121
  before(:each) do
122
+ require 'parallel'
112
123
  simple.stubs(:global_dictionary).returns(dictionary)
113
124
  parallel.stubs(:global_dictionary).returns(dictionary)
114
125
  end
@@ -119,4 +130,4 @@ describe Selector::Simple do
119
130
  end
120
131
  end
121
132
  end
122
- end
133
+ end
@@ -1,8 +1,7 @@
1
1
  require "spec_helper"
2
2
 
3
- describe Selector::WithBinaryEncoding do
4
- it_behaves_like 'a selector'
5
- let(:simple) { Selector::WithBinaryEncoding.new(:career_level) }
3
+ describe "binary encoded classification" do
4
+ let(:simple) { Selector::Simple.new(:career_level, classification_encoding: :binary) }
6
5
 
7
6
  let(:dictionary) { %w(auto pferd haus hase garten) }
8
7
  let(:data) { FactoryGirl.build(:data) }
@@ -17,5 +17,5 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
- gem.add_dependency('parallel', '~> 0.6.2')
20
+ gem.add_dependency "ruby-stemmer"
21
21
  end
metadata CHANGED
@@ -1,32 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: svm_helper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
5
- prerelease:
4
+ version: 0.2.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Andreas Eger
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-03-15 00:00:00.000000000 Z
11
+ date: 2013-04-25 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
- name: parallel
14
+ name: ruby-stemmer
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - '>='
20
18
  - !ruby/object:Gem::Version
21
- version: 0.6.2
19
+ version: '0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ~>
24
+ - - '>='
28
25
  - !ruby/object:Gem::Version
29
- version: 0.6.2
26
+ version: '0'
30
27
  description: Shared helper classes for usage in context of SVM at experteer
31
28
  email:
32
29
  - dev@eger-andreas.de
@@ -48,16 +45,21 @@ files:
48
45
  - lib/svm_helper.rb
49
46
  - lib/svm_helper/feature_vector.rb
50
47
  - lib/svm_helper/interface_helper.rb
48
+ - lib/svm_helper/parallel_helper.rb
51
49
  - lib/svm_helper/preprocessed_data.rb
52
50
  - lib/svm_helper/preprocessors.rb
53
51
  - lib/svm_helper/preprocessors/id_mapping.rb
54
52
  - lib/svm_helper/preprocessors/simple.rb
53
+ - lib/svm_helper/preprocessors/stemming.rb
55
54
  - lib/svm_helper/selectors.rb
56
- - lib/svm_helper/selectors/n_gram.rb
55
+ - lib/svm_helper/selectors/bi_normal_seperation.rb
56
+ - lib/svm_helper/selectors/bns_ig.rb
57
+ - lib/svm_helper/selectors/calc.rb
58
+ - lib/svm_helper/selectors/information_gain.rb
57
59
  - lib/svm_helper/selectors/simple.rb
58
- - lib/svm_helper/selectors/with_binary_encoding.rb
59
60
  - lib/svm_helper/stopwords/de
60
61
  - lib/svm_helper/stopwords/en
62
+ - lib/svm_helper/stopwords/fr
61
63
  - lib/svm_helper/version.rb
62
64
  - spec/factories.rb
63
65
  - spec/factories/jobs/tmp.html
@@ -65,44 +67,43 @@ files:
65
67
  - spec/factories/jobs/tmp3.html
66
68
  - spec/factories/jobs_with_description.rb
67
69
  - spec/factories/jobs_with_title.rb
68
- - spec/preprocessors/id_mapping_spec.rb
69
- - spec/preprocessors/simple_spec.rb
70
- - spec/selectors/n_gram_spec.rb
71
- - spec/selectors/simple_spec.rb
72
- - spec/selectors/with_binary_encoding_spec.rb
73
70
  - spec/spec_helper.rb
74
71
  - spec/support/preprocessor_spec.rb
75
72
  - spec/support/selector_spec.rb
73
+ - spec/svm_helper/parallel_helper_spec.rb
74
+ - spec/svm_helper/preprocessors/id_mapping_spec.rb
75
+ - spec/svm_helper/preprocessors/simple_spec.rb
76
+ - spec/svm_helper/preprocessors/stemming_spec.rb
77
+ - spec/svm_helper/selectors/bi_normal_seperation_spec.rb
78
+ - spec/svm_helper/selectors/bns_ig_spec.rb
79
+ - spec/svm_helper/selectors/calc_spec.rb
80
+ - spec/svm_helper/selectors/information_gain_spec.rb
81
+ - spec/svm_helper/selectors/n_gram_spec.rb
82
+ - spec/svm_helper/selectors/simple_spec.rb
83
+ - spec/svm_helper/selectors/with_binary_encoding_spec.rb
76
84
  - svm_helper.gemspec
77
85
  homepage: https://github.com/sch1zo/svm_helper
78
86
  licenses: []
87
+ metadata: {}
79
88
  post_install_message:
80
89
  rdoc_options: []
81
90
  require_paths:
82
91
  - lib
83
92
  required_ruby_version: !ruby/object:Gem::Requirement
84
- none: false
85
93
  requirements:
86
94
  - - '>='
87
95
  - !ruby/object:Gem::Version
88
96
  version: '0'
89
- segments:
90
- - 0
91
- hash: 2037039748537332986
92
97
  required_rubygems_version: !ruby/object:Gem::Requirement
93
- none: false
94
98
  requirements:
95
99
  - - '>='
96
100
  - !ruby/object:Gem::Version
97
101
  version: '0'
98
- segments:
99
- - 0
100
- hash: 2037039748537332986
101
102
  requirements: []
102
103
  rubyforge_project:
103
- rubygems_version: 1.8.25
104
+ rubygems_version: 2.0.0.rc.2
104
105
  signing_key:
105
- specification_version: 3
106
+ specification_version: 4
106
107
  summary: Preprocessor and Selector classes to generate FeatureVectors from Job data
107
108
  test_files:
108
109
  - spec/factories.rb
@@ -111,12 +112,18 @@ test_files:
111
112
  - spec/factories/jobs/tmp3.html
112
113
  - spec/factories/jobs_with_description.rb
113
114
  - spec/factories/jobs_with_title.rb
114
- - spec/preprocessors/id_mapping_spec.rb
115
- - spec/preprocessors/simple_spec.rb
116
- - spec/selectors/n_gram_spec.rb
117
- - spec/selectors/simple_spec.rb
118
- - spec/selectors/with_binary_encoding_spec.rb
119
115
  - spec/spec_helper.rb
120
116
  - spec/support/preprocessor_spec.rb
121
117
  - spec/support/selector_spec.rb
118
+ - spec/svm_helper/parallel_helper_spec.rb
119
+ - spec/svm_helper/preprocessors/id_mapping_spec.rb
120
+ - spec/svm_helper/preprocessors/simple_spec.rb
121
+ - spec/svm_helper/preprocessors/stemming_spec.rb
122
+ - spec/svm_helper/selectors/bi_normal_seperation_spec.rb
123
+ - spec/svm_helper/selectors/bns_ig_spec.rb
124
+ - spec/svm_helper/selectors/calc_spec.rb
125
+ - spec/svm_helper/selectors/information_gain_spec.rb
126
+ - spec/svm_helper/selectors/n_gram_spec.rb
127
+ - spec/svm_helper/selectors/simple_spec.rb
128
+ - spec/svm_helper/selectors/with_binary_encoding_spec.rb
122
129
  has_rdoc:
@@ -1,31 +0,0 @@
1
- require_relative 'simple'
2
- module Selector
3
- #
4
- # Selector which uses a n-gram dictionary to generate feature vectors
5
- #
6
- # @author Andreas Eger
7
- #
8
- class NGram < Selector::Simple
9
- attr_reader :gram_size
10
-
11
- def initialize classification, args={}
12
- super
13
- @gram_size = args.fetch(:gram_size) { 2 }
14
- end
15
-
16
- def label
17
- "ngram"
18
- end
19
- #
20
- # fetches all words snippets from one data entry, removes stopwords and very short words
21
- # @param data [PreprocessedData]
22
- # @param gram_size [Integer] gram size
23
- #
24
- # @return [Array<String>]
25
- def extract_words_from_data data, gram_size=@gram_size
26
- (data.data.flat_map(&:split) - stopwords)
27
- .delete_if { |e| e.size <= 3 }
28
- .each_cons(gram_size).map{|e| e.join " " }
29
- end
30
- end
31
- end
@@ -1,41 +0,0 @@
1
- require_relative 'simple'
2
- module Selector
3
- #
4
- # Selector which uses a n-gram dictionary to generate feature vectors
5
- #
6
- # @author Andreas Eger
7
- #
8
- class WithBinaryEncoding < Selector::Simple
9
-
10
- CLASSIFICATIONS_SIZE = {
11
- function: 8, # max id 255, currently 19
12
- industry: 16, # max id 65535, currently 14370
13
- career_level: 4 } # max id 15, currently 8
14
-
15
- def initialize *args
16
- super
17
- end
18
-
19
- def label
20
- "simple-WithBinaryEncoding"
21
- end
22
-
23
- private
24
- #
25
- # creates the classification specific part of the feature vector
26
- # @param ids [Hash] hash with classification ids
27
- #
28
- # @return [Array<Integer>] binary encoded classification id
29
- def classification_array(id)
30
- number_to_binary_array(id, CLASSIFICATIONS_SIZE[@classification])
31
- end
32
-
33
- def number_to_binary_array(number, size=8)
34
- a=[]
35
- (size-1).downto(0) do |i|
36
- a<<number[i]
37
- end
38
- a
39
- end
40
- end
41
- end