svm_helper 0.1.1 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +1 -0
  3. data/Guardfile +1 -1
  4. data/lib/svm_helper.rb +1 -2
  5. data/lib/svm_helper/parallel_helper.rb +24 -0
  6. data/lib/svm_helper/preprocessors.rb +1 -0
  7. data/lib/svm_helper/preprocessors/simple.rb +31 -22
  8. data/lib/svm_helper/preprocessors/stemming.rb +31 -0
  9. data/lib/svm_helper/selectors.rb +4 -2
  10. data/lib/svm_helper/selectors/bi_normal_seperation.rb +86 -0
  11. data/lib/svm_helper/selectors/bns_ig.rb +50 -0
  12. data/lib/svm_helper/selectors/calc.rb +71 -0
  13. data/lib/svm_helper/selectors/information_gain.rb +49 -0
  14. data/lib/svm_helper/selectors/simple.rb +80 -40
  15. data/lib/svm_helper/stopwords/fr +124 -0
  16. data/lib/svm_helper/version.rb +1 -1
  17. data/spec/factories.rb +4 -3
  18. data/spec/support/selector_spec.rb +2 -4
  19. data/spec/svm_helper/parallel_helper_spec.rb +17 -0
  20. data/spec/{preprocessors → svm_helper/preprocessors}/id_mapping_spec.rb +0 -0
  21. data/spec/{preprocessors → svm_helper/preprocessors}/simple_spec.rb +17 -6
  22. data/spec/svm_helper/preprocessors/stemming_spec.rb +11 -0
  23. data/spec/svm_helper/selectors/bi_normal_seperation_spec.rb +35 -0
  24. data/spec/svm_helper/selectors/bns_ig_spec.rb +5 -0
  25. data/spec/svm_helper/selectors/calc_spec.rb +42 -0
  26. data/spec/svm_helper/selectors/information_gain_spec.rb +5 -0
  27. data/spec/{selectors → svm_helper/selectors}/n_gram_spec.rb +2 -4
  28. data/spec/{selectors → svm_helper/selectors}/simple_spec.rb +15 -4
  29. data/spec/{selectors → svm_helper/selectors}/with_binary_encoding_spec.rb +2 -3
  30. data/svm_helper.gemspec +1 -1
  31. metadata +39 -32
  32. data/lib/svm_helper/selectors/n_gram.rb +0 -31
  33. data/lib/svm_helper/selectors/with_binary_encoding.rb +0 -41
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 377f21c5f2bb4431166019336b71ad3892bc35ac
4
+ data.tar.gz: 02973ce1db9e6720bbe216649b533e7f5b9d35c9
5
+ SHA512:
6
+ metadata.gz: 818e5bdb6fbfb12e3ca7a0a2f19a1dae46c63646ddfb79eccba9cdc3ba5906d13e004c5bea5cef24099ea8c75a04d14a619e13fa9cfe351c9777439d056da2cc
7
+ data.tar.gz: 176044f5c9662e590855152576dee2d4f00da1a7cf123001ed9cbce5eca1624571c90494302390e8860e75ba2f83e158ab986a509b6a58f263f6ef225dfcd0c8
data/Gemfile CHANGED
@@ -21,4 +21,5 @@ group :test do
21
21
  gem 'rake'
22
22
  gem 'mocha', require: 'mocha/api'
23
23
  gem 'factory_girl', '~> 4.0'
24
+ gem 'parallel', require: false
24
25
  end
data/Guardfile CHANGED
@@ -1,7 +1,7 @@
1
1
  guard 'rspec', cli: "--color --format p", all_after_pass: false do
2
2
  # guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
3
3
  watch(%r{^spec/.+_spec\.rb$})
4
- watch(%r{^lib/svm_helper/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
4
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
5
5
  watch('spec/spec_helper.rb') { 'spec' }
6
6
  watch('spec/factories.rb') { 'spec' }
7
7
  watch(%r{^spec/factories/(.+)\.rb}) { 'spec' }
@@ -1,8 +1,7 @@
1
1
  require "svm_helper/version"
2
- require 'parallel'
3
2
 
3
+ require "svm_helper/parallel_helper"
4
4
  require "svm_helper/preprocessed_data"
5
5
  require "svm_helper/feature_vector"
6
6
  require "svm_helper/preprocessors"
7
7
  require "svm_helper/selectors"
8
-
@@ -0,0 +1,24 @@
1
+ module ParallelHelper
2
+ THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
3
+ def p_map_with_index data, &block
4
+ if parallel? && RUBY_PLATFORM == 'java'
5
+ Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
6
+ elsif parallel?
7
+ Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
8
+ else
9
+ data.map.with_index {|e,i| yield e,i }
10
+ end
11
+ end
12
+ def p_map data, &block
13
+ if parallel? && RUBY_PLATFORM == 'java'
14
+ Parallel.map(data, in_threads: THREAD_COUNT ){|e| yield e }
15
+ elsif parallel?
16
+ Parallel.map(data, in_processes: THREAD_COUNT ){|e| yield e }
17
+ else
18
+ data.map {|e| yield e }
19
+ end
20
+ end
21
+ def parallel?
22
+ defined?(Parallel) == 'constant' && @parallel
23
+ end
24
+ end
@@ -1,2 +1,3 @@
1
1
  require_relative 'preprocessors/simple'
2
+ require_relative 'preprocessors/stemming'
2
3
  require_relative 'preprocessors/id_mapping'
@@ -6,7 +6,7 @@ module Preprocessor
6
6
  # @author Andreas Eger
7
7
  #
8
8
  class Simple
9
- THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
9
+ include ::ParallelHelper
10
10
  # filters most gender stuff
11
11
  GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
12
12
  # filters most weird symbols
@@ -25,8 +25,16 @@ module Preprocessor
25
25
  # filter for used job tokens
26
26
  CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
27
27
 
28
+ # stopword file
29
+ #TODO use File.expand_path
30
+ STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
31
+ attr_accessor :language
32
+
33
+
28
34
  def initialize args={}
35
+ @language = args.fetch(:language){'en'}
29
36
  @parallel = args.fetch(:parallel){false}
37
+ @stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split
30
38
  end
31
39
 
32
40
  def label
@@ -48,12 +56,20 @@ module Preprocessor
48
56
  # @return [Array<PreprocessedData>] list of processed job data - or single job data
49
57
  def process jobs
50
58
  if jobs.is_a? Array
51
- process_jobs jobs
59
+ p_map(jobs) {|job| process_job job }
52
60
  else
53
61
  process_job jobs
54
62
  end
55
63
  end
56
64
 
65
+ #
66
+ # loads a txt file with stop words
67
+ # @param location String folder with stopword lists
68
+ #
69
+ # @return [Array<String>] Array of stopwords
70
+ def strip_stopwords(text)
71
+ (text.split - @stopwords).delete_if { |e| e.size <= 2 }
72
+ end
57
73
 
58
74
  #
59
75
  # converts string into a cleaner version
@@ -75,29 +91,22 @@ module Preprocessor
75
91
  #
76
92
  # @return [String] clean and lowercase version of input
77
93
  def clean_description desc
78
- desc.gsub(XML_TAG_FILTER,' ')
79
- .gsub(EMAIL_FILTER,'')
80
- .gsub(URL_FILTER,'')
81
- .gsub(GENDER_FILTER,'')
82
- .gsub(NEW_LINES,'')
83
- .gsub(SYMBOL_FILTER,' ')
84
- .gsub(WHITESPACE,' ')
85
- .gsub(WORDS_IN_BRACKETS, '\1')
86
- .gsub(CODE_TOKEN_FILTER,'')
87
- .downcase
88
- .strip
94
+ strip_stopwords(
95
+ desc.gsub(XML_TAG_FILTER,' ')
96
+ .gsub(EMAIL_FILTER,'')
97
+ .gsub(URL_FILTER,'')
98
+ .gsub(GENDER_FILTER,'')
99
+ .gsub(NEW_LINES,'')
100
+ .gsub(SYMBOL_FILTER,' ')
101
+ .gsub(WHITESPACE,' ')
102
+ .gsub(WORDS_IN_BRACKETS, '\1')
103
+ .gsub(CODE_TOKEN_FILTER,'')
104
+ .downcase
105
+ .strip
106
+ )
89
107
  end
90
108
 
91
109
  private
92
- def process_jobs jobs
93
- if @parallel && RUBY_PLATFORM == 'java'
94
- Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
95
- elsif @parallel
96
- Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
97
- else
98
- jobs.map {|job| process_job job }
99
- end
100
- end
101
110
 
102
111
  def process_job job
103
112
  PreprocessedData.new(
@@ -0,0 +1,31 @@
1
+ require_relative 'simple'
2
+ require 'lingua/stemmer'
3
+ module Preprocessor
4
+ #
5
+ # Preprocessor Base Class
6
+ #
7
+ # @author Andreas Eger
8
+ #
9
+ class Stemming < Simple
10
+
11
+ def initialize(args={})
12
+ super
13
+ @stemmer = Lingua::Stemmer.new(language: @language)
14
+ end
15
+ def label
16
+ "with_stemming"
17
+ end
18
+
19
+ def clean_description desc
20
+ super.map{|w| @stemmer.stem(w) }
21
+ end
22
+ private
23
+ def process_job job
24
+ PreprocessedData.new(
25
+ data: [clean_title(job[:title]), clean_description(job[:description])],
26
+ id: job[:id],
27
+ label: job[:label]
28
+ )
29
+ end
30
+ end
31
+ end
@@ -1,3 +1,5 @@
1
1
  require_relative 'selectors/simple'
2
- require_relative 'selectors/n_gram'
3
- require_relative 'selectors/with_binary_encoding'
2
+ require_relative 'selectors/calc'
3
+ require_relative 'selectors/bi_normal_seperation'
4
+ require_relative 'selectors/information_gain'
5
+ require_relative 'selectors/bns_ig'
@@ -0,0 +1,86 @@
1
+ require_relative 'simple'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class BiNormalSeperation < Selector::Simple
8
+ include BNS
9
+
10
+ def label
11
+ "BiNormalSeperation"
12
+ end
13
+
14
+ def initialize classification, args={}
15
+ super
16
+ @word_selection = args.fetch(:word_selection){ :grams1_2 }
17
+ end
18
+ #
19
+ # generates a list of feature vectors and their labels from preprocessed data
20
+ # @param data_set [Array<PreprocessedData>] list of preprocessed data
21
+ # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
22
+ # @param dictionary_size [Integer] Size of a dictionary to create if non exists
23
+ #
24
+ # @return [Array<FeatureVector>] list of feature vectors and labels
25
+ def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
26
+ words_and_label_per_data = extract_words data_set, true
27
+ generate_global_dictionary words_and_label_per_data, dictionary_size
28
+
29
+ words_per_data = words_and_label_per_data.map(&:features)
30
+ p_map_with_index(words_per_data) do |words,index|
31
+ word_set = words.uniq
32
+ make_vector word_set, data_set[index]
33
+ end
34
+ end
35
+
36
+ #
37
+ # generates a list of words used as dictionary
38
+ # @param all_words (see #extract_words)
39
+ # @param size dictionary size
40
+ #
41
+ # @return [Array<String>] list of words
42
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
43
+ return unless global_dictionary.empty?
44
+
45
+ label_counts = [0,0]
46
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
47
+ label = bag.label ? 1 : 0
48
+ label_counts[label] += 1
49
+ # only count a feature once per bag
50
+ bag.features.uniq.each do |word|
51
+ unless accumulator.has_key?(word)
52
+ accumulator[word] = [0,0]
53
+ end
54
+ accumulator[word][label] += 1
55
+ end
56
+ accumulator
57
+ end
58
+ neg, pos = label_counts
59
+ words = p_map(features) do |word, counts|
60
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
61
+ bns = bi_normal_seperation(pos, neg, *counts)
62
+ [word, bns.abs]
63
+ end
64
+ @global_dictionary = words.compact
65
+ .sort_by{|e| e[1]}
66
+ .last(size)
67
+ .map{|e| e[0] }
68
+ end
69
+
70
+ def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
71
+ words_per_data = extract_words data_set, true
72
+ generate_global_dictionary words_per_data, dictionary_size
73
+ end
74
+ #
75
+ # extracts the words of all provided data entries
76
+ # @param data_set [Array<PreprocessedData>] list of preprocessed data
77
+ # @param keep_label
78
+ #
79
+ # @return [Array<OpenStruct<Array<String>,Boolean>>] list of words per data entry
80
+ def extract_words data_set, keep_label=false
81
+ data_set.map do |data|
82
+ extract_words_from_data data, keep_label
83
+ end
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,50 @@
1
+ require_relative 'bi_normal_seperation'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class BNS_IG < Selector::BiNormalSeperation
8
+ include IG
9
+
10
+ def label
11
+ "BiNormalSeperation_InformationGain"
12
+ end
13
+
14
+ #
15
+ # generates a list of words used as dictionary
16
+ # @param all_words (see #extract_words)
17
+ # @param size dictionary size
18
+ #
19
+ # @return [Array<String>] list of words
20
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
21
+ return unless global_dictionary.empty?
22
+
23
+ label_counts = [0,0]
24
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
25
+ label = bag.label ? 1 : 0
26
+ label_counts[label] += 1
27
+ # only count a feature once per bag
28
+ bag.features.uniq.each do |word|
29
+ unless accumulator.has_key?(word)
30
+ accumulator[word] = [0,0]
31
+ end
32
+ accumulator[word][label] += 1
33
+ end
34
+ accumulator
35
+ end
36
+ neg, pos = label_counts
37
+ words = p_map(features) do |word, counts|
38
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
39
+ bns = bi_normal_seperation(pos, neg, *counts)
40
+ ig = information_gain(pos, neg, *counts)
41
+ # use geometric mean of BNS and IG
42
+ [word, Math.sqrt(bns.abs * ig.abs)]
43
+ end
44
+ @global_dictionary = words.compact
45
+ .sort_by{|e| e[1]}
46
+ .last(size)
47
+ .map{|e| e[0] }
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,71 @@
1
+ module Selector
2
+ module IG
3
+ def information_gain(pos, neg, tp, fp)
4
+ fn = neg - fp
5
+ tn = pos - tp
6
+ p_word = (tp + fp).quo(pos + neg)
7
+
8
+ e(pos, neg) - (p_word * e(tp, fp) + (1 - p_word) * e(fn, tn))
9
+ end
10
+ def e(x,y)
11
+ -xlx(x.quo(x+y)) -xlx(y.quo(x+y))
12
+ end
13
+ def xlx(x)
14
+ x * Math.log2(x)
15
+ end
16
+ end
17
+ module BNS
18
+ SQR2 = Math.sqrt(2)
19
+ SQR2PI = Math.sqrt(2.0*Math::PI)
20
+
21
+ def bi_normal_seperation pos, neg, tp, fp
22
+ false_prositive_rate = fp.quo(neg)
23
+ true_prositive_rate = tp.quo(pos)
24
+ bns = cdf_inverse(true_prositive_rate) - cdf_inverse(false_prositive_rate)
25
+ end
26
+ # standard normal cumulative distribution function
27
+ def cdf(z)
28
+ 0.5 * (1.0 + Math.erf( z.quo(SQR2) ) )
29
+ end
30
+
31
+ # inverse standard normal cumulative distribution function
32
+ # http://home.online.no/~pjacklam/notes/invnorm
33
+
34
+ # Coefficients in rational approximations.
35
+ A = [0, -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00]
36
+ B = [0, -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01]
37
+ C = [0, -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00]
38
+ D = [0, 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00]
39
+ # Define break-points.
40
+ P_LOW = 0.02425
41
+ P_HIGH = 1.0 - P_LOW
42
+
43
+ def cdf_inverse(p)
44
+ return 0.0 if p < 0 || p > 1 || p == 0.5
45
+ x = 0.0
46
+
47
+ if 0.0 < p && p < P_LOW
48
+ # Rational approximation for lower region.
49
+ q = Math.sqrt(-2.0*Math.log(p))
50
+ x = (((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
51
+ ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
52
+ elsif P_LOW <= p && p <= P_HIGH
53
+ # Rational approximation for central region.
54
+ q = p - 0.5
55
+ r = q*q
56
+ x = (((((A[1]*r+A[2])*r+A[3])*r+A[4])*r+A[5])*r+A[6])*q /
57
+ (((((B[1]*r+B[2])*r+B[3])*r+B[4])*r+B[5])*r+1.0)
58
+ elsif P_HIGH < p && p < 1.0
59
+ # Rational approximation for upper region.
60
+ q = Math.sqrt(-2.0*Math.log(1.0-p))
61
+ x = -(((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
62
+ ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
63
+ end
64
+ if 0 < p && p < 1
65
+ u = cdf(p) * SQR2PI * Math.exp((x**2.0)/2.0)
66
+ x = x - u/(1.0 + x*u/2.0)
67
+ end
68
+ x
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,49 @@
1
+ require_relative 'bi_normal_seperation'
2
+ module Selector
3
+ #
4
+ # Feature Selection for Text Classification - HP Labs
5
+ # http://www.google.com/patents/US20040059697
6
+ #
7
+ class InformationGain < Selector::BiNormalSeperation
8
+ include IG
9
+
10
+ def label
11
+ "InformationGain"
12
+ end
13
+
14
+ #
15
+ # generates a list of words used as dictionary
16
+ # @param all_words (see #extract_words)
17
+ # @param size dictionary size
18
+ #
19
+ # @return [Array<String>] list of words
20
+ def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
21
+ return unless global_dictionary.empty?
22
+
23
+ label_counts = [0,0]
24
+ features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
25
+ label = bag.label ? 1 : 0
26
+ label_counts[label] += 1
27
+ # only count a feature once per bag
28
+ bag.features.uniq.each do |word|
29
+ unless accumulator.has_key?(word)
30
+ accumulator[word] = [0,0]
31
+ end
32
+ accumulator[word][label] += 1
33
+ end
34
+ accumulator
35
+ end
36
+ neg, pos = label_counts
37
+ words = p_map(features) do |word, counts|
38
+ next if counts.any? { |e| e==0 } # skip words only appearing in one class
39
+ tp, fp = counts
40
+ ig = information_gain(pos, neg, tp, fp)
41
+ [word, ig.abs]
42
+ end
43
+ @global_dictionary = words.compact
44
+ .sort_by{|e| e[1]}
45
+ .last(size)
46
+ .map{|e| e[0] }
47
+ end
48
+ end
49
+ end
@@ -5,31 +5,20 @@ module Selector
5
5
  # @author Andreas Eger
6
6
  #
7
7
  class Simple
8
- THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
9
- # stopword file
10
- #TODO use File.expand_path
11
- STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
8
+ include ::ParallelHelper
12
9
  # default dictionary size
13
10
  DEFAULT_DICTIONARY_SIZE = 800
14
11
 
15
- CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
16
- { function: Pjpp::Function.count,
17
- industry: Pjpp::Industry.count,
18
- career_level: Pjpp::CareerLevel.count }
19
- else
20
- { function: 19, # 1..19
21
- industry: 632, # 1..14370 but not all ids used
22
- career_level: 8 } # 1..8
23
- end
24
-
25
-
26
-
27
12
  attr_accessor :global_dictionary
28
-
13
+ attr_reader :classification_encoding,
14
+ :gram_size,
15
+ :word_selection
29
16
  def initialize classification, args={}
30
17
  @classification = classification
31
18
  @global_dictionary = args.fetch(:global_dictionary) {[]}
32
- @language = args.fetch(:language){'en'}
19
+ @classification_encoding = args.fetch(:classification_encoding){:bitmap}
20
+ @word_selection = args.fetch(:word_selection){ :single }
21
+ @gram_size = args.fetch(:gram_size) { 1 }
33
22
  @parallel = args.fetch(:parallel){false}
34
23
  end
35
24
 
@@ -48,7 +37,7 @@ module Selector
48
37
  words_per_data = extract_words data_set
49
38
  generate_global_dictionary words_per_data, dictionary_size
50
39
 
51
- make_vectors(words_per_data) do |words,index|
40
+ p_map_with_index(words_per_data) do |words,index|
52
41
  word_set = words.uniq
53
42
  make_vector word_set, data_set[index]
54
43
  end
@@ -66,15 +55,6 @@ module Selector
66
55
  make_vector word_set, data, dictionary
67
56
  end
68
57
 
69
- #
70
- # loads a txt file with stop words
71
- # @param location String folder with stopword lists
72
- #
73
- # @return [Array<String>] Array of stopwords
74
- def stopwords(location=STOPWORD_LOCATION)
75
- @stopwords ||= IO.read(File.join(location,@language)).split
76
- end
77
-
78
58
  #
79
59
  # generates a list of words used as dictionary
80
60
  # @param all_words (see #extract_words)
@@ -90,6 +70,10 @@ module Selector
90
70
  @global_dictionary = words.last(size).map(&:first).reverse
91
71
  end
92
72
 
73
+ def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
74
+ words_per_data = extract_words data_set
75
+ generate_global_dictionary words_per_data, dictionary_size
76
+ end
93
77
  #
94
78
  # extracts the words of all provided data entries
95
79
  # @param data_set [Array<PreprocessedData>] list of preprocessed data
@@ -107,7 +91,46 @@ module Selector
107
91
  #
108
92
  # @return [Array<String>] list of words
109
93
  def extract_words_from_data data
110
- (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
94
+ words = (data.data.flat_map(&:split) - stopwords)
95
+ .delete_if { |e| e.size <= 2 }
96
+ if gram_size > 1
97
+ words = words.each_cons(@gram_size).map{|e| e.join " " }
98
+ end
99
+ words
100
+ end
101
+
102
+ #
103
+ # fetches all words and two word phrases from one data entry, removes stopwords and very short words
104
+ # @param data [PreprocessedData] preprocessed data entry
105
+ # @param keep_label
106
+ #
107
+ # @return [OpenStruct<Array<String>,Boolean>] list of words
108
+ def extract_words_from_data data, keep_label=false
109
+ # assume the first token is the title and preserve it
110
+ title, *words = data.data.flatten
111
+ features = case word_selection
112
+ when :grams
113
+ words.each_cons(@gram_size).map{|e| e.join " " }
114
+ when :grams1_2
115
+ words + words.each_cons(2).map{|e| e.join " " }
116
+ when :grams1_2_3
117
+ words +
118
+ words.each_cons(2).map{|e| e.join " " } +
119
+ words.each_cons(3).map{|e| e.join " " }
120
+ when :grams1_2_3_4
121
+ words +
122
+ words.each_cons(2).map{|e| e.join " " } +
123
+ words.each_cons(3).map{|e| e.join " " } +
124
+ words.each_cons(4).map{|e| e.join " " }
125
+ else
126
+ words
127
+ end
128
+ features.unshift(title)
129
+ return features unless keep_label
130
+ OpenStruct.new(
131
+ features: features,
132
+ label: data.label
133
+ )
111
134
  end
112
135
 
113
136
  def reset classification
@@ -135,23 +158,40 @@ module Selector
135
158
  )
136
159
  end
137
160
 
138
- def make_vectors data, &block
139
- if @parallel && RUBY_PLATFORM == 'java'
140
- Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
141
- elsif @parallel
142
- Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
143
- else
144
- data.map.with_index {|e,i| yield e,i }
145
- end
146
- end
161
+ BITMAP_ARRAY_SIZES= if defined?(Pjpp) == 'constant'
162
+ { function: Pjpp::Function.count,
163
+ industry: Pjpp::Industry.count,
164
+ career_level: Pjpp::CareerLevel.count }
165
+ else
166
+ { function: 19, # 1..19
167
+ industry: 632, # 1..14370 but not all ids used
168
+ career_level: 8 } # 1..8
169
+ end
147
170
 
171
+ BINARY_ARRAY_SIZES = {
172
+ function: 8, # max id 255, currently 19
173
+ industry: 16, # max id 65535, currently 14370
174
+ career_level: 4 } # max id 15, currently 8
148
175
  #
149
176
  # creates the classification specific part of the feature vector
150
177
  # @param ids [Hash] hash with classification ids
151
178
  #
152
179
  # @return [Array<Integer>] list of size=count(classifcation_ids) with only one not zero item
153
180
  def classification_array(id)
154
- Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0}
181
+ case @classification_encoding
182
+ when :binary
183
+ number_to_binary_array(id, BINARY_ARRAY_SIZES[@classification])
184
+ else # :bitmap
185
+ Array.new(BITMAP_ARRAY_SIZES[@classification]){|n| n==(id-1) ? 1 : 0}
186
+ end
187
+ end
188
+
189
+ def number_to_binary_array(number, size=8)
190
+ a=[]
191
+ (size-1).downto(0) do |i|
192
+ a<<number[i]
193
+ end
194
+ a
155
195
  end
156
196
  end
157
- end
197
+ end
@@ -0,0 +1,124 @@
1
+ alors
2
+ au
3
+ aucuns
4
+ aussi
5
+ autre
6
+ avant
7
+ avec
8
+ avoir
9
+ bon
10
+ car
11
+ ce
12
+ cela
13
+ ces
14
+ ceux
15
+ chaque
16
+ ci
17
+ comme
18
+ comment
19
+ dans
20
+ des
21
+ du
22
+ dedans
23
+ dehors
24
+ depuis
25
+ deux
26
+ devrait
27
+ doit
28
+ donc
29
+ dos
30
+ droite
31
+ début
32
+ elle
33
+ elles
34
+ en
35
+ encore
36
+ essai
37
+ est
38
+ et
39
+ eu
40
+ fait
41
+ faites
42
+ fois
43
+ font
44
+ force
45
+ haut
46
+ hors
47
+ ici
48
+ il
49
+ ils
50
+ je juste
51
+ la
52
+ le
53
+ les
54
+ leur
55
+
56
+ ma
57
+ maintenant
58
+ mais
59
+ mes
60
+ mine
61
+ moins
62
+ mon
63
+ mot
64
+ même
65
+ ni
66
+ nommés
67
+ notre
68
+ nous
69
+ nouveaux
70
+ ou
71
+
72
+ par
73
+ parce
74
+ parole
75
+ pas
76
+ personnes
77
+ peut
78
+ peu
79
+ pièce
80
+ plupart
81
+ pour
82
+ pourquoi
83
+ quand
84
+ que
85
+ quel
86
+ quelle
87
+ quelles
88
+ quels
89
+ qui
90
+ sa
91
+ sans
92
+ ses
93
+ seulement
94
+ si
95
+ sien
96
+ son
97
+ sont
98
+ sous
99
+ soyez sujet
100
+ sur
101
+ ta
102
+ tandis
103
+ tellement
104
+ tels
105
+ tes
106
+ ton
107
+ tous
108
+ tout
109
+ trop
110
+ très
111
+ tu
112
+ valeur
113
+ voie
114
+ voient
115
+ vont
116
+ votre
117
+ vous
118
+ vu
119
+ ça
120
+ étaient
121
+ état
122
+ étions
123
+ été
124
+ être
@@ -1,3 +1,3 @@
1
1
  module SvmHelper
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.1"
3
3
  end
@@ -18,14 +18,15 @@ FactoryGirl.define do
18
18
 
19
19
 
20
20
  factory :data, class: PreprocessedData do
21
- data ["haus fooo garten baaz pferd fooo"]
21
+ data %w(haus fooo garten baaz pferd fooo)
22
22
  id 7
23
23
  label true
24
24
  end
25
25
  factory :data_w_short_words, parent: :data do
26
- data ["auto foo pferd bz gooo fooo 2"]
26
+ data %w(auto pferd gooo fooo)
27
+ label false
27
28
  end
28
29
  factory :data_w_multiple_sections, parent: :data do
29
- data ["meeh foo auto","bz baaz fooo 2"]
30
+ data [%w(meeh auto),%w(baaz fooo)]
30
31
  end
31
32
  end
@@ -13,9 +13,7 @@ shared_examples_for 'a selector' do
13
13
  [0,1].should include(e)
14
14
  end
15
15
  end
16
- it "should be able to process multiple data entries at once" do
17
- selector.generate_vectors([data]).each do |e|
18
- e.should == selector.generate_vector(data)
19
- end
16
+ it "should respond to generate_vectors" do
17
+ selector.should respond_to(:generate_vectors)
20
18
  end
21
19
  end
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+ require 'parallel'
3
+
4
+ include ParallelHelper
5
+ describe ParallelHelper do
6
+ let(:data) { (1..20).to_a }
7
+ context "parallel map" do
8
+ it "should return as a normal map" do
9
+ p_map(data){|e| e**2 }.should == data.map{|e| e**2 }
10
+ end
11
+ end
12
+ context "parallel map with index" do
13
+ it "should return as a normal map with index" do
14
+ p_map_with_index(data){|e,i| e*i }.should == data.map.with_index{|e,i| e*i }
15
+ end
16
+ end
17
+ end
@@ -41,6 +41,7 @@ describe Preprocessor::Simple do
41
41
  end
42
42
  end
43
43
 
44
+
44
45
  context "#clean_title" do
45
46
  it "should be downcased" do
46
47
  job = FactoryGirl.build(:job_title_downcasing)
@@ -75,31 +76,41 @@ describe Preprocessor::Simple do
75
76
  FactoryGirl.build(:job_description_w_code_token),
76
77
  FactoryGirl.build(:job_description_w_gender) ]
77
78
  }
79
+ it "should call strip_stopwords" do
80
+ simple.expects(:strip_stopwords)
81
+ simple.clean_description(jobs[0][:description])
82
+ end
78
83
  it "should remove html/xml tags" do
79
- desc = simple.clean_description(jobs[0][:description])
84
+ desc = simple.clean_description(jobs[0][:description]).join ' '
80
85
  desc.should_not match(/<(.*?)>/)
81
86
  end
82
87
  it "should remove new lines" do
83
- desc = simple.clean_description(jobs[0][:description])
88
+ desc = simple.clean_description(jobs[0][:description]).join ' '
84
89
  desc.should_not match(/\r\n|\n|\r/)
85
90
  end
86
91
  it "should remove all special characters" do
87
- desc = simple.clean_description(jobs[2][:description])
92
+ desc = simple.clean_description(jobs[2][:description]).join ' '
88
93
  desc.should_not match(/[^a-z öäü]/i)
89
94
  end
90
95
  it "should remove gender tokens" do
91
- desc = simple.clean_description(jobs[3][:description])
96
+ desc = simple.clean_description(jobs[3][:description]).join ' '
92
97
  desc.should_not match(%r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)})
93
98
  end
94
99
  it "should remove job code token" do
95
- desc = simple.clean_description(jobs[4][:description])
100
+ desc = simple.clean_description(jobs[4][:description]).join ' '
96
101
  desc.should_not match(/\[.*\]|\(.*\)|\{.*\}|\d+\w+/)
97
102
  end
98
103
  it "should be downcased" do
99
- desc = simple.clean_description(jobs[2][:description])
104
+ desc = simple.clean_description(jobs[2][:description]).join ' '
100
105
  desc.should_not match(/[^a-z öäü]/)
101
106
  end
102
107
  end
108
+
109
+ context "strip_stopwords" do
110
+ it "should remove words like 'and' from the text" do
111
+ simple.strip_stopwords("Dogs and cats").should == %w(Dogs cats)
112
+ end
113
+ end
103
114
  context "parallel" do
104
115
  let(:parallel) { Preprocessor::Simple.new(parallel: true) }
105
116
 
@@ -0,0 +1,11 @@
1
+ require 'spec_helper'
2
+
3
+ describe Preprocessor::Stemming do
4
+ it_behaves_like 'a preprocessor'
5
+ let(:preprocessor) { Preprocessor::Stemming.new }
6
+ let(:job) { FactoryGirl.build(:job) }
7
+ let(:jobs) { [job] }
8
+ it "should reduce words to their stem" do
9
+ preprocessor.clean_description("developer engineering").should == %w(develop engin)
10
+ end
11
+ end
@@ -0,0 +1,35 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::BiNormalSeperation do
4
+ it_behaves_like 'a selector'
5
+
6
+ let(:bns) { Selector::BiNormalSeperation.new(:function) }
7
+ context "#extract_words_from_data" do
8
+ it "should generate a list of words from the data" do
9
+ words = bns.extract_words_from_data(FactoryGirl.build(:data))
10
+ words.should have(10).things
11
+ end
12
+ it "should remove words with 3 characters or less" do
13
+ words = bns.extract_words_from_data(FactoryGirl.build(:data_w_short_words))
14
+ words.should have(6).things
15
+ end
16
+ it "should process multiple sections in the data" do
17
+ words = bns.extract_words_from_data(FactoryGirl.build(:data_w_multiple_sections))
18
+ words.should have(6).things
19
+ end
20
+ end
21
+ context "#generate_global_dictionary" do
22
+ let(:data) { [FactoryGirl.build_list(:data,1),
23
+ FactoryGirl.build_list(:data_w_short_words,4),
24
+ FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
25
+ let(:words_per_data) { bns.extract_words(data,true) }
26
+ it "should return a list of n words" do
27
+ bns.generate_global_dictionary(words_per_data,2)
28
+ bns.global_dictionary.should have(2).things
29
+ end
30
+ it "should return a list of the n most used words in the data array" do
31
+ bns.generate_global_dictionary(words_per_data,3)
32
+ bns.global_dictionary.should eq(%w(fooo auto pferd))
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,5 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::BNS_IG do
4
+ it_behaves_like 'a selector'
5
+ end
@@ -0,0 +1,42 @@
1
+ require 'spec_helper'
2
+
3
+ # just some very basic test to make sure these functions do not fail
4
+ describe "Calc" do
5
+ include Selector::IG
6
+ include Selector::BNS
7
+ let(:test_data){ [
8
+ [34, 23, 28, 17],
9
+ [31, 17, 23, 12],
10
+ [44, 39, 41, 36],
11
+ [44, 23, 41, 23],
12
+ [44, 39, 0, 36],
13
+ [44, 39, 41, 0],
14
+ [62, 81, 15, 73]
15
+ ]}
16
+
17
+ context Selector::IG do
18
+ it "should not fail" do
19
+ test_data.each do |data|
20
+ ->{information_gain(*data)}.should_not raise_error
21
+ end
22
+ end
23
+ it "should return some values" do
24
+ test_data.each do |data|
25
+ information_gain(*data).should be_a(Numeric)
26
+ end
27
+ end
28
+ end
29
+
30
+ context Selector::BNS do
31
+ it "should not fail" do
32
+ test_data.each do |data|
33
+ ->{bi_normal_seperation(*data)}.should_not raise_error
34
+ end
35
+ end
36
+ it "should return some values" do
37
+ test_data.each do |data|
38
+ bi_normal_seperation(*data).should be_a(Numeric)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,5 @@
1
+ require "spec_helper"
2
+
3
+ describe Selector::InformationGain do
4
+ it_behaves_like 'a selector'
5
+ end
@@ -1,9 +1,7 @@
1
1
  require "spec_helper"
2
2
 
3
- describe Selector::NGram do
4
- it_behaves_like 'a selector'
5
-
6
- let(:ngram) { Selector::NGram.new(:function, gram_size: 3) }
3
+ describe "n-grams" do
4
+ let(:ngram) { Selector::Simple.new(:function, word_selection: :grams, gram_size: 3) }
7
5
  context "#extract_words_from_data" do
8
6
  it "should generate a list of words from the data" do
9
7
  words = ngram.extract_words_from_data(FactoryGirl.build(:data))
@@ -7,9 +7,6 @@ describe Selector::Simple do
7
7
  it "should have select_feature_vector implemented" do
8
8
  expect { simple.generate_vectors([]) }.to_not raise_error
9
9
  end
10
- context "#stopwords" do
11
- it "simply loads them from a file"
12
- end
13
10
  context "#extract_words_from_data" do
14
11
  it "should generate a list of words from the data" do
15
12
  words = simple.extract_words_from_data(FactoryGirl.build(:data))
@@ -50,6 +47,19 @@ describe Selector::Simple do
50
47
  simple.global_dictionary.should eq(%w(fooo auto baaz))
51
48
  end
52
49
  end
50
+ context "#build_dictionary" do
51
+ let(:data) { [FactoryGirl.build_list(:data,1),
52
+ FactoryGirl.build_list(:data_w_short_words,2),
53
+ FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
54
+ it "should return a list of n words" do
55
+ simple.build_dictionary(data,2)
56
+ simple.global_dictionary.should have(2).things
57
+ end
58
+ it "should return a list of the n most used words in the data array" do
59
+ simple.build_dictionary(data,3)
60
+ simple.global_dictionary.should eq(%w(fooo auto baaz))
61
+ end
62
+ end
53
63
  context "#generate_vector" do
54
64
  let(:dictionary) { %w(auto pferd haus hase garten) }
55
65
  let(:data) { FactoryGirl.build(:data) }
@@ -109,6 +119,7 @@ describe Selector::Simple do
109
119
  context "parallel" do
110
120
  let(:parallel) { Selector::Simple.new(:function, parallel: true) }
111
121
  before(:each) do
122
+ require 'parallel'
112
123
  simple.stubs(:global_dictionary).returns(dictionary)
113
124
  parallel.stubs(:global_dictionary).returns(dictionary)
114
125
  end
@@ -119,4 +130,4 @@ describe Selector::Simple do
119
130
  end
120
131
  end
121
132
  end
122
- end
133
+ end
@@ -1,8 +1,7 @@
1
1
  require "spec_helper"
2
2
 
3
- describe Selector::WithBinaryEncoding do
4
- it_behaves_like 'a selector'
5
- let(:simple) { Selector::WithBinaryEncoding.new(:career_level) }
3
+ describe "binary encoded classification" do
4
+ let(:simple) { Selector::Simple.new(:career_level, classification_encoding: :binary) }
6
5
 
7
6
  let(:dictionary) { %w(auto pferd haus hase garten) }
8
7
  let(:data) { FactoryGirl.build(:data) }
@@ -17,5 +17,5 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
- gem.add_dependency('parallel', '~> 0.6.2')
20
+ gem.add_dependency "ruby-stemmer"
21
21
  end
metadata CHANGED
@@ -1,32 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: svm_helper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
5
- prerelease:
4
+ version: 0.2.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Andreas Eger
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-03-15 00:00:00.000000000 Z
11
+ date: 2013-04-25 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
- name: parallel
14
+ name: ruby-stemmer
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - '>='
20
18
  - !ruby/object:Gem::Version
21
- version: 0.6.2
19
+ version: '0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ~>
24
+ - - '>='
28
25
  - !ruby/object:Gem::Version
29
- version: 0.6.2
26
+ version: '0'
30
27
  description: Shared helper classes for usage in context of SVM at experteer
31
28
  email:
32
29
  - dev@eger-andreas.de
@@ -48,16 +45,21 @@ files:
48
45
  - lib/svm_helper.rb
49
46
  - lib/svm_helper/feature_vector.rb
50
47
  - lib/svm_helper/interface_helper.rb
48
+ - lib/svm_helper/parallel_helper.rb
51
49
  - lib/svm_helper/preprocessed_data.rb
52
50
  - lib/svm_helper/preprocessors.rb
53
51
  - lib/svm_helper/preprocessors/id_mapping.rb
54
52
  - lib/svm_helper/preprocessors/simple.rb
53
+ - lib/svm_helper/preprocessors/stemming.rb
55
54
  - lib/svm_helper/selectors.rb
56
- - lib/svm_helper/selectors/n_gram.rb
55
+ - lib/svm_helper/selectors/bi_normal_seperation.rb
56
+ - lib/svm_helper/selectors/bns_ig.rb
57
+ - lib/svm_helper/selectors/calc.rb
58
+ - lib/svm_helper/selectors/information_gain.rb
57
59
  - lib/svm_helper/selectors/simple.rb
58
- - lib/svm_helper/selectors/with_binary_encoding.rb
59
60
  - lib/svm_helper/stopwords/de
60
61
  - lib/svm_helper/stopwords/en
62
+ - lib/svm_helper/stopwords/fr
61
63
  - lib/svm_helper/version.rb
62
64
  - spec/factories.rb
63
65
  - spec/factories/jobs/tmp.html
@@ -65,44 +67,43 @@ files:
65
67
  - spec/factories/jobs/tmp3.html
66
68
  - spec/factories/jobs_with_description.rb
67
69
  - spec/factories/jobs_with_title.rb
68
- - spec/preprocessors/id_mapping_spec.rb
69
- - spec/preprocessors/simple_spec.rb
70
- - spec/selectors/n_gram_spec.rb
71
- - spec/selectors/simple_spec.rb
72
- - spec/selectors/with_binary_encoding_spec.rb
73
70
  - spec/spec_helper.rb
74
71
  - spec/support/preprocessor_spec.rb
75
72
  - spec/support/selector_spec.rb
73
+ - spec/svm_helper/parallel_helper_spec.rb
74
+ - spec/svm_helper/preprocessors/id_mapping_spec.rb
75
+ - spec/svm_helper/preprocessors/simple_spec.rb
76
+ - spec/svm_helper/preprocessors/stemming_spec.rb
77
+ - spec/svm_helper/selectors/bi_normal_seperation_spec.rb
78
+ - spec/svm_helper/selectors/bns_ig_spec.rb
79
+ - spec/svm_helper/selectors/calc_spec.rb
80
+ - spec/svm_helper/selectors/information_gain_spec.rb
81
+ - spec/svm_helper/selectors/n_gram_spec.rb
82
+ - spec/svm_helper/selectors/simple_spec.rb
83
+ - spec/svm_helper/selectors/with_binary_encoding_spec.rb
76
84
  - svm_helper.gemspec
77
85
  homepage: https://github.com/sch1zo/svm_helper
78
86
  licenses: []
87
+ metadata: {}
79
88
  post_install_message:
80
89
  rdoc_options: []
81
90
  require_paths:
82
91
  - lib
83
92
  required_ruby_version: !ruby/object:Gem::Requirement
84
- none: false
85
93
  requirements:
86
94
  - - '>='
87
95
  - !ruby/object:Gem::Version
88
96
  version: '0'
89
- segments:
90
- - 0
91
- hash: 2037039748537332986
92
97
  required_rubygems_version: !ruby/object:Gem::Requirement
93
- none: false
94
98
  requirements:
95
99
  - - '>='
96
100
  - !ruby/object:Gem::Version
97
101
  version: '0'
98
- segments:
99
- - 0
100
- hash: 2037039748537332986
101
102
  requirements: []
102
103
  rubyforge_project:
103
- rubygems_version: 1.8.25
104
+ rubygems_version: 2.0.0.rc.2
104
105
  signing_key:
105
- specification_version: 3
106
+ specification_version: 4
106
107
  summary: Preprocessor and Selector classes to generate FeatureVectors from Job data
107
108
  test_files:
108
109
  - spec/factories.rb
@@ -111,12 +112,18 @@ test_files:
111
112
  - spec/factories/jobs/tmp3.html
112
113
  - spec/factories/jobs_with_description.rb
113
114
  - spec/factories/jobs_with_title.rb
114
- - spec/preprocessors/id_mapping_spec.rb
115
- - spec/preprocessors/simple_spec.rb
116
- - spec/selectors/n_gram_spec.rb
117
- - spec/selectors/simple_spec.rb
118
- - spec/selectors/with_binary_encoding_spec.rb
119
115
  - spec/spec_helper.rb
120
116
  - spec/support/preprocessor_spec.rb
121
117
  - spec/support/selector_spec.rb
118
+ - spec/svm_helper/parallel_helper_spec.rb
119
+ - spec/svm_helper/preprocessors/id_mapping_spec.rb
120
+ - spec/svm_helper/preprocessors/simple_spec.rb
121
+ - spec/svm_helper/preprocessors/stemming_spec.rb
122
+ - spec/svm_helper/selectors/bi_normal_seperation_spec.rb
123
+ - spec/svm_helper/selectors/bns_ig_spec.rb
124
+ - spec/svm_helper/selectors/calc_spec.rb
125
+ - spec/svm_helper/selectors/information_gain_spec.rb
126
+ - spec/svm_helper/selectors/n_gram_spec.rb
127
+ - spec/svm_helper/selectors/simple_spec.rb
128
+ - spec/svm_helper/selectors/with_binary_encoding_spec.rb
122
129
  has_rdoc:
@@ -1,31 +0,0 @@
1
- require_relative 'simple'
2
- module Selector
3
- #
4
- # Selector which uses a n-gram dictionary to generate feature vectors
5
- #
6
- # @author Andreas Eger
7
- #
8
- class NGram < Selector::Simple
9
- attr_reader :gram_size
10
-
11
- def initialize classification, args={}
12
- super
13
- @gram_size = args.fetch(:gram_size) { 2 }
14
- end
15
-
16
- def label
17
- "ngram"
18
- end
19
- #
20
- # fetches all words snippets from one data entry, removes stopwords and very short words
21
- # @param data [PreprocessedData]
22
- # @param gram_size [Integer] gram size
23
- #
24
- # @return [Array<String>]
25
- def extract_words_from_data data, gram_size=@gram_size
26
- (data.data.flat_map(&:split) - stopwords)
27
- .delete_if { |e| e.size <= 3 }
28
- .each_cons(gram_size).map{|e| e.join " " }
29
- end
30
- end
31
- end
@@ -1,41 +0,0 @@
1
- require_relative 'simple'
2
- module Selector
3
- #
4
- # Selector which uses a n-gram dictionary to generate feature vectors
5
- #
6
- # @author Andreas Eger
7
- #
8
- class WithBinaryEncoding < Selector::Simple
9
-
10
- CLASSIFICATIONS_SIZE = {
11
- function: 8, # max id 255, currently 19
12
- industry: 16, # max id 65535, currently 14370
13
- career_level: 4 } # max id 15, currently 8
14
-
15
- def initialize *args
16
- super
17
- end
18
-
19
- def label
20
- "simple-WithBinaryEncoding"
21
- end
22
-
23
- private
24
- #
25
- # creates the classification specific part of the feature vector
26
- # @param ids [Hash] hash with classification ids
27
- #
28
- # @return [Array<Integer>] binary encoded classification id
29
- def classification_array(id)
30
- number_to_binary_array(id, CLASSIFICATIONS_SIZE[@classification])
31
- end
32
-
33
- def number_to_binary_array(number, size=8)
34
- a=[]
35
- (size-1).downto(0) do |i|
36
- a<<number[i]
37
- end
38
- a
39
- end
40
- end
41
- end