svm_helper 0.1.1 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +1 -0
- data/Guardfile +1 -1
- data/lib/svm_helper.rb +1 -2
- data/lib/svm_helper/parallel_helper.rb +24 -0
- data/lib/svm_helper/preprocessors.rb +1 -0
- data/lib/svm_helper/preprocessors/simple.rb +31 -22
- data/lib/svm_helper/preprocessors/stemming.rb +31 -0
- data/lib/svm_helper/selectors.rb +4 -2
- data/lib/svm_helper/selectors/bi_normal_seperation.rb +86 -0
- data/lib/svm_helper/selectors/bns_ig.rb +50 -0
- data/lib/svm_helper/selectors/calc.rb +71 -0
- data/lib/svm_helper/selectors/information_gain.rb +49 -0
- data/lib/svm_helper/selectors/simple.rb +80 -40
- data/lib/svm_helper/stopwords/fr +124 -0
- data/lib/svm_helper/version.rb +1 -1
- data/spec/factories.rb +4 -3
- data/spec/support/selector_spec.rb +2 -4
- data/spec/svm_helper/parallel_helper_spec.rb +17 -0
- data/spec/{preprocessors → svm_helper/preprocessors}/id_mapping_spec.rb +0 -0
- data/spec/{preprocessors → svm_helper/preprocessors}/simple_spec.rb +17 -6
- data/spec/svm_helper/preprocessors/stemming_spec.rb +11 -0
- data/spec/svm_helper/selectors/bi_normal_seperation_spec.rb +35 -0
- data/spec/svm_helper/selectors/bns_ig_spec.rb +5 -0
- data/spec/svm_helper/selectors/calc_spec.rb +42 -0
- data/spec/svm_helper/selectors/information_gain_spec.rb +5 -0
- data/spec/{selectors → svm_helper/selectors}/n_gram_spec.rb +2 -4
- data/spec/{selectors → svm_helper/selectors}/simple_spec.rb +15 -4
- data/spec/{selectors → svm_helper/selectors}/with_binary_encoding_spec.rb +2 -3
- data/svm_helper.gemspec +1 -1
- metadata +39 -32
- data/lib/svm_helper/selectors/n_gram.rb +0 -31
- data/lib/svm_helper/selectors/with_binary_encoding.rb +0 -41
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 377f21c5f2bb4431166019336b71ad3892bc35ac
|
4
|
+
data.tar.gz: 02973ce1db9e6720bbe216649b533e7f5b9d35c9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 818e5bdb6fbfb12e3ca7a0a2f19a1dae46c63646ddfb79eccba9cdc3ba5906d13e004c5bea5cef24099ea8c75a04d14a619e13fa9cfe351c9777439d056da2cc
|
7
|
+
data.tar.gz: 176044f5c9662e590855152576dee2d4f00da1a7cf123001ed9cbce5eca1624571c90494302390e8860e75ba2f83e158ab986a509b6a58f263f6ef225dfcd0c8
|
data/Gemfile
CHANGED
data/Guardfile
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
guard 'rspec', cli: "--color --format p", all_after_pass: false do
|
2
2
|
# guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
|
3
3
|
watch(%r{^spec/.+_spec\.rb$})
|
4
|
-
watch(%r{^lib/
|
4
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
|
5
5
|
watch('spec/spec_helper.rb') { 'spec' }
|
6
6
|
watch('spec/factories.rb') { 'spec' }
|
7
7
|
watch(%r{^spec/factories/(.+)\.rb}) { 'spec' }
|
data/lib/svm_helper.rb
CHANGED
@@ -0,0 +1,24 @@
|
|
1
|
+
module ParallelHelper
|
2
|
+
THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
|
3
|
+
def p_map_with_index data, &block
|
4
|
+
if parallel? && RUBY_PLATFORM == 'java'
|
5
|
+
Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
|
6
|
+
elsif parallel?
|
7
|
+
Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
|
8
|
+
else
|
9
|
+
data.map.with_index {|e,i| yield e,i }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
def p_map data, &block
|
13
|
+
if parallel? && RUBY_PLATFORM == 'java'
|
14
|
+
Parallel.map(data, in_threads: THREAD_COUNT ){|e| yield e }
|
15
|
+
elsif parallel?
|
16
|
+
Parallel.map(data, in_processes: THREAD_COUNT ){|e| yield e }
|
17
|
+
else
|
18
|
+
data.map {|e| yield e }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
def parallel?
|
22
|
+
defined?(Parallel) == 'constant' && @parallel
|
23
|
+
end
|
24
|
+
end
|
@@ -6,7 +6,7 @@ module Preprocessor
|
|
6
6
|
# @author Andreas Eger
|
7
7
|
#
|
8
8
|
class Simple
|
9
|
-
|
9
|
+
include ::ParallelHelper
|
10
10
|
# filters most gender stuff
|
11
11
|
GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
|
12
12
|
# filters most weird symbols
|
@@ -25,8 +25,16 @@ module Preprocessor
|
|
25
25
|
# filter for used job tokens
|
26
26
|
CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
|
27
27
|
|
28
|
+
# stopword file
|
29
|
+
#TODO use File.expand_path
|
30
|
+
STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
|
31
|
+
attr_accessor :language
|
32
|
+
|
33
|
+
|
28
34
|
def initialize args={}
|
35
|
+
@language = args.fetch(:language){'en'}
|
29
36
|
@parallel = args.fetch(:parallel){false}
|
37
|
+
@stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split
|
30
38
|
end
|
31
39
|
|
32
40
|
def label
|
@@ -48,12 +56,20 @@ module Preprocessor
|
|
48
56
|
# @return [Array<PreprocessedData>] list of processed job data - or single job data
|
49
57
|
def process jobs
|
50
58
|
if jobs.is_a? Array
|
51
|
-
|
59
|
+
p_map(jobs) {|job| process_job job }
|
52
60
|
else
|
53
61
|
process_job jobs
|
54
62
|
end
|
55
63
|
end
|
56
64
|
|
65
|
+
#
|
66
|
+
# removes stopwords and words of two characters or fewer from a text
|
67
|
+
# @param text [String] whitespace-separated text to filter
|
68
|
+
#
|
69
|
+
# @return [Array<String>] remaining words of the text, in their original order
|
70
|
+
def strip_stopwords(text)
|
71
|
+
(text.split - @stopwords).delete_if { |e| e.size <= 2 }
|
72
|
+
end
|
57
73
|
|
58
74
|
#
|
59
75
|
# converts string into a cleaner version
|
@@ -75,29 +91,22 @@ module Preprocessor
|
|
75
91
|
#
|
76
92
|
# @return [String] clean and lowercase version of input
|
77
93
|
def clean_description desc
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
94
|
+
strip_stopwords(
|
95
|
+
desc.gsub(XML_TAG_FILTER,' ')
|
96
|
+
.gsub(EMAIL_FILTER,'')
|
97
|
+
.gsub(URL_FILTER,'')
|
98
|
+
.gsub(GENDER_FILTER,'')
|
99
|
+
.gsub(NEW_LINES,'')
|
100
|
+
.gsub(SYMBOL_FILTER,' ')
|
101
|
+
.gsub(WHITESPACE,' ')
|
102
|
+
.gsub(WORDS_IN_BRACKETS, '\1')
|
103
|
+
.gsub(CODE_TOKEN_FILTER,'')
|
104
|
+
.downcase
|
105
|
+
.strip
|
106
|
+
)
|
89
107
|
end
|
90
108
|
|
91
109
|
private
|
92
|
-
def process_jobs jobs
|
93
|
-
if @parallel && RUBY_PLATFORM == 'java'
|
94
|
-
Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
|
95
|
-
elsif @parallel
|
96
|
-
Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
|
97
|
-
else
|
98
|
-
jobs.map {|job| process_job job }
|
99
|
-
end
|
100
|
-
end
|
101
110
|
|
102
111
|
def process_job job
|
103
112
|
PreprocessedData.new(
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
require 'lingua/stemmer'
|
3
|
+
module Preprocessor
|
4
|
+
#
|
5
|
+
# Preprocessor that extends Simple by stemming the words of the cleaned description
|
6
|
+
#
|
7
|
+
# @author Andreas Eger
|
8
|
+
#
|
9
|
+
class Stemming < Simple
|
10
|
+
|
11
|
+
def initialize(args={})
|
12
|
+
super
|
13
|
+
@stemmer = Lingua::Stemmer.new(language: @language)
|
14
|
+
end
|
15
|
+
def label
|
16
|
+
"with_stemming"
|
17
|
+
end
|
18
|
+
|
19
|
+
def clean_description desc
|
20
|
+
super.map{|w| @stemmer.stem(w) }
|
21
|
+
end
|
22
|
+
private
|
23
|
+
def process_job job
|
24
|
+
PreprocessedData.new(
|
25
|
+
data: [clean_title(job[:title]), clean_description(job[:description])],
|
26
|
+
id: job[:id],
|
27
|
+
label: job[:label]
|
28
|
+
)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
data/lib/svm_helper/selectors.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
1
|
require_relative 'selectors/simple'
|
2
|
-
require_relative 'selectors/
|
3
|
-
require_relative 'selectors/
|
2
|
+
require_relative 'selectors/calc'
|
3
|
+
require_relative 'selectors/bi_normal_seperation'
|
4
|
+
require_relative 'selectors/information_gain'
|
5
|
+
require_relative 'selectors/bns_ig'
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Selector
|
3
|
+
#
|
4
|
+
# Feature Selection for Text Classification - HP Labs
|
5
|
+
# http://www.google.com/patents/US20040059697
|
6
|
+
#
|
7
|
+
class BiNormalSeperation < Selector::Simple
|
8
|
+
include BNS
|
9
|
+
|
10
|
+
def label
|
11
|
+
"BiNormalSeperation"
|
12
|
+
end
|
13
|
+
|
14
|
+
def initialize classification, args={}
|
15
|
+
super
|
16
|
+
@word_selection = args.fetch(:word_selection){ :grams1_2 }
|
17
|
+
end
|
18
|
+
#
|
19
|
+
# generates a list of feature vectors and their labels from preprocessed data
|
20
|
+
# @param data_set [Array<PreprocessedData>] list of preprocessed data
|
21
|
+
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
22
|
+
# @param dictionary_size [Integer] Size of a dictionary to create if none exists
|
23
|
+
#
|
24
|
+
# @return [Array<FeatureVector>] list of feature vectors and labels
|
25
|
+
def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
|
26
|
+
words_and_label_per_data = extract_words data_set, true
|
27
|
+
generate_global_dictionary words_and_label_per_data, dictionary_size
|
28
|
+
|
29
|
+
words_per_data = words_and_label_per_data.map(&:features)
|
30
|
+
p_map_with_index(words_per_data) do |words,index|
|
31
|
+
word_set = words.uniq
|
32
|
+
make_vector word_set, data_set[index]
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# generates a list of words used as dictionary
|
38
|
+
# @param all_words (see #extract_words)
|
39
|
+
# @param size dictionary size
|
40
|
+
#
|
41
|
+
# @return [Array<String>] list of words
|
42
|
+
def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
|
43
|
+
return unless global_dictionary.empty?
|
44
|
+
|
45
|
+
label_counts = [0,0]
|
46
|
+
features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
|
47
|
+
label = bag.label ? 1 : 0
|
48
|
+
label_counts[label] += 1
|
49
|
+
# only count a feature once per bag
|
50
|
+
bag.features.uniq.each do |word|
|
51
|
+
unless accumulator.has_key?(word)
|
52
|
+
accumulator[word] = [0,0]
|
53
|
+
end
|
54
|
+
accumulator[word][label] += 1
|
55
|
+
end
|
56
|
+
accumulator
|
57
|
+
end
|
58
|
+
neg, pos = label_counts
|
59
|
+
words = p_map(features) do |word, counts|
|
60
|
+
next if counts.any? { |e| e==0 } # skip words only appearing in one class
|
61
|
+
bns = bi_normal_seperation(pos, neg, *counts)
|
62
|
+
[word, bns.abs]
|
63
|
+
end
|
64
|
+
@global_dictionary = words.compact
|
65
|
+
.sort_by{|e| e[1]}
|
66
|
+
.last(size)
|
67
|
+
.map{|e| e[0] }
|
68
|
+
end
|
69
|
+
|
70
|
+
def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
|
71
|
+
words_per_data = extract_words data_set, true
|
72
|
+
generate_global_dictionary words_per_data, dictionary_size
|
73
|
+
end
|
74
|
+
#
|
75
|
+
# extracts the words of all provided data entries
|
76
|
+
# @param data_set [Array<PreprocessedData>] list of preprocessed data
|
77
|
+
# @param keep_label
|
78
|
+
#
|
79
|
+
# @return [Array<OpenStruct<Array<String>,Boolean>>] list of words per data entry
|
80
|
+
def extract_words data_set, keep_label=false
|
81
|
+
data_set.map do |data|
|
82
|
+
extract_words_from_data data, keep_label
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require_relative 'bi_normal_seperation'
|
2
|
+
module Selector
|
3
|
+
#
|
4
|
+
# Feature Selection for Text Classification - HP Labs
|
5
|
+
# http://www.google.com/patents/US20040059697
|
6
|
+
#
|
7
|
+
class BNS_IG < Selector::BiNormalSeperation
|
8
|
+
include IG
|
9
|
+
|
10
|
+
def label
|
11
|
+
"BiNormalSeperation_InformationGain"
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# generates a list of words used as dictionary
|
16
|
+
# @param all_words (see #extract_words)
|
17
|
+
# @param size dictionary size
|
18
|
+
#
|
19
|
+
# @return [Array<String>] list of words
|
20
|
+
def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
|
21
|
+
return unless global_dictionary.empty?
|
22
|
+
|
23
|
+
label_counts = [0,0]
|
24
|
+
features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
|
25
|
+
label = bag.label ? 1 : 0
|
26
|
+
label_counts[label] += 1
|
27
|
+
# only count a feature once per bag
|
28
|
+
bag.features.uniq.each do |word|
|
29
|
+
unless accumulator.has_key?(word)
|
30
|
+
accumulator[word] = [0,0]
|
31
|
+
end
|
32
|
+
accumulator[word][label] += 1
|
33
|
+
end
|
34
|
+
accumulator
|
35
|
+
end
|
36
|
+
neg, pos = label_counts
|
37
|
+
words = p_map(features) do |word, counts|
|
38
|
+
next if counts.any? { |e| e==0 } # skip words only appearing in one class
|
39
|
+
bns = bi_normal_seperation(pos, neg, *counts)
|
40
|
+
ig = information_gain(pos, neg, *counts)
|
41
|
+
# use geometric mean of BNS and IG
|
42
|
+
[word, Math.sqrt(bns.abs * ig.abs)]
|
43
|
+
end
|
44
|
+
@global_dictionary = words.compact
|
45
|
+
.sort_by{|e| e[1]}
|
46
|
+
.last(size)
|
47
|
+
.map{|e| e[0] }
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Selector
|
2
|
+
module IG
|
3
|
+
def information_gain(pos, neg, tp, fp)
|
4
|
+
fn = neg - fp
|
5
|
+
tn = pos - tp
|
6
|
+
p_word = (tp + fp).quo(pos + neg)
|
7
|
+
|
8
|
+
e(pos, neg) - (p_word * e(tp, fp) + (1 - p_word) * e(fn, tn))
|
9
|
+
end
|
10
|
+
def e(x,y)
|
11
|
+
-xlx(x.quo(x+y)) -xlx(y.quo(x+y))
|
12
|
+
end
|
13
|
+
def xlx(x)
|
14
|
+
x * Math.log2(x)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
module BNS
|
18
|
+
SQR2 = Math.sqrt(2)
|
19
|
+
SQR2PI = Math.sqrt(2.0*Math::PI)
|
20
|
+
|
21
|
+
def bi_normal_seperation pos, neg, tp, fp
|
22
|
+
false_prositive_rate = fp.quo(neg)
|
23
|
+
true_prositive_rate = tp.quo(pos)
|
24
|
+
bns = cdf_inverse(true_prositive_rate) - cdf_inverse(false_prositive_rate)
|
25
|
+
end
|
26
|
+
# standard normal cumulative distribution function
|
27
|
+
def cdf(z)
|
28
|
+
0.5 * (1.0 + Math.erf( z.quo(SQR2) ) )
|
29
|
+
end
|
30
|
+
|
31
|
+
# inverse standard normal cumulative distribution function
|
32
|
+
# http://home.online.no/~pjacklam/notes/invnorm
|
33
|
+
|
34
|
+
# Coefficients in rational approximations.
|
35
|
+
A = [0, -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00]
|
36
|
+
B = [0, -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01]
|
37
|
+
C = [0, -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00]
|
38
|
+
D = [0, 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00]
|
39
|
+
# Define break-points.
|
40
|
+
P_LOW = 0.02425
|
41
|
+
P_HIGH = 1.0 - P_LOW
|
42
|
+
|
43
|
+
def cdf_inverse(p)
|
44
|
+
return 0.0 if p < 0 || p > 1 || p == 0.5
|
45
|
+
x = 0.0
|
46
|
+
|
47
|
+
if 0.0 < p && p < P_LOW
|
48
|
+
# Rational approximation for lower region.
|
49
|
+
q = Math.sqrt(-2.0*Math.log(p))
|
50
|
+
x = (((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
|
51
|
+
((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
|
52
|
+
elsif P_LOW <= p && p <= P_HIGH
|
53
|
+
# Rational approximation for central region.
|
54
|
+
q = p - 0.5
|
55
|
+
r = q*q
|
56
|
+
x = (((((A[1]*r+A[2])*r+A[3])*r+A[4])*r+A[5])*r+A[6])*q /
|
57
|
+
(((((B[1]*r+B[2])*r+B[3])*r+B[4])*r+B[5])*r+1.0)
|
58
|
+
elsif P_HIGH < p && p < 1.0
|
59
|
+
# Rational approximation for upper region.
|
60
|
+
q = Math.sqrt(-2.0*Math.log(1.0-p))
|
61
|
+
x = -(((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
|
62
|
+
((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
|
63
|
+
end
|
64
|
+
if 0 < p && p < 1
|
65
|
+
u = cdf(p) * SQR2PI * Math.exp((x**2.0)/2.0)
|
66
|
+
x = x - u/(1.0 + x*u/2.0)
|
67
|
+
end
|
68
|
+
x
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require_relative 'bi_normal_seperation'
|
2
|
+
module Selector
|
3
|
+
#
|
4
|
+
# Feature Selection for Text Classification - HP Labs
|
5
|
+
# http://www.google.com/patents/US20040059697
|
6
|
+
#
|
7
|
+
class InformationGain < Selector::BiNormalSeperation
|
8
|
+
include IG
|
9
|
+
|
10
|
+
def label
|
11
|
+
"InformationGain"
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# generates a list of words used as dictionary
|
16
|
+
# @param all_words (see #extract_words)
|
17
|
+
# @param size dictionary size
|
18
|
+
#
|
19
|
+
# @return [Array<String>] list of words
|
20
|
+
def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
|
21
|
+
return unless global_dictionary.empty?
|
22
|
+
|
23
|
+
label_counts = [0,0]
|
24
|
+
features = all_words.reduce(Hash.new { |h, k| h[k] = [0,0] }) do |accumulator, bag|
|
25
|
+
label = bag.label ? 1 : 0
|
26
|
+
label_counts[label] += 1
|
27
|
+
# only count a feature once per bag
|
28
|
+
bag.features.uniq.each do |word|
|
29
|
+
unless accumulator.has_key?(word)
|
30
|
+
accumulator[word] = [0,0]
|
31
|
+
end
|
32
|
+
accumulator[word][label] += 1
|
33
|
+
end
|
34
|
+
accumulator
|
35
|
+
end
|
36
|
+
neg, pos = label_counts
|
37
|
+
words = p_map(features) do |word, counts|
|
38
|
+
next if counts.any? { |e| e==0 } # skip words only appearing in one class
|
39
|
+
tp, fp = counts
|
40
|
+
ig = information_gain(pos, neg, tp, fp)
|
41
|
+
[word, ig.abs]
|
42
|
+
end
|
43
|
+
@global_dictionary = words.compact
|
44
|
+
.sort_by{|e| e[1]}
|
45
|
+
.last(size)
|
46
|
+
.map{|e| e[0] }
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -5,31 +5,20 @@ module Selector
|
|
5
5
|
# @author Andreas Eger
|
6
6
|
#
|
7
7
|
class Simple
|
8
|
-
|
9
|
-
# stopword file
|
10
|
-
#TODO use File.expand_path
|
11
|
-
STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
|
8
|
+
include ::ParallelHelper
|
12
9
|
# default dictionary size
|
13
10
|
DEFAULT_DICTIONARY_SIZE = 800
|
14
11
|
|
15
|
-
CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
|
16
|
-
{ function: Pjpp::Function.count,
|
17
|
-
industry: Pjpp::Industry.count,
|
18
|
-
career_level: Pjpp::CareerLevel.count }
|
19
|
-
else
|
20
|
-
{ function: 19, # 1..19
|
21
|
-
industry: 632, # 1..14370 but not all ids used
|
22
|
-
career_level: 8 } # 1..8
|
23
|
-
end
|
24
|
-
|
25
|
-
|
26
|
-
|
27
12
|
attr_accessor :global_dictionary
|
28
|
-
|
13
|
+
attr_reader :classification_encoding,
|
14
|
+
:gram_size,
|
15
|
+
:word_selection
|
29
16
|
def initialize classification, args={}
|
30
17
|
@classification = classification
|
31
18
|
@global_dictionary = args.fetch(:global_dictionary) {[]}
|
32
|
-
@
|
19
|
+
@classification_encoding = args.fetch(:classification_encoding){:bitmap}
|
20
|
+
@word_selection = args.fetch(:word_selection){ :single }
|
21
|
+
@gram_size = args.fetch(:gram_size) { 1 }
|
33
22
|
@parallel = args.fetch(:parallel){false}
|
34
23
|
end
|
35
24
|
|
@@ -48,7 +37,7 @@ module Selector
|
|
48
37
|
words_per_data = extract_words data_set
|
49
38
|
generate_global_dictionary words_per_data, dictionary_size
|
50
39
|
|
51
|
-
|
40
|
+
p_map_with_index(words_per_data) do |words,index|
|
52
41
|
word_set = words.uniq
|
53
42
|
make_vector word_set, data_set[index]
|
54
43
|
end
|
@@ -66,15 +55,6 @@ module Selector
|
|
66
55
|
make_vector word_set, data, dictionary
|
67
56
|
end
|
68
57
|
|
69
|
-
#
|
70
|
-
# loads a txt file with stop words
|
71
|
-
# @param location String folder with stopword lists
|
72
|
-
#
|
73
|
-
# @return [Array<String>] Array of stopwords
|
74
|
-
def stopwords(location=STOPWORD_LOCATION)
|
75
|
-
@stopwords ||= IO.read(File.join(location,@language)).split
|
76
|
-
end
|
77
|
-
|
78
58
|
#
|
79
59
|
# generates a list of words used as dictionary
|
80
60
|
# @param all_words (see #extract_words)
|
@@ -90,6 +70,10 @@ module Selector
|
|
90
70
|
@global_dictionary = words.last(size).map(&:first).reverse
|
91
71
|
end
|
92
72
|
|
73
|
+
def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
|
74
|
+
words_per_data = extract_words data_set
|
75
|
+
generate_global_dictionary words_per_data, dictionary_size
|
76
|
+
end
|
93
77
|
#
|
94
78
|
# extracts the words of all provided data entries
|
95
79
|
# @param data_set [Array<PreprocessedData>] list of preprocessed data
|
@@ -107,7 +91,46 @@ module Selector
|
|
107
91
|
#
|
108
92
|
# @return [Array<String>] list of words
|
109
93
|
def extract_words_from_data data
|
110
|
-
(data.data.flat_map(&:split) - stopwords)
|
94
|
+
words = (data.data.flat_map(&:split) - stopwords)
|
95
|
+
.delete_if { |e| e.size <= 2 }
|
96
|
+
if gram_size > 1
|
97
|
+
words = words.each_cons(@gram_size).map{|e| e.join " " }
|
98
|
+
end
|
99
|
+
words
|
100
|
+
end
|
101
|
+
|
102
|
+
#
|
103
|
+
# builds the feature list for one data entry: the first token (assumed to be the title) followed by the words or n-grams chosen by word_selection
|
104
|
+
# @param data [PreprocessedData] preprocessed data entry
|
105
|
+
# @param keep_label
|
106
|
+
#
|
107
|
+
# @return [OpenStruct<Array<String>,Boolean>] list of words
|
108
|
+
def extract_words_from_data data, keep_label=false
|
109
|
+
# assume the first token is the title and preserve it
|
110
|
+
title, *words = data.data.flatten
|
111
|
+
features = case word_selection
|
112
|
+
when :grams
|
113
|
+
words.each_cons(@gram_size).map{|e| e.join " " }
|
114
|
+
when :grams1_2
|
115
|
+
words + words.each_cons(2).map{|e| e.join " " }
|
116
|
+
when :grams1_2_3
|
117
|
+
words +
|
118
|
+
words.each_cons(2).map{|e| e.join " " } +
|
119
|
+
words.each_cons(3).map{|e| e.join " " }
|
120
|
+
when :grams1_2_3_4
|
121
|
+
words +
|
122
|
+
words.each_cons(2).map{|e| e.join " " } +
|
123
|
+
words.each_cons(3).map{|e| e.join " " } +
|
124
|
+
words.each_cons(4).map{|e| e.join " " }
|
125
|
+
else
|
126
|
+
words
|
127
|
+
end
|
128
|
+
features.unshift(title)
|
129
|
+
return features unless keep_label
|
130
|
+
OpenStruct.new(
|
131
|
+
features: features,
|
132
|
+
label: data.label
|
133
|
+
)
|
111
134
|
end
|
112
135
|
|
113
136
|
def reset classification
|
@@ -135,23 +158,40 @@ module Selector
|
|
135
158
|
)
|
136
159
|
end
|
137
160
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
161
|
+
BITMAP_ARRAY_SIZES= if defined?(Pjpp) == 'constant'
|
162
|
+
{ function: Pjpp::Function.count,
|
163
|
+
industry: Pjpp::Industry.count,
|
164
|
+
career_level: Pjpp::CareerLevel.count }
|
165
|
+
else
|
166
|
+
{ function: 19, # 1..19
|
167
|
+
industry: 632, # 1..14370 but not all ids used
|
168
|
+
career_level: 8 } # 1..8
|
169
|
+
end
|
147
170
|
|
171
|
+
BINARY_ARRAY_SIZES = {
|
172
|
+
function: 8, # max id 255, currently 19
|
173
|
+
industry: 16, # max id 65535, currently 14370
|
174
|
+
career_level: 4 } # max id 15, currently 8
|
148
175
|
#
|
149
176
|
# creates the classification specific part of the feature vector
|
150
177
|
# @param ids [Hash] hash with classification ids
|
151
178
|
#
|
152
179
|
# @return [Array<Integer>] list of size=count(classification_ids) with only one not zero item
|
153
180
|
def classification_array(id)
|
154
|
-
|
181
|
+
case @classification_encoding
|
182
|
+
when :binary
|
183
|
+
number_to_binary_array(id, BINARY_ARRAY_SIZES[@classification])
|
184
|
+
else # :bitmap
|
185
|
+
Array.new(BITMAP_ARRAY_SIZES[@classification]){|n| n==(id-1) ? 1 : 0}
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
def number_to_binary_array(number, size=8)
|
190
|
+
a=[]
|
191
|
+
(size-1).downto(0) do |i|
|
192
|
+
a<<number[i]
|
193
|
+
end
|
194
|
+
a
|
155
195
|
end
|
156
196
|
end
|
157
|
-
end
|
197
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
alors
|
2
|
+
au
|
3
|
+
aucuns
|
4
|
+
aussi
|
5
|
+
autre
|
6
|
+
avant
|
7
|
+
avec
|
8
|
+
avoir
|
9
|
+
bon
|
10
|
+
car
|
11
|
+
ce
|
12
|
+
cela
|
13
|
+
ces
|
14
|
+
ceux
|
15
|
+
chaque
|
16
|
+
ci
|
17
|
+
comme
|
18
|
+
comment
|
19
|
+
dans
|
20
|
+
des
|
21
|
+
du
|
22
|
+
dedans
|
23
|
+
dehors
|
24
|
+
depuis
|
25
|
+
deux
|
26
|
+
devrait
|
27
|
+
doit
|
28
|
+
donc
|
29
|
+
dos
|
30
|
+
droite
|
31
|
+
début
|
32
|
+
elle
|
33
|
+
elles
|
34
|
+
en
|
35
|
+
encore
|
36
|
+
essai
|
37
|
+
est
|
38
|
+
et
|
39
|
+
eu
|
40
|
+
fait
|
41
|
+
faites
|
42
|
+
fois
|
43
|
+
font
|
44
|
+
force
|
45
|
+
haut
|
46
|
+
hors
|
47
|
+
ici
|
48
|
+
il
|
49
|
+
ils
|
50
|
+
je juste
|
51
|
+
la
|
52
|
+
le
|
53
|
+
les
|
54
|
+
leur
|
55
|
+
là
|
56
|
+
ma
|
57
|
+
maintenant
|
58
|
+
mais
|
59
|
+
mes
|
60
|
+
mine
|
61
|
+
moins
|
62
|
+
mon
|
63
|
+
mot
|
64
|
+
même
|
65
|
+
ni
|
66
|
+
nommés
|
67
|
+
notre
|
68
|
+
nous
|
69
|
+
nouveaux
|
70
|
+
ou
|
71
|
+
où
|
72
|
+
par
|
73
|
+
parce
|
74
|
+
parole
|
75
|
+
pas
|
76
|
+
personnes
|
77
|
+
peut
|
78
|
+
peu
|
79
|
+
pièce
|
80
|
+
plupart
|
81
|
+
pour
|
82
|
+
pourquoi
|
83
|
+
quand
|
84
|
+
que
|
85
|
+
quel
|
86
|
+
quelle
|
87
|
+
quelles
|
88
|
+
quels
|
89
|
+
qui
|
90
|
+
sa
|
91
|
+
sans
|
92
|
+
ses
|
93
|
+
seulement
|
94
|
+
si
|
95
|
+
sien
|
96
|
+
son
|
97
|
+
sont
|
98
|
+
sous
|
99
|
+
soyez sujet
|
100
|
+
sur
|
101
|
+
ta
|
102
|
+
tandis
|
103
|
+
tellement
|
104
|
+
tels
|
105
|
+
tes
|
106
|
+
ton
|
107
|
+
tous
|
108
|
+
tout
|
109
|
+
trop
|
110
|
+
très
|
111
|
+
tu
|
112
|
+
valeur
|
113
|
+
voie
|
114
|
+
voient
|
115
|
+
vont
|
116
|
+
votre
|
117
|
+
vous
|
118
|
+
vu
|
119
|
+
ça
|
120
|
+
étaient
|
121
|
+
état
|
122
|
+
étions
|
123
|
+
été
|
124
|
+
être
|
data/lib/svm_helper/version.rb
CHANGED
data/spec/factories.rb
CHANGED
@@ -18,14 +18,15 @@ FactoryGirl.define do
|
|
18
18
|
|
19
19
|
|
20
20
|
factory :data, class: PreprocessedData do
|
21
|
-
data
|
21
|
+
data %w(haus fooo garten baaz pferd fooo)
|
22
22
|
id 7
|
23
23
|
label true
|
24
24
|
end
|
25
25
|
factory :data_w_short_words, parent: :data do
|
26
|
-
data
|
26
|
+
data %w(auto pferd gooo fooo)
|
27
|
+
label false
|
27
28
|
end
|
28
29
|
factory :data_w_multiple_sections, parent: :data do
|
29
|
-
data [
|
30
|
+
data [%w(meeh auto),%w(baaz fooo)]
|
30
31
|
end
|
31
32
|
end
|
@@ -13,9 +13,7 @@ shared_examples_for 'a selector' do
|
|
13
13
|
[0,1].should include(e)
|
14
14
|
end
|
15
15
|
end
|
16
|
-
it "should
|
17
|
-
selector.generate_vectors
|
18
|
-
e.should == selector.generate_vector(data)
|
19
|
-
end
|
16
|
+
it "should respond to generate_vectors" do
|
17
|
+
selector.should respond_to(:generate_vectors)
|
20
18
|
end
|
21
19
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'parallel'
|
3
|
+
|
4
|
+
include ParallelHelper
|
5
|
+
describe ParallelHelper do
|
6
|
+
let(:data) { (1..20).to_a }
|
7
|
+
context "parallel map" do
|
8
|
+
it "should return as a normal map" do
|
9
|
+
p_map(data){|e| e**2 }.should == data.map{|e| e**2 }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
context "parallel map with index" do
|
13
|
+
it "should return as a normal map with index" do
|
14
|
+
p_map_with_index(data){|e,i| e*i }.should == data.map.with_index{|e,i| e*i }
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
File without changes
|
@@ -41,6 +41,7 @@ describe Preprocessor::Simple do
|
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
44
|
+
|
44
45
|
context "#clean_title" do
|
45
46
|
it "should be downcased" do
|
46
47
|
job = FactoryGirl.build(:job_title_downcasing)
|
@@ -75,31 +76,41 @@ describe Preprocessor::Simple do
|
|
75
76
|
FactoryGirl.build(:job_description_w_code_token),
|
76
77
|
FactoryGirl.build(:job_description_w_gender) ]
|
77
78
|
}
|
79
|
+
it "should call strip_stopwords" do
|
80
|
+
simple.expects(:strip_stopwords)
|
81
|
+
simple.clean_description(jobs[0][:description])
|
82
|
+
end
|
78
83
|
it "should remove html/xml tags" do
|
79
|
-
desc = simple.clean_description(jobs[0][:description])
|
84
|
+
desc = simple.clean_description(jobs[0][:description]).join ' '
|
80
85
|
desc.should_not match(/<(.*?)>/)
|
81
86
|
end
|
82
87
|
it "should remove new lines" do
|
83
|
-
desc = simple.clean_description(jobs[0][:description])
|
88
|
+
desc = simple.clean_description(jobs[0][:description]).join ' '
|
84
89
|
desc.should_not match(/\r\n|\n|\r/)
|
85
90
|
end
|
86
91
|
it "should remove all special characters" do
|
87
|
-
desc = simple.clean_description(jobs[2][:description])
|
92
|
+
desc = simple.clean_description(jobs[2][:description]).join ' '
|
88
93
|
desc.should_not match(/[^a-z öäü]/i)
|
89
94
|
end
|
90
95
|
it "should remove gender tokens" do
|
91
|
-
desc = simple.clean_description(jobs[3][:description])
|
96
|
+
desc = simple.clean_description(jobs[3][:description]).join ' '
|
92
97
|
desc.should_not match(%r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)})
|
93
98
|
end
|
94
99
|
it "should remove job code token" do
|
95
|
-
desc = simple.clean_description(jobs[4][:description])
|
100
|
+
desc = simple.clean_description(jobs[4][:description]).join ' '
|
96
101
|
desc.should_not match(/\[.*\]|\(.*\)|\{.*\}|\d+\w+/)
|
97
102
|
end
|
98
103
|
it "should be downcased" do
|
99
|
-
desc = simple.clean_description(jobs[2][:description])
|
104
|
+
desc = simple.clean_description(jobs[2][:description]).join ' '
|
100
105
|
desc.should_not match(/[^a-z öäü]/)
|
101
106
|
end
|
102
107
|
end
|
108
|
+
|
109
|
+
context "strip_stopwords" do
|
110
|
+
it "should remove words like 'and' from the text" do
|
111
|
+
simple.strip_stopwords("Dogs and cats").should == %w(Dogs cats)
|
112
|
+
end
|
113
|
+
end
|
103
114
|
context "parallel" do
|
104
115
|
let(:parallel) { Preprocessor::Simple.new(parallel: true) }
|
105
116
|
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Preprocessor::Stemming do
|
4
|
+
it_behaves_like 'a preprocessor'
|
5
|
+
let(:preprocessor) { Preprocessor::Stemming.new }
|
6
|
+
let(:job) { FactoryGirl.build(:job) }
|
7
|
+
let(:jobs) { [job] }
|
8
|
+
it "should reduce words to their stem" do
|
9
|
+
preprocessor.clean_description("developer engineering").should == %w(develop engin)
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe Selector::BiNormalSeperation do
|
4
|
+
it_behaves_like 'a selector'
|
5
|
+
|
6
|
+
let(:bns) { Selector::BiNormalSeperation.new(:function) }
|
7
|
+
context "#extract_words_from_data" do
|
8
|
+
it "should generate a list of words from the data" do
|
9
|
+
words = bns.extract_words_from_data(FactoryGirl.build(:data))
|
10
|
+
words.should have(10).things
|
11
|
+
end
|
12
|
+
it "should remove words with 3 characters or less" do
|
13
|
+
words = bns.extract_words_from_data(FactoryGirl.build(:data_w_short_words))
|
14
|
+
words.should have(6).things
|
15
|
+
end
|
16
|
+
it "should process multiple sections in the data" do
|
17
|
+
words = bns.extract_words_from_data(FactoryGirl.build(:data_w_multiple_sections))
|
18
|
+
words.should have(6).things
|
19
|
+
end
|
20
|
+
end
|
21
|
+
context "#generate_global_dictionary" do
|
22
|
+
let(:data) { [FactoryGirl.build_list(:data,1),
|
23
|
+
FactoryGirl.build_list(:data_w_short_words,4),
|
24
|
+
FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
|
25
|
+
let(:words_per_data) { bns.extract_words(data,true) }
|
26
|
+
it "should return a list of n words" do
|
27
|
+
bns.generate_global_dictionary(words_per_data,2)
|
28
|
+
bns.global_dictionary.should have(2).things
|
29
|
+
end
|
30
|
+
it "should return a list of the n most used words in the data array" do
|
31
|
+
bns.generate_global_dictionary(words_per_data,3)
|
32
|
+
bns.global_dictionary.should eq(%w(fooo auto pferd))
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# just some very basic test to make sure these functions do not fail
|
4
|
+
describe "Calc" do
|
5
|
+
include Selector::IG
|
6
|
+
include Selector::BNS
|
7
|
+
let(:test_data){ [
|
8
|
+
[34, 23, 28, 17],
|
9
|
+
[31, 17, 23, 12],
|
10
|
+
[44, 39, 41, 36],
|
11
|
+
[44, 23, 41, 23],
|
12
|
+
[44, 39, 0, 36],
|
13
|
+
[44, 39, 41, 0],
|
14
|
+
[62, 81, 15, 73]
|
15
|
+
]}
|
16
|
+
|
17
|
+
context Selector::IG do
|
18
|
+
it "should not fail" do
|
19
|
+
test_data.each do |data|
|
20
|
+
->{information_gain(*data)}.should_not raise_error
|
21
|
+
end
|
22
|
+
end
|
23
|
+
it "should return some values" do
|
24
|
+
test_data.each do |data|
|
25
|
+
information_gain(*data).should be_a(Numeric)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
context Selector::BNS do
|
31
|
+
it "should not fail" do
|
32
|
+
test_data.each do |data|
|
33
|
+
->{bi_normal_seperation(*data)}.should_not raise_error
|
34
|
+
end
|
35
|
+
end
|
36
|
+
it "should return some values" do
|
37
|
+
test_data.each do |data|
|
38
|
+
bi_normal_seperation(*data).should be_a(Numeric)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -1,9 +1,7 @@
|
|
1
1
|
require "spec_helper"
|
2
2
|
|
3
|
-
describe Selector::NGram do
|
4
|
-
|
5
|
-
|
6
|
-
let(:ngram) { Selector::NGram.new(:function, gram_size: 3) }
|
3
|
+
describe "n-grams" do
|
4
|
+
let(:ngram) { Selector::Simple.new(:function, word_selection: :grams, gram_size: 3) }
|
7
5
|
context "#extract_words_from_data" do
|
8
6
|
it "should generate a list of words from the data" do
|
9
7
|
words = ngram.extract_words_from_data(FactoryGirl.build(:data))
|
@@ -7,9 +7,6 @@ describe Selector::Simple do
|
|
7
7
|
it "should have select_feature_vector implemented" do
|
8
8
|
expect { simple.generate_vectors([]) }.to_not raise_error
|
9
9
|
end
|
10
|
-
context "#stopwords" do
|
11
|
-
it "simply loads them from a file"
|
12
|
-
end
|
13
10
|
context "#extract_words_from_data" do
|
14
11
|
it "should generate a list of words from the data" do
|
15
12
|
words = simple.extract_words_from_data(FactoryGirl.build(:data))
|
@@ -50,6 +47,19 @@ describe Selector::Simple do
|
|
50
47
|
simple.global_dictionary.should eq(%w(fooo auto baaz))
|
51
48
|
end
|
52
49
|
end
|
50
|
+
context "#build_dictionary" do
|
51
|
+
let(:data) { [FactoryGirl.build_list(:data,1),
|
52
|
+
FactoryGirl.build_list(:data_w_short_words,2),
|
53
|
+
FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
|
54
|
+
it "should return a list of n words" do
|
55
|
+
simple.build_dictionary(data,2)
|
56
|
+
simple.global_dictionary.should have(2).things
|
57
|
+
end
|
58
|
+
it "should return a list of the n most used words in the data array" do
|
59
|
+
simple.build_dictionary(data,3)
|
60
|
+
simple.global_dictionary.should eq(%w(fooo auto baaz))
|
61
|
+
end
|
62
|
+
end
|
53
63
|
context "#generate_vector" do
|
54
64
|
let(:dictionary) { %w(auto pferd haus hase garten) }
|
55
65
|
let(:data) { FactoryGirl.build(:data) }
|
@@ -109,6 +119,7 @@ describe Selector::Simple do
|
|
109
119
|
context "parallel" do
|
110
120
|
let(:parallel) { Selector::Simple.new(:function, parallel: true) }
|
111
121
|
before(:each) do
|
122
|
+
require 'parallel'
|
112
123
|
simple.stubs(:global_dictionary).returns(dictionary)
|
113
124
|
parallel.stubs(:global_dictionary).returns(dictionary)
|
114
125
|
end
|
@@ -119,4 +130,4 @@ describe Selector::Simple do
|
|
119
130
|
end
|
120
131
|
end
|
121
132
|
end
|
122
|
-
end
|
133
|
+
end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
require "spec_helper"
|
2
2
|
|
3
|
-
describe Selector::WithBinaryEncoding do
|
4
|
-
|
5
|
-
let(:simple) { Selector::WithBinaryEncoding.new(:career_level) }
|
3
|
+
describe "binary encoded classification" do
|
4
|
+
let(:simple) { Selector::Simple.new(:career_level, classification_encoding: :binary) }
|
6
5
|
|
7
6
|
let(:dictionary) { %w(auto pferd haus hase garten) }
|
8
7
|
let(:data) { FactoryGirl.build(:data) }
|
data/svm_helper.gemspec
CHANGED
metadata
CHANGED
@@ -1,32 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: svm_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
5
|
-
prerelease:
|
4
|
+
version: 0.2.1
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Andreas Eger
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-04-25 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
14
|
+
name: ruby-stemmer
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version: 0
|
19
|
+
version: '0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - '>='
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version: 0
|
26
|
+
version: '0'
|
30
27
|
description: Shared helper classes for usage in context of SVM at experteer
|
31
28
|
email:
|
32
29
|
- dev@eger-andreas.de
|
@@ -48,16 +45,21 @@ files:
|
|
48
45
|
- lib/svm_helper.rb
|
49
46
|
- lib/svm_helper/feature_vector.rb
|
50
47
|
- lib/svm_helper/interface_helper.rb
|
48
|
+
- lib/svm_helper/parallel_helper.rb
|
51
49
|
- lib/svm_helper/preprocessed_data.rb
|
52
50
|
- lib/svm_helper/preprocessors.rb
|
53
51
|
- lib/svm_helper/preprocessors/id_mapping.rb
|
54
52
|
- lib/svm_helper/preprocessors/simple.rb
|
53
|
+
- lib/svm_helper/preprocessors/stemming.rb
|
55
54
|
- lib/svm_helper/selectors.rb
|
56
|
-
- lib/svm_helper/selectors/n_gram.rb
|
55
|
+
- lib/svm_helper/selectors/bi_normal_seperation.rb
|
56
|
+
- lib/svm_helper/selectors/bns_ig.rb
|
57
|
+
- lib/svm_helper/selectors/calc.rb
|
58
|
+
- lib/svm_helper/selectors/information_gain.rb
|
57
59
|
- lib/svm_helper/selectors/simple.rb
|
58
|
-
- lib/svm_helper/selectors/with_binary_encoding.rb
|
59
60
|
- lib/svm_helper/stopwords/de
|
60
61
|
- lib/svm_helper/stopwords/en
|
62
|
+
- lib/svm_helper/stopwords/fr
|
61
63
|
- lib/svm_helper/version.rb
|
62
64
|
- spec/factories.rb
|
63
65
|
- spec/factories/jobs/tmp.html
|
@@ -65,44 +67,43 @@ files:
|
|
65
67
|
- spec/factories/jobs/tmp3.html
|
66
68
|
- spec/factories/jobs_with_description.rb
|
67
69
|
- spec/factories/jobs_with_title.rb
|
68
|
-
- spec/preprocessors/id_mapping_spec.rb
|
69
|
-
- spec/preprocessors/simple_spec.rb
|
70
|
-
- spec/selectors/n_gram_spec.rb
|
71
|
-
- spec/selectors/simple_spec.rb
|
72
|
-
- spec/selectors/with_binary_encoding_spec.rb
|
73
70
|
- spec/spec_helper.rb
|
74
71
|
- spec/support/preprocessor_spec.rb
|
75
72
|
- spec/support/selector_spec.rb
|
73
|
+
- spec/svm_helper/parallel_helper_spec.rb
|
74
|
+
- spec/svm_helper/preprocessors/id_mapping_spec.rb
|
75
|
+
- spec/svm_helper/preprocessors/simple_spec.rb
|
76
|
+
- spec/svm_helper/preprocessors/stemming_spec.rb
|
77
|
+
- spec/svm_helper/selectors/bi_normal_seperation_spec.rb
|
78
|
+
- spec/svm_helper/selectors/bns_ig_spec.rb
|
79
|
+
- spec/svm_helper/selectors/calc_spec.rb
|
80
|
+
- spec/svm_helper/selectors/information_gain_spec.rb
|
81
|
+
- spec/svm_helper/selectors/n_gram_spec.rb
|
82
|
+
- spec/svm_helper/selectors/simple_spec.rb
|
83
|
+
- spec/svm_helper/selectors/with_binary_encoding_spec.rb
|
76
84
|
- svm_helper.gemspec
|
77
85
|
homepage: https://github.com/sch1zo/svm_helper
|
78
86
|
licenses: []
|
87
|
+
metadata: {}
|
79
88
|
post_install_message:
|
80
89
|
rdoc_options: []
|
81
90
|
require_paths:
|
82
91
|
- lib
|
83
92
|
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
-
none: false
|
85
93
|
requirements:
|
86
94
|
- - '>='
|
87
95
|
- !ruby/object:Gem::Version
|
88
96
|
version: '0'
|
89
|
-
segments:
|
90
|
-
- 0
|
91
|
-
hash: 2037039748537332986
|
92
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
|
-
none: false
|
94
98
|
requirements:
|
95
99
|
- - '>='
|
96
100
|
- !ruby/object:Gem::Version
|
97
101
|
version: '0'
|
98
|
-
segments:
|
99
|
-
- 0
|
100
|
-
hash: 2037039748537332986
|
101
102
|
requirements: []
|
102
103
|
rubyforge_project:
|
103
|
-
rubygems_version:
|
104
|
+
rubygems_version: 2.0.0.rc.2
|
104
105
|
signing_key:
|
105
|
-
specification_version:
|
106
|
+
specification_version: 4
|
106
107
|
summary: Preprocessor and Selector classes to generate FeatureVectors from Job data
|
107
108
|
test_files:
|
108
109
|
- spec/factories.rb
|
@@ -111,12 +112,18 @@ test_files:
|
|
111
112
|
- spec/factories/jobs/tmp3.html
|
112
113
|
- spec/factories/jobs_with_description.rb
|
113
114
|
- spec/factories/jobs_with_title.rb
|
114
|
-
- spec/preprocessors/id_mapping_spec.rb
|
115
|
-
- spec/preprocessors/simple_spec.rb
|
116
|
-
- spec/selectors/n_gram_spec.rb
|
117
|
-
- spec/selectors/simple_spec.rb
|
118
|
-
- spec/selectors/with_binary_encoding_spec.rb
|
119
115
|
- spec/spec_helper.rb
|
120
116
|
- spec/support/preprocessor_spec.rb
|
121
117
|
- spec/support/selector_spec.rb
|
118
|
+
- spec/svm_helper/parallel_helper_spec.rb
|
119
|
+
- spec/svm_helper/preprocessors/id_mapping_spec.rb
|
120
|
+
- spec/svm_helper/preprocessors/simple_spec.rb
|
121
|
+
- spec/svm_helper/preprocessors/stemming_spec.rb
|
122
|
+
- spec/svm_helper/selectors/bi_normal_seperation_spec.rb
|
123
|
+
- spec/svm_helper/selectors/bns_ig_spec.rb
|
124
|
+
- spec/svm_helper/selectors/calc_spec.rb
|
125
|
+
- spec/svm_helper/selectors/information_gain_spec.rb
|
126
|
+
- spec/svm_helper/selectors/n_gram_spec.rb
|
127
|
+
- spec/svm_helper/selectors/simple_spec.rb
|
128
|
+
- spec/svm_helper/selectors/with_binary_encoding_spec.rb
|
122
129
|
has_rdoc:
|
@@ -1,31 +0,0 @@
|
|
1
|
-
require_relative 'simple'
|
2
|
-
module Selector
|
3
|
-
#
|
4
|
-
# Selector which uses a n-gram dictionary to generate feature vectors
|
5
|
-
#
|
6
|
-
# @author Andreas Eger
|
7
|
-
#
|
8
|
-
class NGram < Selector::Simple
|
9
|
-
attr_reader :gram_size
|
10
|
-
|
11
|
-
def initialize classification, args={}
|
12
|
-
super
|
13
|
-
@gram_size = args.fetch(:gram_size) { 2 }
|
14
|
-
end
|
15
|
-
|
16
|
-
def label
|
17
|
-
"ngram"
|
18
|
-
end
|
19
|
-
#
|
20
|
-
# fetches all words snippets from one data entry, removes stopwords and very short words
|
21
|
-
# @param data [PreprocessedData]
|
22
|
-
# @param gram_size [Integer] gram size
|
23
|
-
#
|
24
|
-
# @return [Array<String>]
|
25
|
-
def extract_words_from_data data, gram_size=@gram_size
|
26
|
-
(data.data.flat_map(&:split) - stopwords)
|
27
|
-
.delete_if { |e| e.size <= 3 }
|
28
|
-
.each_cons(gram_size).map{|e| e.join " " }
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
@@ -1,41 +0,0 @@
|
|
1
|
-
require_relative 'simple'
|
2
|
-
module Selector
|
3
|
-
#
|
4
|
-
# Selector which uses a n-gram dictionary to generate feature vectors
|
5
|
-
#
|
6
|
-
# @author Andreas Eger
|
7
|
-
#
|
8
|
-
class WithBinaryEncoding < Selector::Simple
|
9
|
-
|
10
|
-
CLASSIFICATIONS_SIZE = {
|
11
|
-
function: 8, # max id 255, currently 19
|
12
|
-
industry: 16, # max id 65535, currently 14370
|
13
|
-
career_level: 4 } # max id 15, currently 8
|
14
|
-
|
15
|
-
def initialize *args
|
16
|
-
super
|
17
|
-
end
|
18
|
-
|
19
|
-
def label
|
20
|
-
"simple-WithBinaryEncoding"
|
21
|
-
end
|
22
|
-
|
23
|
-
private
|
24
|
-
#
|
25
|
-
# creates the classification specific part of the feature vector
|
26
|
-
# @param ids [Hash] hash with classification ids
|
27
|
-
#
|
28
|
-
# @return [Array<Integer>] binary encoded classification id
|
29
|
-
def classification_array(id)
|
30
|
-
number_to_binary_array(id, CLASSIFICATIONS_SIZE[@classification])
|
31
|
-
end
|
32
|
-
|
33
|
-
def number_to_binary_array(number, size=8)
|
34
|
-
a=[]
|
35
|
-
(size-1).downto(0) do |i|
|
36
|
-
a<<number[i]
|
37
|
-
end
|
38
|
-
a
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|