svm_helper 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +1 -0
- data/Guardfile +1 -1
- data/lib/svm_helper.rb +1 -2
- data/lib/svm_helper/parallel_helper.rb +24 -0
- data/lib/svm_helper/preprocessors.rb +1 -0
- data/lib/svm_helper/preprocessors/simple.rb +31 -22
- data/lib/svm_helper/preprocessors/stemming.rb +31 -0
- data/lib/svm_helper/selectors.rb +4 -2
- data/lib/svm_helper/selectors/bi_normal_seperation.rb +86 -0
- data/lib/svm_helper/selectors/bns_ig.rb +50 -0
- data/lib/svm_helper/selectors/calc.rb +71 -0
- data/lib/svm_helper/selectors/information_gain.rb +49 -0
- data/lib/svm_helper/selectors/simple.rb +80 -40
- data/lib/svm_helper/stopwords/fr +124 -0
- data/lib/svm_helper/version.rb +1 -1
- data/spec/factories.rb +4 -3
- data/spec/support/selector_spec.rb +2 -4
- data/spec/svm_helper/parallel_helper_spec.rb +17 -0
- data/spec/{preprocessors → svm_helper/preprocessors}/id_mapping_spec.rb +0 -0
- data/spec/{preprocessors → svm_helper/preprocessors}/simple_spec.rb +17 -6
- data/spec/svm_helper/preprocessors/stemming_spec.rb +11 -0
- data/spec/svm_helper/selectors/bi_normal_seperation_spec.rb +35 -0
- data/spec/svm_helper/selectors/bns_ig_spec.rb +5 -0
- data/spec/svm_helper/selectors/calc_spec.rb +42 -0
- data/spec/svm_helper/selectors/information_gain_spec.rb +5 -0
- data/spec/{selectors → svm_helper/selectors}/n_gram_spec.rb +2 -4
- data/spec/{selectors → svm_helper/selectors}/simple_spec.rb +15 -4
- data/spec/{selectors → svm_helper/selectors}/with_binary_encoding_spec.rb +2 -3
- data/svm_helper.gemspec +1 -1
- metadata +39 -32
- data/lib/svm_helper/selectors/n_gram.rb +0 -31
- data/lib/svm_helper/selectors/with_binary_encoding.rb +0 -41
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 377f21c5f2bb4431166019336b71ad3892bc35ac
|
4
|
+
data.tar.gz: 02973ce1db9e6720bbe216649b533e7f5b9d35c9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 818e5bdb6fbfb12e3ca7a0a2f19a1dae46c63646ddfb79eccba9cdc3ba5906d13e004c5bea5cef24099ea8c75a04d14a619e13fa9cfe351c9777439d056da2cc
|
7
|
+
data.tar.gz: 176044f5c9662e590855152576dee2d4f00da1a7cf123001ed9cbce5eca1624571c90494302390e8860e75ba2f83e158ab986a509b6a58f263f6ef225dfcd0c8
|
data/Gemfile
CHANGED
data/Guardfile
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
guard 'rspec', cli: "--color --format p", all_after_pass: false do
|
2
2
|
# guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
|
3
3
|
watch(%r{^spec/.+_spec\.rb$})
|
4
|
-
watch(%r{^lib/
|
4
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
|
5
5
|
watch('spec/spec_helper.rb') { 'spec' }
|
6
6
|
watch('spec/factories.rb') { 'spec' }
|
7
7
|
watch(%r{^spec/factories/(.+)\.rb}) { 'spec' }
|
data/lib/svm_helper.rb
CHANGED
@@ -0,0 +1,24 @@
|
|
1
|
+
# Mixin providing `map` variants that hand the work to the `parallel` gem when
# that gem is loaded and the including object has set `@parallel` to a truthy
# value. In every other case the block runs sequentially in the caller.
module ParallelHelper
  # Number of workers, taken from OMP_NUM_THREADS; 2 when the variable is unset.
  THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i

  # Maps each element together with its index.
  #
  # @param data [Enumerable] elements to process
  # @yield [element, index] called once per element
  # @return [Array] block results
  def p_map_with_index data, &block
    if parallel?
      Parallel.map_with_index(data, parallel_backend_options, &block)
    else
      data.map.with_index(&block)
    end
  end

  # Maps each element.
  #
  # @param data [Enumerable] elements to process
  # @yield [element] called once per element
  # @return [Array] block results
  def p_map data, &block
    if parallel?
      Parallel.map(data, parallel_backend_options, &block)
    else
      data.map(&block)
    end
  end

  # @return [Boolean] true only when the Parallel constant is defined and
  #   the including object asked for parallel execution via @parallel
  def parallel?
    gem_loaded = (defined?(Parallel) == 'constant')
    gem_loaded && @parallel ? true : false
  end

  private

  # Options passed to Parallel: threads on JRuby (RUBY_PLATFORM == 'java'),
  # processes on every other platform.
  def parallel_backend_options
    if RUBY_PLATFORM == 'java'
      { in_threads: THREAD_COUNT }
    else
      { in_processes: THREAD_COUNT }
    end
  end
end
|
@@ -6,7 +6,7 @@ module Preprocessor
|
|
6
6
|
# @author Andreas Eger
|
7
7
|
#
|
8
8
|
class Simple
|
9
|
-
|
9
|
+
include ::ParallelHelper
|
10
10
|
# filters most gender stuff
|
11
11
|
GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
|
12
12
|
# filters most weird symbols
|
@@ -25,8 +25,16 @@ module Preprocessor
|
|
25
25
|
# filter for used job tokens
|
26
26
|
CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
|
27
27
|
|
28
|
+
# stopword file
|
29
|
+
#TODO use File.expand_path
|
30
|
+
STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
|
31
|
+
attr_accessor :language
|
32
|
+
|
33
|
+
|
28
34
|
def initialize args={}
|
35
|
+
@language = args.fetch(:language){'en'}
|
29
36
|
@parallel = args.fetch(:parallel){false}
|
37
|
+
@stopwords ||= IO.read(File.join(STOPWORD_LOCATION,@language)).split
|
30
38
|
end
|
31
39
|
|
32
40
|
def label
|
@@ -48,12 +56,20 @@ module Preprocessor
|
|
48
56
|
# @return [Array<PreprocessedData>] list of processed job data - or single job data
|
49
57
|
def process jobs
|
50
58
|
if jobs.is_a? Array
|
51
|
-
|
59
|
+
p_map(jobs) {|job| process_job job }
|
52
60
|
else
|
53
61
|
process_job jobs
|
54
62
|
end
|
55
63
|
end
|
56
64
|
|
65
|
+
#
|
66
|
+
# splits a text into words and drops stopwords as well as words of two characters or fewer
|
67
|
+
# @param text [String] whitespace separated text
|
68
|
+
#
|
69
|
+
# @return [Array<String>] the remaining words
|
70
|
+
def strip_stopwords(text)
|
71
|
+
(text.split - @stopwords).delete_if { |e| e.size <= 2 }
|
72
|
+
end
|
57
73
|
|
58
74
|
#
|
59
75
|
# converts string into a cleaner version
|
@@ -75,29 +91,22 @@ module Preprocessor
|
|
75
91
|
#
|
76
92
|
# @return [String] clean and lowercase version of input
|
77
93
|
def clean_description desc
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
94
|
+
strip_stopwords(
|
95
|
+
desc.gsub(XML_TAG_FILTER,' ')
|
96
|
+
.gsub(EMAIL_FILTER,'')
|
97
|
+
.gsub(URL_FILTER,'')
|
98
|
+
.gsub(GENDER_FILTER,'')
|
99
|
+
.gsub(NEW_LINES,'')
|
100
|
+
.gsub(SYMBOL_FILTER,' ')
|
101
|
+
.gsub(WHITESPACE,' ')
|
102
|
+
.gsub(WORDS_IN_BRACKETS, '\1')
|
103
|
+
.gsub(CODE_TOKEN_FILTER,'')
|
104
|
+
.downcase
|
105
|
+
.strip
|
106
|
+
)
|
89
107
|
end
|
90
108
|
|
91
109
|
private
|
92
|
-
def process_jobs jobs
|
93
|
-
if @parallel && RUBY_PLATFORM == 'java'
|
94
|
-
Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
|
95
|
-
elsif @parallel
|
96
|
-
Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
|
97
|
-
else
|
98
|
-
jobs.map {|job| process_job job }
|
99
|
-
end
|
100
|
-
end
|
101
110
|
|
102
111
|
def process_job job
|
103
112
|
PreprocessedData.new(
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
require 'lingua/stemmer'
|
3
|
+
module Preprocessor
|
4
|
+
#
|
5
|
+
# Preprocessor that additionally stems the words of the cleaned description
|
6
|
+
#
|
7
|
+
# @author Andreas Eger
|
8
|
+
#
|
9
|
+
class Stemming < Simple
|
10
|
+
|
11
|
+
def initialize(args={})
|
12
|
+
super
|
13
|
+
@stemmer = Lingua::Stemmer.new(language: @language)
|
14
|
+
end
|
15
|
+
def label
|
16
|
+
"with_stemming"
|
17
|
+
end
|
18
|
+
|
19
|
+
def clean_description desc
|
20
|
+
super.map{|w| @stemmer.stem(w) }
|
21
|
+
end
|
22
|
+
private
|
23
|
+
def process_job job
|
24
|
+
PreprocessedData.new(
|
25
|
+
data: [clean_title(job[:title]), clean_description(job[:description])],
|
26
|
+
id: job[:id],
|
27
|
+
label: job[:label]
|
28
|
+
)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
data/lib/svm_helper/selectors.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
1
|
require_relative 'selectors/simple'
|
2
|
-
require_relative 'selectors/
|
3
|
-
require_relative 'selectors/
|
2
|
+
require_relative 'selectors/calc'
|
3
|
+
require_relative 'selectors/bi_normal_seperation'
|
4
|
+
require_relative 'selectors/information_gain'
|
5
|
+
require_relative 'selectors/bns_ig'
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Selector
|
3
|
+
#
|
4
|
+
# Feature Selection for Text Classification - HP Labs
|
5
|
+
# http://www.google.com/patents/US20040059697
|
6
|
+
#
|
7
|
+
class BiNormalSeperation < Selector::Simple
|
8
|
+
include BNS
|
9
|
+
|
10
|
+
def label
|
11
|
+
"BiNormalSeperation"
|
12
|
+
end
|
13
|
+
|
14
|
+
def initialize classification, args={}
|
15
|
+
super
|
16
|
+
@word_selection = args.fetch(:word_selection){ :grams1_2 }
|
17
|
+
end
|
18
|
+
#
|
19
|
+
# generates a list of feature vectors and their labels from preprocessed data
|
20
|
+
# @param data_set [Array<PreprocessedData>] list of preprocessed data
|
21
|
+
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
22
|
+
# @param dictionary_size [Integer] Size of a dictionary to create if none exists
|
23
|
+
#
|
24
|
+
# @return [Array<FeatureVector>] list of feature vectors and labels
|
25
|
+
def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
|
26
|
+
words_and_label_per_data = extract_words data_set, true
|
27
|
+
generate_global_dictionary words_and_label_per_data, dictionary_size
|
28
|
+
|
29
|
+
words_per_data = words_and_label_per_data.map(&:features)
|
30
|
+
p_map_with_index(words_per_data) do |words,index|
|
31
|
+
word_set = words.uniq
|
32
|
+
make_vector word_set, data_set[index]
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# generates a list of words used as dictionary
|
38
|
+
# @param all_words (see #extract_words)
|
39
|
+
# @param size dictionary size
|
40
|
+
#
|
41
|
+
# @return [Array<String>] list of words
|
42
|
+
def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
  # an existing dictionary is never overwritten
  return unless global_dictionary.empty?

  # Both tallies are indexed by label: slot 0 = negative (falsy label),
  # slot 1 = positive (truthy label).
  label_counts = [0, 0]
  features = Hash.new { |hash, word| hash[word] = [0, 0] }
  all_words.each do |bag|
    label = bag.label ? 1 : 0
    label_counts[label] += 1
    # only count a feature once per bag
    bag.features.uniq.each { |word| features[word][label] += 1 }
  end

  neg, pos = label_counts
  scored = p_map(features) do |word, counts|
    # counts uses the same slot order as label_counts, so the negative
    # count comes first; it must be passed as fp, not as tp
    fp, tp = counts
    next if fp == 0 || tp == 0 # skip words only appearing in one class
    [word, bi_normal_seperation(pos, neg, tp, fp).abs]
  end

  # keep the `size` highest scoring words, ordered by ascending score
  @global_dictionary = scored.compact
                             .sort_by { |entry| entry[1] }
                             .last(size)
                             .map { |entry| entry[0] }
end
|
69
|
+
|
70
|
+
def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
|
71
|
+
words_per_data = extract_words data_set, true
|
72
|
+
generate_global_dictionary words_per_data, dictionary_size
|
73
|
+
end
|
74
|
+
#
|
75
|
+
# extracts the words of all provided data entries
|
76
|
+
# @param data_set [Array<PreprocessedData>] list of preprocessed data
|
77
|
+
# @param keep_label
|
78
|
+
#
|
79
|
+
# @return [Array<OpenStruct<Array<String>,Boolean>>] list of words per data entry
|
80
|
+
def extract_words data_set, keep_label=false
|
81
|
+
data_set.map do |data|
|
82
|
+
extract_words_from_data data, keep_label
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require_relative 'bi_normal_seperation'
|
2
|
+
module Selector
|
3
|
+
#
|
4
|
+
# Feature Selection for Text Classification - HP Labs
|
5
|
+
# http://www.google.com/patents/US20040059697
|
6
|
+
#
|
7
|
+
class BNS_IG < Selector::BiNormalSeperation
|
8
|
+
include IG
|
9
|
+
|
10
|
+
def label
|
11
|
+
"BiNormalSeperation_InformationGain"
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# generates a list of words used as dictionary
|
16
|
+
# @param all_words (see #extract_words)
|
17
|
+
# @param size dictionary size
|
18
|
+
#
|
19
|
+
# @return [Array<String>] list of words
|
20
|
+
def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
  # an existing dictionary is never overwritten
  return unless global_dictionary.empty?

  # Both tallies are indexed by label: slot 0 = negative (falsy label),
  # slot 1 = positive (truthy label).
  label_counts = [0, 0]
  features = Hash.new { |hash, word| hash[word] = [0, 0] }
  all_words.each do |bag|
    label = bag.label ? 1 : 0
    label_counts[label] += 1
    # only count a feature once per bag
    bag.features.uniq.each { |word| features[word][label] += 1 }
  end

  neg, pos = label_counts
  scored = p_map(features) do |word, counts|
    # counts uses the same slot order as label_counts, so the negative
    # count comes first; it must be passed as fp, not as tp
    fp, tp = counts
    next if fp == 0 || tp == 0 # skip words only appearing in one class
    bns = bi_normal_seperation(pos, neg, tp, fp)
    ig = information_gain(pos, neg, tp, fp)
    # use geometric mean of BNS and IG
    [word, Math.sqrt(bns.abs * ig.abs)]
  end

  # keep the `size` highest scoring words, ordered by ascending score
  @global_dictionary = scored.compact
                             .sort_by { |entry| entry[1] }
                             .last(size)
                             .map { |entry| entry[0] }
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Selector
|
2
|
+
# Information gain of a binary feature (word present / absent) with respect
# to a binary class label (positive / negative).
module IG
  # @param pos [Integer] number of positive documents
  # @param neg [Integer] number of negative documents
  # @param tp [Integer] positive documents containing the feature
  # @param fp [Integer] negative documents containing the feature
  # @return [Float] class entropy minus the entropy left after splitting
  #   the documents by presence of the feature
  # @raise [ZeroDivisionError] when pos + neg is the Integer 0
  def information_gain(pos, neg, tp, fp)
    fn = pos - tp # positive documents without the feature
    tn = neg - fp # negative documents without the feature
    p_word = (tp + fp).quo(pos + neg)

    e(pos, neg) - (p_word * e(tp, fp) + (1 - p_word) * e(fn, tn))
  end

  # Binary entropy (in bits) of a partition holding x and y items.
  #
  # @return [Float] 0.0 for an empty partition, which would otherwise
  #   divide by zero
  def e(x, y)
    total = x + y
    return 0.0 if total.zero?
    -xlx(x.quo(total)) - xlx(y.quo(total))
  end

  # x * log2(x), using the limit value 0 at x = 0. The plain product
  # would be 0 * -Infinity = NaN and poison every score built from it.
  def xlx(x)
    return 0.0 if x.zero?
    x * Math.log2(x)
  end
end
|
17
|
+
# Bi-Normal Separation: the distance between the standard normal quantiles
# of a feature's true positive rate and false positive rate.
module BNS
  SQR2 = Math.sqrt(2)
  SQR2PI = Math.sqrt(2.0*Math::PI)

  # Rates are limited to [RATE_MIN, RATE_MAX] before the inverse CDF is
  # applied, because the quantile is unbounded at 0 and 1 (Forman's BNS
  # definition substitutes 0.0005 for a zero rate).
  RATE_MIN = 0.0005
  RATE_MAX = 1.0 - RATE_MIN

  # @param pos [Integer] number of positive documents
  # @param neg [Integer] number of negative documents
  # @param tp [Integer] positive documents containing the feature
  # @param fp [Integer] negative documents containing the feature
  # @return [Float] signed BNS score; callers take the absolute value
  # @raise [ZeroDivisionError] when pos or neg is the Integer 0
  def bi_normal_seperation pos, neg, tp, fp
    true_positive_rate = bounded_rate(tp, pos)
    false_positive_rate = bounded_rate(fp, neg)
    cdf_inverse(true_positive_rate) - cdf_inverse(false_positive_rate)
  end

  # standard normal cumulative distribution function
  # (written with erfc so the lower tail does not lose digits to cancellation)
  def cdf(z)
    0.5 * Math.erfc(-z / SQR2)
  end

  # inverse standard normal cumulative distribution function
  # http://home.online.no/~pjacklam/notes/invnorm

  # Coefficients in rational approximations.
  A = [0, -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00]
  B = [0, -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01]
  C = [0, -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00]
  D = [0, 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00]
  # Define break-points.
  P_LOW = 0.02425
  P_HIGH = 1.0 - P_LOW

  # @param p [Numeric] probability
  # @return [Float] z with cdf(z) == p for 0 < p < 1; 0.0 for p == 0.5,
  #   and also 0.0 for p <= 0 or p >= 1, where no finite quantile exists
  def cdf_inverse(p)
    return 0.0 if p < 0 || p > 1 || p == 0.5
    x = 0.0

    if 0.0 < p && p < P_LOW
      # Rational approximation for lower region.
      q = Math.sqrt(-2.0*Math.log(p))
      x = (((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
          ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
    elsif P_LOW <= p && p <= P_HIGH
      # Rational approximation for central region.
      q = p - 0.5
      r = q*q
      x = (((((A[1]*r+A[2])*r+A[3])*r+A[4])*r+A[5])*r+A[6])*q /
          (((((B[1]*r+B[2])*r+B[3])*r+B[4])*r+B[5])*r+1.0)
    elsif P_HIGH < p && p < 1.0
      # Rational approximation for upper region.
      q = Math.sqrt(-2.0*Math.log(1.0-p))
      x = -(((((C[1]*q+C[2])*q+C[3])*q+C[4])*q+C[5])*q+C[6]) /
           ((((D[1]*q+D[2])*q+D[3])*q+D[4])*q+1.0)
    end
    if 0 < p && p < 1
      # One refinement step of Halley's method. The correction is driven by
      # the residual cdf(x) - p of the approximation, not by cdf(p).
      residual = cdf(x) - p
      u = residual * SQR2PI * Math.exp((x**2.0)/2.0)
      x = x - u/(1.0 + x*u/2.0)
    end
    x
  end

  private

  # count / total as a Float limited to [RATE_MIN, RATE_MAX]
  def bounded_rate(count, total)
    [[count.quo(total).to_f, RATE_MIN].max, RATE_MAX].min
  end
end
|
71
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require_relative 'bi_normal_seperation'
|
2
|
+
module Selector
|
3
|
+
#
|
4
|
+
# Feature Selection for Text Classification - HP Labs
|
5
|
+
# http://www.google.com/patents/US20040059697
|
6
|
+
#
|
7
|
+
class InformationGain < Selector::BiNormalSeperation
|
8
|
+
include IG
|
9
|
+
|
10
|
+
def label
|
11
|
+
"InformationGain"
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# generates a list of words used as dictionary
|
16
|
+
# @param all_words (see #extract_words)
|
17
|
+
# @param size dictionary size
|
18
|
+
#
|
19
|
+
# @return [Array<String>] list of words
|
20
|
+
def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
  # an existing dictionary is never overwritten
  return unless global_dictionary.empty?

  # Both tallies are indexed by label: slot 0 = negative (falsy label),
  # slot 1 = positive (truthy label).
  label_counts = [0, 0]
  features = Hash.new { |hash, word| hash[word] = [0, 0] }
  all_words.each do |bag|
    label = bag.label ? 1 : 0
    label_counts[label] += 1
    # only count a feature once per bag
    bag.features.uniq.each { |word| features[word][label] += 1 }
  end

  neg, pos = label_counts
  scored = p_map(features) do |word, counts|
    # counts uses the same slot order as label_counts, so the negative
    # count comes first; reading it as tp would compare a word's negative
    # count against the positive total
    fp, tp = counts
    next if fp == 0 || tp == 0 # skip words only appearing in one class
    [word, information_gain(pos, neg, tp, fp).abs]
  end

  # keep the `size` highest scoring words, ordered by ascending score
  @global_dictionary = scored.compact
                             .sort_by { |entry| entry[1] }
                             .last(size)
                             .map { |entry| entry[0] }
end
|
48
|
+
end
|
49
|
+
end
|
@@ -5,31 +5,20 @@ module Selector
|
|
5
5
|
# @author Andreas Eger
|
6
6
|
#
|
7
7
|
class Simple
|
8
|
-
|
9
|
-
# stopword file
|
10
|
-
#TODO use File.expand_path
|
11
|
-
STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
|
8
|
+
include ::ParallelHelper
|
12
9
|
# default dictionary size
|
13
10
|
DEFAULT_DICTIONARY_SIZE = 800
|
14
11
|
|
15
|
-
CLASSIFICATIONS_SIZE= if defined?(Pjpp) == 'constant'
|
16
|
-
{ function: Pjpp::Function.count,
|
17
|
-
industry: Pjpp::Industry.count,
|
18
|
-
career_level: Pjpp::CareerLevel.count }
|
19
|
-
else
|
20
|
-
{ function: 19, # 1..19
|
21
|
-
industry: 632, # 1..14370 but not all ids used
|
22
|
-
career_level: 8 } # 1..8
|
23
|
-
end
|
24
|
-
|
25
|
-
|
26
|
-
|
27
12
|
attr_accessor :global_dictionary
|
28
|
-
|
13
|
+
attr_reader :classification_encoding,
|
14
|
+
:gram_size,
|
15
|
+
:word_selection
|
29
16
|
def initialize classification, args={}
|
30
17
|
@classification = classification
|
31
18
|
@global_dictionary = args.fetch(:global_dictionary) {[]}
|
32
|
-
@
|
19
|
+
@classification_encoding = args.fetch(:classification_encoding){:bitmap}
|
20
|
+
@word_selection = args.fetch(:word_selection){ :single }
|
21
|
+
@gram_size = args.fetch(:gram_size) { 1 }
|
33
22
|
@parallel = args.fetch(:parallel){false}
|
34
23
|
end
|
35
24
|
|
@@ -48,7 +37,7 @@ module Selector
|
|
48
37
|
words_per_data = extract_words data_set
|
49
38
|
generate_global_dictionary words_per_data, dictionary_size
|
50
39
|
|
51
|
-
|
40
|
+
p_map_with_index(words_per_data) do |words,index|
|
52
41
|
word_set = words.uniq
|
53
42
|
make_vector word_set, data_set[index]
|
54
43
|
end
|
@@ -66,15 +55,6 @@ module Selector
|
|
66
55
|
make_vector word_set, data, dictionary
|
67
56
|
end
|
68
57
|
|
69
|
-
#
|
70
|
-
# loads a txt file with stop words
|
71
|
-
# @param location String folder with stopword lists
|
72
|
-
#
|
73
|
-
# @return [Array<String>] Array of stopwords
|
74
|
-
def stopwords(location=STOPWORD_LOCATION)
|
75
|
-
@stopwords ||= IO.read(File.join(location,@language)).split
|
76
|
-
end
|
77
|
-
|
78
58
|
#
|
79
59
|
# generates a list of words used as dictionary
|
80
60
|
# @param all_words (see #extract_words)
|
@@ -90,6 +70,10 @@ module Selector
|
|
90
70
|
@global_dictionary = words.last(size).map(&:first).reverse
|
91
71
|
end
|
92
72
|
|
73
|
+
def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
|
74
|
+
words_per_data = extract_words data_set
|
75
|
+
generate_global_dictionary words_per_data, dictionary_size
|
76
|
+
end
|
93
77
|
#
|
94
78
|
# extracts the words of all provided data entries
|
95
79
|
# @param data_set [Array<PreprocessedData>] list of preprocessed data
|
@@ -107,7 +91,46 @@ module Selector
|
|
107
91
|
#
|
108
92
|
# @return [Array<String>] list of words
|
109
93
|
def extract_words_from_data data
|
110
|
-
(data.data.flat_map(&:split) - stopwords)
|
94
|
+
words = (data.data.flat_map(&:split) - stopwords)
|
95
|
+
.delete_if { |e| e.size <= 2 }
|
96
|
+
if gram_size > 1
|
97
|
+
words = words.each_cons(@gram_size).map{|e| e.join " " }
|
98
|
+
end
|
99
|
+
words
|
100
|
+
end
|
101
|
+
|
102
|
+
#
|
103
|
+
# builds the feature list of one data entry: the title token followed by the words or n-grams chosen by word_selection
|
104
|
+
# @param data [PreprocessedData] preprocessed data entry
|
105
|
+
# @param keep_label
|
106
|
+
#
|
107
|
+
# @return [OpenStruct<Array<String>,Boolean>] list of words
|
108
|
+
def extract_words_from_data data, keep_label=false
|
109
|
+
# assume the first token is the title and preserve it
|
110
|
+
title, *words = data.data.flatten
|
111
|
+
features = case word_selection
|
112
|
+
when :grams
|
113
|
+
words.each_cons(@gram_size).map{|e| e.join " " }
|
114
|
+
when :grams1_2
|
115
|
+
words + words.each_cons(2).map{|e| e.join " " }
|
116
|
+
when :grams1_2_3
|
117
|
+
words +
|
118
|
+
words.each_cons(2).map{|e| e.join " " } +
|
119
|
+
words.each_cons(3).map{|e| e.join " " }
|
120
|
+
when :grams1_2_3_4
|
121
|
+
words +
|
122
|
+
words.each_cons(2).map{|e| e.join " " } +
|
123
|
+
words.each_cons(3).map{|e| e.join " " } +
|
124
|
+
words.each_cons(4).map{|e| e.join " " }
|
125
|
+
else
|
126
|
+
words
|
127
|
+
end
|
128
|
+
features.unshift(title)
|
129
|
+
return features unless keep_label
|
130
|
+
OpenStruct.new(
|
131
|
+
features: features,
|
132
|
+
label: data.label
|
133
|
+
)
|
111
134
|
end
|
112
135
|
|
113
136
|
def reset classification
|
@@ -135,23 +158,40 @@ module Selector
|
|
135
158
|
)
|
136
159
|
end
|
137
160
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
161
|
+
BITMAP_ARRAY_SIZES= if defined?(Pjpp) == 'constant'
|
162
|
+
{ function: Pjpp::Function.count,
|
163
|
+
industry: Pjpp::Industry.count,
|
164
|
+
career_level: Pjpp::CareerLevel.count }
|
165
|
+
else
|
166
|
+
{ function: 19, # 1..19
|
167
|
+
industry: 632, # 1..14370 but not all ids used
|
168
|
+
career_level: 8 } # 1..8
|
169
|
+
end
|
147
170
|
|
171
|
+
BINARY_ARRAY_SIZES = {
|
172
|
+
function: 8, # max id 255, currently 19
|
173
|
+
industry: 16, # max id 65535, currently 14370
|
174
|
+
career_level: 4 } # max id 15, currently 8
|
148
175
|
#
|
149
176
|
# creates the classification specific part of the feature vector
|
150
177
|
# @param ids [Hash] hash with classification ids
|
151
178
|
#
|
152
179
|
# @return [Array<Integer>] bitmap of size=count(classification_ids) with a single 1, or the binary digits of the id when the encoding is :binary
|
153
180
|
def classification_array(id)
|
154
|
-
|
181
|
+
case @classification_encoding
|
182
|
+
when :binary
|
183
|
+
number_to_binary_array(id, BINARY_ARRAY_SIZES[@classification])
|
184
|
+
else # :bitmap
|
185
|
+
Array.new(BITMAP_ARRAY_SIZES[@classification]){|n| n==(id-1) ? 1 : 0}
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
def number_to_binary_array(number, size=8)
|
190
|
+
a=[]
|
191
|
+
(size-1).downto(0) do |i|
|
192
|
+
a<<number[i]
|
193
|
+
end
|
194
|
+
a
|
155
195
|
end
|
156
196
|
end
|
157
|
-
end
|
197
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
alors
|
2
|
+
au
|
3
|
+
aucuns
|
4
|
+
aussi
|
5
|
+
autre
|
6
|
+
avant
|
7
|
+
avec
|
8
|
+
avoir
|
9
|
+
bon
|
10
|
+
car
|
11
|
+
ce
|
12
|
+
cela
|
13
|
+
ces
|
14
|
+
ceux
|
15
|
+
chaque
|
16
|
+
ci
|
17
|
+
comme
|
18
|
+
comment
|
19
|
+
dans
|
20
|
+
des
|
21
|
+
du
|
22
|
+
dedans
|
23
|
+
dehors
|
24
|
+
depuis
|
25
|
+
deux
|
26
|
+
devrait
|
27
|
+
doit
|
28
|
+
donc
|
29
|
+
dos
|
30
|
+
droite
|
31
|
+
début
|
32
|
+
elle
|
33
|
+
elles
|
34
|
+
en
|
35
|
+
encore
|
36
|
+
essai
|
37
|
+
est
|
38
|
+
et
|
39
|
+
eu
|
40
|
+
fait
|
41
|
+
faites
|
42
|
+
fois
|
43
|
+
font
|
44
|
+
force
|
45
|
+
haut
|
46
|
+
hors
|
47
|
+
ici
|
48
|
+
il
|
49
|
+
ils
|
50
|
+
je juste
|
51
|
+
la
|
52
|
+
le
|
53
|
+
les
|
54
|
+
leur
|
55
|
+
là
|
56
|
+
ma
|
57
|
+
maintenant
|
58
|
+
mais
|
59
|
+
mes
|
60
|
+
mine
|
61
|
+
moins
|
62
|
+
mon
|
63
|
+
mot
|
64
|
+
même
|
65
|
+
ni
|
66
|
+
nommés
|
67
|
+
notre
|
68
|
+
nous
|
69
|
+
nouveaux
|
70
|
+
ou
|
71
|
+
où
|
72
|
+
par
|
73
|
+
parce
|
74
|
+
parole
|
75
|
+
pas
|
76
|
+
personnes
|
77
|
+
peut
|
78
|
+
peu
|
79
|
+
pièce
|
80
|
+
plupart
|
81
|
+
pour
|
82
|
+
pourquoi
|
83
|
+
quand
|
84
|
+
que
|
85
|
+
quel
|
86
|
+
quelle
|
87
|
+
quelles
|
88
|
+
quels
|
89
|
+
qui
|
90
|
+
sa
|
91
|
+
sans
|
92
|
+
ses
|
93
|
+
seulement
|
94
|
+
si
|
95
|
+
sien
|
96
|
+
son
|
97
|
+
sont
|
98
|
+
sous
|
99
|
+
soyez sujet
|
100
|
+
sur
|
101
|
+
ta
|
102
|
+
tandis
|
103
|
+
tellement
|
104
|
+
tels
|
105
|
+
tes
|
106
|
+
ton
|
107
|
+
tous
|
108
|
+
tout
|
109
|
+
trop
|
110
|
+
très
|
111
|
+
tu
|
112
|
+
valeur
|
113
|
+
voie
|
114
|
+
voient
|
115
|
+
vont
|
116
|
+
votre
|
117
|
+
vous
|
118
|
+
vu
|
119
|
+
ça
|
120
|
+
étaient
|
121
|
+
état
|
122
|
+
étions
|
123
|
+
été
|
124
|
+
être
|
data/lib/svm_helper/version.rb
CHANGED
data/spec/factories.rb
CHANGED
@@ -18,14 +18,15 @@ FactoryGirl.define do
|
|
18
18
|
|
19
19
|
|
20
20
|
factory :data, class: PreprocessedData do
|
21
|
-
data
|
21
|
+
data %w(haus fooo garten baaz pferd fooo)
|
22
22
|
id 7
|
23
23
|
label true
|
24
24
|
end
|
25
25
|
factory :data_w_short_words, parent: :data do
|
26
|
-
data
|
26
|
+
data %w(auto pferd gooo fooo)
|
27
|
+
label false
|
27
28
|
end
|
28
29
|
factory :data_w_multiple_sections, parent: :data do
|
29
|
-
data [
|
30
|
+
data [%w(meeh auto),%w(baaz fooo)]
|
30
31
|
end
|
31
32
|
end
|
@@ -13,9 +13,7 @@ shared_examples_for 'a selector' do
|
|
13
13
|
[0,1].should include(e)
|
14
14
|
end
|
15
15
|
end
|
16
|
-
it "should
|
17
|
-
selector.generate_vectors
|
18
|
-
e.should == selector.generate_vector(data)
|
19
|
-
end
|
16
|
+
it "should respond to generate_vectors" do
|
17
|
+
selector.should respond_to(:generate_vectors)
|
20
18
|
end
|
21
19
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'parallel'
|
3
|
+
|
4
|
+
include ParallelHelper
|
5
|
+
describe ParallelHelper do
|
6
|
+
let(:data) { (1..20).to_a }
|
7
|
+
context "parallel map" do
|
8
|
+
it "should return as a normal map" do
|
9
|
+
p_map(data){|e| e**2 }.should == data.map{|e| e**2 }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
context "parallel map with index" do
|
13
|
+
it "should return as a normal map with index" do
|
14
|
+
p_map_with_index(data){|e,i| e*i }.should == data.map.with_index{|e,i| e*i }
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
File without changes
|
@@ -41,6 +41,7 @@ describe Preprocessor::Simple do
|
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
44
|
+
|
44
45
|
context "#clean_title" do
|
45
46
|
it "should be downcased" do
|
46
47
|
job = FactoryGirl.build(:job_title_downcasing)
|
@@ -75,31 +76,41 @@ describe Preprocessor::Simple do
|
|
75
76
|
FactoryGirl.build(:job_description_w_code_token),
|
76
77
|
FactoryGirl.build(:job_description_w_gender) ]
|
77
78
|
}
|
79
|
+
it "should call strip_stopwords" do
|
80
|
+
simple.expects(:strip_stopwords)
|
81
|
+
simple.clean_description(jobs[0][:description])
|
82
|
+
end
|
78
83
|
it "should remove html/xml tags" do
|
79
|
-
desc = simple.clean_description(jobs[0][:description])
|
84
|
+
desc = simple.clean_description(jobs[0][:description]).join ' '
|
80
85
|
desc.should_not match(/<(.*?)>/)
|
81
86
|
end
|
82
87
|
it "should remove new lines" do
|
83
|
-
desc = simple.clean_description(jobs[0][:description])
|
88
|
+
desc = simple.clean_description(jobs[0][:description]).join ' '
|
84
89
|
desc.should_not match(/\r\n|\n|\r/)
|
85
90
|
end
|
86
91
|
it "should remove all special characters" do
|
87
|
-
desc = simple.clean_description(jobs[2][:description])
|
92
|
+
desc = simple.clean_description(jobs[2][:description]).join ' '
|
88
93
|
desc.should_not match(/[^a-z öäü]/i)
|
89
94
|
end
|
90
95
|
it "should remove gender tokens" do
|
91
|
-
desc = simple.clean_description(jobs[3][:description])
|
96
|
+
desc = simple.clean_description(jobs[3][:description]).join ' '
|
92
97
|
desc.should_not match(%r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)})
|
93
98
|
end
|
94
99
|
it "should remove job code token" do
|
95
|
-
desc = simple.clean_description(jobs[4][:description])
|
100
|
+
desc = simple.clean_description(jobs[4][:description]).join ' '
|
96
101
|
desc.should_not match(/\[.*\]|\(.*\)|\{.*\}|\d+\w+/)
|
97
102
|
end
|
98
103
|
it "should be downcased" do
|
99
|
-
desc = simple.clean_description(jobs[2][:description])
|
104
|
+
desc = simple.clean_description(jobs[2][:description]).join ' '
|
100
105
|
desc.should_not match(/[^a-z öäü]/)
|
101
106
|
end
|
102
107
|
end
|
108
|
+
|
109
|
+
context "strip_stopwords" do
|
110
|
+
it "should remove words like 'and' from the text" do
|
111
|
+
simple.strip_stopwords("Dogs and cats").should == %w(Dogs cats)
|
112
|
+
end
|
113
|
+
end
|
103
114
|
context "parallel" do
|
104
115
|
let(:parallel) { Preprocessor::Simple.new(parallel: true) }
|
105
116
|
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Preprocessor::Stemming do
|
4
|
+
it_behaves_like 'a preprocessor'
|
5
|
+
let(:preprocessor) { Preprocessor::Stemming.new }
|
6
|
+
let(:job) { FactoryGirl.build(:job) }
|
7
|
+
let(:jobs) { [job] }
|
8
|
+
it "should reduce words to their stem" do
|
9
|
+
preprocessor.clean_description("developer engineering").should == %w(develop engin)
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe Selector::BiNormalSeperation do
|
4
|
+
it_behaves_like 'a selector'
|
5
|
+
|
6
|
+
let(:bns) { Selector::BiNormalSeperation.new(:function) }
|
7
|
+
context "#extract_words_from_data" do
|
8
|
+
it "should generate a list of words from the data" do
|
9
|
+
words = bns.extract_words_from_data(FactoryGirl.build(:data))
|
10
|
+
words.should have(10).things
|
11
|
+
end
|
12
|
+
it "should remove words with 3 characters or less" do
|
13
|
+
words = bns.extract_words_from_data(FactoryGirl.build(:data_w_short_words))
|
14
|
+
words.should have(6).things
|
15
|
+
end
|
16
|
+
it "should process multiple sections in the data" do
|
17
|
+
words = bns.extract_words_from_data(FactoryGirl.build(:data_w_multiple_sections))
|
18
|
+
words.should have(6).things
|
19
|
+
end
|
20
|
+
end
|
21
|
+
context "#generate_global_dictionary" do
|
22
|
+
let(:data) { [FactoryGirl.build_list(:data,1),
|
23
|
+
FactoryGirl.build_list(:data_w_short_words,4),
|
24
|
+
FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
|
25
|
+
let(:words_per_data) { bns.extract_words(data,true) }
|
26
|
+
it "should return a list of n words" do
|
27
|
+
bns.generate_global_dictionary(words_per_data,2)
|
28
|
+
bns.global_dictionary.should have(2).things
|
29
|
+
end
|
30
|
+
it "should return a list of the n most used words in the data array" do
|
31
|
+
bns.generate_global_dictionary(words_per_data,3)
|
32
|
+
bns.global_dictionary.should eq(%w(fooo auto pferd))
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# just some very basic test to make sure these functions do not fail
|
4
|
+
describe "Calc" do
|
5
|
+
include Selector::IG
|
6
|
+
include Selector::BNS
|
7
|
+
let(:test_data){ [
|
8
|
+
[34, 23, 28, 17],
|
9
|
+
[31, 17, 23, 12],
|
10
|
+
[44, 39, 41, 36],
|
11
|
+
[44, 23, 41, 23],
|
12
|
+
[44, 39, 0, 36],
|
13
|
+
[44, 39, 41, 0],
|
14
|
+
[62, 81, 15, 73]
|
15
|
+
]}
|
16
|
+
|
17
|
+
context Selector::IG do
|
18
|
+
it "should not fail" do
|
19
|
+
test_data.each do |data|
|
20
|
+
->{information_gain(*data)}.should_not raise_error
|
21
|
+
end
|
22
|
+
end
|
23
|
+
it "should return some values" do
|
24
|
+
test_data.each do |data|
|
25
|
+
information_gain(*data).should be_a(Numeric)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
context Selector::BNS do
|
31
|
+
it "should not fail" do
|
32
|
+
test_data.each do |data|
|
33
|
+
->{bi_normal_seperation(*data)}.should_not raise_error
|
34
|
+
end
|
35
|
+
end
|
36
|
+
it "should return some values" do
|
37
|
+
test_data.each do |data|
|
38
|
+
bi_normal_seperation(*data).should be_a(Numeric)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -1,9 +1,7 @@
|
|
1
1
|
require "spec_helper"
|
2
2
|
|
3
|
-
describe
|
4
|
-
|
5
|
-
|
6
|
-
let(:ngram) { Selector::NGram.new(:function, gram_size: 3) }
|
3
|
+
describe "n-grams" do
|
4
|
+
let(:ngram) { Selector::Simple.new(:function, word_selection: :grams, gram_size: 3) }
|
7
5
|
context "#extract_words_from_data" do
|
8
6
|
it "should generate a list of words from the data" do
|
9
7
|
words = ngram.extract_words_from_data(FactoryGirl.build(:data))
|
@@ -7,9 +7,6 @@ describe Selector::Simple do
|
|
7
7
|
it "should have select_feature_vector implemented" do
|
8
8
|
expect { simple.generate_vectors([]) }.to_not raise_error
|
9
9
|
end
|
10
|
-
context "#stopwords" do
|
11
|
-
it "simply loads them from a file"
|
12
|
-
end
|
13
10
|
context "#extract_words_from_data" do
|
14
11
|
it "should generate a list of words from the data" do
|
15
12
|
words = simple.extract_words_from_data(FactoryGirl.build(:data))
|
@@ -50,6 +47,19 @@ describe Selector::Simple do
|
|
50
47
|
simple.global_dictionary.should eq(%w(fooo auto baaz))
|
51
48
|
end
|
52
49
|
end
|
50
|
+
context "#build_dictionary" do
|
51
|
+
let(:data) { [FactoryGirl.build_list(:data,1),
|
52
|
+
FactoryGirl.build_list(:data_w_short_words,2),
|
53
|
+
FactoryGirl.build_list(:data_w_multiple_sections,3)].flatten }
|
54
|
+
it "should return a list of n words" do
|
55
|
+
simple.build_dictionary(data,2)
|
56
|
+
simple.global_dictionary.should have(2).things
|
57
|
+
end
|
58
|
+
it "should return a list of the n most used words in the data array" do
|
59
|
+
simple.build_dictionary(data,3)
|
60
|
+
simple.global_dictionary.should eq(%w(fooo auto baaz))
|
61
|
+
end
|
62
|
+
end
|
53
63
|
context "#generate_vector" do
|
54
64
|
let(:dictionary) { %w(auto pferd haus hase garten) }
|
55
65
|
let(:data) { FactoryGirl.build(:data) }
|
@@ -109,6 +119,7 @@ describe Selector::Simple do
|
|
109
119
|
context "parallel" do
|
110
120
|
let(:parallel) { Selector::Simple.new(:function, parallel: true) }
|
111
121
|
before(:each) do
|
122
|
+
require 'parallel'
|
112
123
|
simple.stubs(:global_dictionary).returns(dictionary)
|
113
124
|
parallel.stubs(:global_dictionary).returns(dictionary)
|
114
125
|
end
|
@@ -119,4 +130,4 @@ describe Selector::Simple do
|
|
119
130
|
end
|
120
131
|
end
|
121
132
|
end
|
122
|
-
end
|
133
|
+
end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
require "spec_helper"
|
2
2
|
|
3
|
-
describe
|
4
|
-
|
5
|
-
let(:simple) { Selector::WithBinaryEncoding.new(:career_level) }
|
3
|
+
describe "binary encoded classification" do
|
4
|
+
let(:simple) { Selector::Simple.new(:career_level, classification_encoding: :binary) }
|
6
5
|
|
7
6
|
let(:dictionary) { %w(auto pferd haus hase garten) }
|
8
7
|
let(:data) { FactoryGirl.build(:data) }
|
data/svm_helper.gemspec
CHANGED
metadata
CHANGED
@@ -1,32 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: svm_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.2.1
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Andreas Eger
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-04-25 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
14
|
+
name: ruby-stemmer
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version: 0
|
19
|
+
version: '0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - '>='
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version: 0
|
26
|
+
version: '0'
|
30
27
|
description: Shared helper classes for usage in context of SVM at experteer
|
31
28
|
email:
|
32
29
|
- dev@eger-andreas.de
|
@@ -48,16 +45,21 @@ files:
|
|
48
45
|
- lib/svm_helper.rb
|
49
46
|
- lib/svm_helper/feature_vector.rb
|
50
47
|
- lib/svm_helper/interface_helper.rb
|
48
|
+
- lib/svm_helper/parallel_helper.rb
|
51
49
|
- lib/svm_helper/preprocessed_data.rb
|
52
50
|
- lib/svm_helper/preprocessors.rb
|
53
51
|
- lib/svm_helper/preprocessors/id_mapping.rb
|
54
52
|
- lib/svm_helper/preprocessors/simple.rb
|
53
|
+
- lib/svm_helper/preprocessors/stemming.rb
|
55
54
|
- lib/svm_helper/selectors.rb
|
56
|
-
- lib/svm_helper/selectors/
|
55
|
+
- lib/svm_helper/selectors/bi_normal_seperation.rb
|
56
|
+
- lib/svm_helper/selectors/bns_ig.rb
|
57
|
+
- lib/svm_helper/selectors/calc.rb
|
58
|
+
- lib/svm_helper/selectors/information_gain.rb
|
57
59
|
- lib/svm_helper/selectors/simple.rb
|
58
|
-
- lib/svm_helper/selectors/with_binary_encoding.rb
|
59
60
|
- lib/svm_helper/stopwords/de
|
60
61
|
- lib/svm_helper/stopwords/en
|
62
|
+
- lib/svm_helper/stopwords/fr
|
61
63
|
- lib/svm_helper/version.rb
|
62
64
|
- spec/factories.rb
|
63
65
|
- spec/factories/jobs/tmp.html
|
@@ -65,44 +67,43 @@ files:
|
|
65
67
|
- spec/factories/jobs/tmp3.html
|
66
68
|
- spec/factories/jobs_with_description.rb
|
67
69
|
- spec/factories/jobs_with_title.rb
|
68
|
-
- spec/preprocessors/id_mapping_spec.rb
|
69
|
-
- spec/preprocessors/simple_spec.rb
|
70
|
-
- spec/selectors/n_gram_spec.rb
|
71
|
-
- spec/selectors/simple_spec.rb
|
72
|
-
- spec/selectors/with_binary_encoding_spec.rb
|
73
70
|
- spec/spec_helper.rb
|
74
71
|
- spec/support/preprocessor_spec.rb
|
75
72
|
- spec/support/selector_spec.rb
|
73
|
+
- spec/svm_helper/parallel_helper_spec.rb
|
74
|
+
- spec/svm_helper/preprocessors/id_mapping_spec.rb
|
75
|
+
- spec/svm_helper/preprocessors/simple_spec.rb
|
76
|
+
- spec/svm_helper/preprocessors/stemming_spec.rb
|
77
|
+
- spec/svm_helper/selectors/bi_normal_seperation_spec.rb
|
78
|
+
- spec/svm_helper/selectors/bns_ig_spec.rb
|
79
|
+
- spec/svm_helper/selectors/calc_spec.rb
|
80
|
+
- spec/svm_helper/selectors/information_gain_spec.rb
|
81
|
+
- spec/svm_helper/selectors/n_gram_spec.rb
|
82
|
+
- spec/svm_helper/selectors/simple_spec.rb
|
83
|
+
- spec/svm_helper/selectors/with_binary_encoding_spec.rb
|
76
84
|
- svm_helper.gemspec
|
77
85
|
homepage: https://github.com/sch1zo/svm_helper
|
78
86
|
licenses: []
|
87
|
+
metadata: {}
|
79
88
|
post_install_message:
|
80
89
|
rdoc_options: []
|
81
90
|
require_paths:
|
82
91
|
- lib
|
83
92
|
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
-
none: false
|
85
93
|
requirements:
|
86
94
|
- - '>='
|
87
95
|
- !ruby/object:Gem::Version
|
88
96
|
version: '0'
|
89
|
-
segments:
|
90
|
-
- 0
|
91
|
-
hash: 2037039748537332986
|
92
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
|
-
none: false
|
94
98
|
requirements:
|
95
99
|
- - '>='
|
96
100
|
- !ruby/object:Gem::Version
|
97
101
|
version: '0'
|
98
|
-
segments:
|
99
|
-
- 0
|
100
|
-
hash: 2037039748537332986
|
101
102
|
requirements: []
|
102
103
|
rubyforge_project:
|
103
|
-
rubygems_version:
|
104
|
+
rubygems_version: 2.0.0.rc.2
|
104
105
|
signing_key:
|
105
|
-
specification_version:
|
106
|
+
specification_version: 4
|
106
107
|
summary: Preprocessor and Selector classes to generate FeatureVectors from Job data
|
107
108
|
test_files:
|
108
109
|
- spec/factories.rb
|
@@ -111,12 +112,18 @@ test_files:
|
|
111
112
|
- spec/factories/jobs/tmp3.html
|
112
113
|
- spec/factories/jobs_with_description.rb
|
113
114
|
- spec/factories/jobs_with_title.rb
|
114
|
-
- spec/preprocessors/id_mapping_spec.rb
|
115
|
-
- spec/preprocessors/simple_spec.rb
|
116
|
-
- spec/selectors/n_gram_spec.rb
|
117
|
-
- spec/selectors/simple_spec.rb
|
118
|
-
- spec/selectors/with_binary_encoding_spec.rb
|
119
115
|
- spec/spec_helper.rb
|
120
116
|
- spec/support/preprocessor_spec.rb
|
121
117
|
- spec/support/selector_spec.rb
|
118
|
+
- spec/svm_helper/parallel_helper_spec.rb
|
119
|
+
- spec/svm_helper/preprocessors/id_mapping_spec.rb
|
120
|
+
- spec/svm_helper/preprocessors/simple_spec.rb
|
121
|
+
- spec/svm_helper/preprocessors/stemming_spec.rb
|
122
|
+
- spec/svm_helper/selectors/bi_normal_seperation_spec.rb
|
123
|
+
- spec/svm_helper/selectors/bns_ig_spec.rb
|
124
|
+
- spec/svm_helper/selectors/calc_spec.rb
|
125
|
+
- spec/svm_helper/selectors/information_gain_spec.rb
|
126
|
+
- spec/svm_helper/selectors/n_gram_spec.rb
|
127
|
+
- spec/svm_helper/selectors/simple_spec.rb
|
128
|
+
- spec/svm_helper/selectors/with_binary_encoding_spec.rb
|
122
129
|
has_rdoc:
|
@@ -1,31 +0,0 @@
|
|
1
|
-
require_relative 'simple'
|
2
|
-
module Selector
|
3
|
-
#
|
4
|
-
# Selector which uses a n-gram dictionary to generate feature vectors
|
5
|
-
#
|
6
|
-
# @author Andreas Eger
|
7
|
-
#
|
8
|
-
class NGram < Selector::Simple
|
9
|
-
attr_reader :gram_size
|
10
|
-
|
11
|
-
def initialize classification, args={}
|
12
|
-
super
|
13
|
-
@gram_size = args.fetch(:gram_size) { 2 }
|
14
|
-
end
|
15
|
-
|
16
|
-
def label
|
17
|
-
"ngram"
|
18
|
-
end
|
19
|
-
#
|
20
|
-
# fetches all words snippets from one data entry, removes stopwords and very short words
|
21
|
-
# @param data [PreprocessedData]
|
22
|
-
# @param gram_size [Integer] gram size
|
23
|
-
#
|
24
|
-
# @return [Array<String>]
|
25
|
-
def extract_words_from_data data, gram_size=@gram_size
|
26
|
-
(data.data.flat_map(&:split) - stopwords)
|
27
|
-
.delete_if { |e| e.size <= 3 }
|
28
|
-
.each_cons(gram_size).map{|e| e.join " " }
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
@@ -1,41 +0,0 @@
|
|
1
|
-
require_relative 'simple'
|
2
|
-
module Selector
|
3
|
-
#
|
4
|
-
# Selector which uses a n-gram dictionary to generate feature vectors
|
5
|
-
#
|
6
|
-
# @author Andreas Eger
|
7
|
-
#
|
8
|
-
class WithBinaryEncoding < Selector::Simple
|
9
|
-
|
10
|
-
CLASSIFICATIONS_SIZE = {
|
11
|
-
function: 8, # max id 255, currently 19
|
12
|
-
industry: 16, # max id 65535, currently 14370
|
13
|
-
career_level: 4 } # max id 15, currently 8
|
14
|
-
|
15
|
-
def initialize *args
|
16
|
-
super
|
17
|
-
end
|
18
|
-
|
19
|
-
def label
|
20
|
-
"simple-WithBinaryEncoding"
|
21
|
-
end
|
22
|
-
|
23
|
-
private
|
24
|
-
#
|
25
|
-
# creates the classification specific part of the feature vector
|
26
|
-
# @param ids [Hash] hash with classification ids
|
27
|
-
#
|
28
|
-
# @return [Array<Integer>] binary encoded classification id
|
29
|
-
def classification_array(id)
|
30
|
-
number_to_binary_array(id, CLASSIFICATIONS_SIZE[@classification])
|
31
|
-
end
|
32
|
-
|
33
|
-
def number_to_binary_array(number, size=8)
|
34
|
-
a=[]
|
35
|
-
(size-1).downto(0) do |i|
|
36
|
-
a<<number[i]
|
37
|
-
end
|
38
|
-
a
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|