svm_helper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.rspec +3 -0
- data/.ruby-version +1 -0
- data/.travis.yml +9 -0
- data/.versions.conf +4 -0
- data/.yardopts +3 -0
- data/Gemfile +24 -0
- data/Guardfile +17 -0
- data/LICENSE.txt +22 -0
- data/README.md +41 -0
- data/Rakefile +7 -0
- data/lib/svm_helper.rb +8 -0
- data/lib/svm_helper/feature_vector.rb +17 -0
- data/lib/svm_helper/interface_helper.rb +57 -0
- data/lib/svm_helper/preprocessed_data.rb +17 -0
- data/lib/svm_helper/preprocessors.rb +2 -0
- data/lib/svm_helper/preprocessors/simple.rb +111 -0
- data/lib/svm_helper/preprocessors/with_industry_map.rb +40 -0
- data/lib/svm_helper/selectors.rb +3 -0
- data/lib/svm_helper/selectors/n_gram.rb +31 -0
- data/lib/svm_helper/selectors/simple.rb +163 -0
- data/lib/svm_helper/selectors/with_binary_encoding.rb +42 -0
- data/lib/svm_helper/stopwords/de +127 -0
- data/lib/svm_helper/stopwords/en +119 -0
- data/lib/svm_helper/version.rb +3 -0
- data/spec/factories.rb +35 -0
- data/spec/factories/jobs/tmp.html +42 -0
- data/spec/factories/jobs/tmp2.html +20 -0
- data/spec/factories/jobs/tmp3.html +34 -0
- data/spec/factories/jobs_with_description.rb +20 -0
- data/spec/factories/jobs_with_title.rb +72 -0
- data/spec/preprocessors/simple_spec.rb +138 -0
- data/spec/preprocessors/with_industry_map_spec.rb +16 -0
- data/spec/selectors/n_gram_spec.rb +21 -0
- data/spec/selectors/simple_spec.rb +121 -0
- data/spec/selectors/with_binary_encoding_spec.rb +39 -0
- data/spec/spec_helper.rb +14 -0
- data/spec/support/preprocessor_spec.rb +21 -0
- data/spec/support/selector_spec.rb +21 -0
- data/svm_helper.gemspec +21 -0
- metadata +112 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Selector
  #
  # Selector which uses an n-gram dictionary to generate feature vectors.
  # Only the word extraction step differs from Selector::Simple: the filtered
  # words of an entry are combined into runs of `gram_size` consecutive words.
  #
  # @author Andreas Eger
  #
  class NGram < Selector::Simple
    # @return [Integer] number of consecutive words joined into one n-gram
    attr_reader :gram_size

    #
    # @param args [Hash] options; the hash is also passed on unchanged to
    #   Selector::Simple#initialize
    # @option args [Integer] :gram_size (2) number of words per n-gram
    #
    def initialize(args = {})
      super
      @gram_size = args.fetch(:gram_size) { 2 }
    end

    #
    # @return [String] name of this selector
    #
    def label
      'ngram'
    end

    #
    # fetches all n-grams from one data entry. Stopwords and words of three
    # characters or fewer are removed first (by the inherited implementation),
    # then every run of `gram_size` consecutive remaining words is joined
    # with a single space.
    # @param data [PreprocessedData]
    # @param gram_size [Integer] gram size, defaults to the configured one
    #
    # @return [Array<String>] n-grams; empty when fewer than `gram_size`
    #   words remain
    def extract_words_from_data(data, gram_size = @gram_size)
      # explicit argument list: the parent method only accepts the data entry
      super(data).each_cons(gram_size).map { |gram| gram.join(' ') }
    end
  end
end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
require 'set'

module Selector
  #
  # Selector which uses a simple dictionary to generate feature vectors
  #
  # @author Andreas Eger
  #
  class Simple
    # number of threads/processes used when running in parallel
    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
    # folder containing one stopword file per language (file name == language)
    STOPWORD_LOCATION = File.expand_path(File.join('..', 'stopwords'), File.dirname(__FILE__))
    # default dictionary size
    DEFAULT_DICTIONARY_SIZE = 800

    # length of the classification array per classification; taken from the
    # Pjpp models when that constant is loaded, otherwise from fixed values
    CLASSIFICATIONS_SIZE = if defined?(Pjpp)
                             { function: Pjpp::Function.count,
                               industry: Pjpp::Industry.count,
                               career_level: Pjpp::CareerLevel.count }
                           else
                             { function: 19, # 1..19
                               industry: 632, # 1..14370 but not all ids used
                               career_level: 8 } # 1..8
                           end.freeze

    attr_accessor :global_dictionary

    #
    # @param args [Hash] options
    # @option args [Array<String>] :global_dictionary ([]) dictionary to use
    # @option args [String] :language ('en') name of the stopword file
    # @option args [Boolean] :parallel (false) build vectors via Parallel
    #
    def initialize(args = {})
      @global_dictionary = args.fetch(:global_dictionary) { [] }
      @language = args.fetch(:language) { 'en' }
      @parallel = args.fetch(:parallel) { false }
    end

    #
    # @return [String] name of this selector
    #
    def label
      'simple'
    end

    #
    # generates a list of feature vectors and their labels from preprocessed data
    # @param data_set [Array<PreprocessedData>] list of preprocessed data
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary_size [Integer] Size of a dictionary to create if none exists
    #
    # @return [Array<FeatureVector>] list of feature vectors and labels
    def generate_vectors(data_set, classification = :function, dictionary_size = DEFAULT_DICTIONARY_SIZE)
      words_per_data = extract_words(data_set)
      generate_global_dictionary(words_per_data, dictionary_size)

      make_vectors(words_per_data) do |words, index|
        make_vector(words, data_set[index], classification)
      end
    end

    #
    # generates a feature vector with its label
    # @param data [PreprocessedData]
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary [Array] dictionary to use for this selection
    #
    # @return [FeatureVector]
    def generate_vector(data, classification = :function, dictionary = global_dictionary)
      make_vector(extract_words_from_data(data), data, classification, dictionary)
    end

    #
    # loads a txt file with stop words. The list is read once and memoized, so
    # later calls return the cached list even when another location is given.
    # @param location [String] folder with stopword lists
    #
    # @return [Array<String>] Array of stopwords
    def stopwords(location = STOPWORD_LOCATION)
      @stopwords ||= File.read(File.join(location, @language)).split
    end

    #
    # generates a list of words used as dictionary: the `size` most frequent
    # words, most frequent first; equally frequent words keep the order of
    # their first occurrence. Does nothing if a dictionary already exists.
    # @param all_words (see #extract_words)
    # @param size dictionary size
    #
    # @return [Array<String>] list of words (the existing dictionary if one
    #   was already present)
    def generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE)
      return global_dictionary unless global_dictionary.empty?

      counts = Hash.new(0)
      all_words.flatten.each { |word| counts[word] += 1 }
      # hashes iterate in insertion order, so the index is the first-seen rank
      ranked = counts.each_with_index.sort_by { |(_word, count), position| [-count, position] }
      @global_dictionary = ranked.first(size).map { |(word, _count), _position| word }
    end

    #
    # extracts the words of all provided data entries
    # @param data_set [Array<PreprocessedData>] list of preprocessed data
    #
    # @return [Array<Array<String>>] list of words per data entry
    def extract_words(data_set)
      data_set.map { |data| extract_words_from_data(data) }
    end

    #
    # fetches all words from one data entry, removes stopwords and words of
    # three characters or fewer
    # @param data [PreprocessedData] preprocessed data entry
    #
    # @return [Array<String>] list of words
    def extract_words_from_data(data)
      (data.data.flat_map(&:split) - stopwords).reject { |word| word.size <= 3 }
    end

    #
    # discards the current dictionary so the next #generate_vectors call
    # builds a new one
    #
    # @return [Array] the new, empty dictionary
    def reset
      @global_dictionary = []
    end

    private

    #
    # creates a feature vector for the given words, classification and dictionary
    # also adds the label
    # @param words [Array<String>, Set<String>] list of words
    # @param data [PreprocessedData]
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary [Array<String>] words that make up the vector positions
    #
    # @return [FeatureVector]
    def make_vector(words, data, classification, dictionary = global_dictionary)
      present = words.to_set # one conversion, constant-time lookups below
      FeatureVector.new(
        word_data: dictionary.map { |dic_word| present.include?(dic_word) ? 1 : 0 },
        classification_arrays: {
          function: classification_array(data.ids, :function),
          industry: classification_array(data.ids, :industry),
          career_level: classification_array(data.ids, :career_level)
        },
        labels: {
          function: data.labels[:function] ? 1 : 0,
          industry: data.labels[:industry] ? 1 : 0,
          career_level: data.labels[:career_level] ? 1 : 0
        }
        # NOTE(review): presumably `<classification>!` selects the active
        # classification on the vector -- confirm in FeatureVector
      ).tap { |vector| vector.send("#{classification}!") }
    end

    #
    # maps every entry (with its index) through the given block, optionally
    # in parallel: threads on JRuby, processes on every other platform
    # @param data [Array] entries to convert
    # @yield [entry, index]
    #
    # @return [Array] block results, one per entry
    def make_vectors(data)
      return data.map.with_index { |entry, index| yield entry, index } unless @parallel

      mode = RUBY_PLATFORM == 'java' ? :in_threads : :in_processes
      Parallel.map_with_index(data, mode => THREAD_COUNT) { |entry, index| yield entry, index }
    end

    #
    # creates the classification specific part of the feature vector
    # @param ids [Hash] hash with classification ids
    # @param classification [Symbol] key into `ids` and CLASSIFICATIONS_SIZE
    #
    # @return [Array<Integer>] list of size=count(classification_ids) with a 1
    #   at position `id - 1`; all zeros when the id lies outside 1..size
    def classification_array(ids, classification)
      id = ids[classification]
      Array.new(CLASSIFICATIONS_SIZE[classification]) { |n| n == (id - 1) ? 1 : 0 }
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Selector
  #
  # Selector which behaves like Selector::Simple, except that each
  # classification id is encoded as a fixed-width binary number instead of an
  # array with a single 1 at the id's position
  #
  # @author Andreas Eger
  #
  class WithBinaryEncoding < Selector::Simple
    # number of bits used to encode the id of each classification
    CLASSIFICATIONS_SIZE = {
      function: 8,     # max id 255, currently 19
      industry: 16,    # max id 65535, currently 14370
      career_level: 4  # max id 15, currently 8
    }.freeze

    #
    # @return [String] name of this selector
    #
    def label
      'simple-WithBinaryEncoding'
    end

    private

    #
    # creates the classification specific part of the feature vector
    # @param ids [Hash] hash with classification ids
    # @param classification [Symbol] key into `ids` and CLASSIFICATIONS_SIZE
    #
    # @return [Array<Integer>] binary encoded classification id
    def classification_array(ids, classification)
      number_to_binary_array(ids[classification], CLASSIFICATIONS_SIZE[classification])
    end

    #
    # converts a number into its bits, most significant bit first
    # @param number [Integer] number to encode
    # @param size [Integer] number of bits in the result; higher bits of
    #   `number` are dropped
    #
    # @return [Array<Integer>] `size` entries, each 0 or 1
    def number_to_binary_array(number, size = 8)
      # Integer#[] returns the bit at the given position (0 = least significant)
      Array.new(size) { |position| number[size - 1 - position] }
    end
  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
aber
|
2
|
+
als
|
3
|
+
am
|
4
|
+
an
|
5
|
+
auch
|
6
|
+
auf
|
7
|
+
aus
|
8
|
+
bei
|
9
|
+
bin
|
10
|
+
bis
|
11
|
+
bist
|
12
|
+
da
|
13
|
+
dadurch
|
14
|
+
daher
|
15
|
+
darum
|
16
|
+
das
|
17
|
+
daß
|
18
|
+
dass
|
19
|
+
dein
|
20
|
+
deine
|
21
|
+
dem
|
22
|
+
den
|
23
|
+
der
|
24
|
+
des
|
25
|
+
dessen
|
26
|
+
deshalb
|
27
|
+
die
|
28
|
+
dies
|
29
|
+
dieser
|
30
|
+
dieses
|
31
|
+
doch
|
32
|
+
dort
|
33
|
+
du
|
34
|
+
durch
|
35
|
+
ein
|
36
|
+
eine
|
37
|
+
einem
|
38
|
+
einen
|
39
|
+
einer
|
40
|
+
eines
|
41
|
+
er
|
42
|
+
es
|
43
|
+
euer
|
44
|
+
eure
|
45
|
+
für
|
46
|
+
hatte
|
47
|
+
hatten
|
48
|
+
hattest
|
49
|
+
hattet
|
50
|
+
hier hinter
|
51
|
+
ich
|
52
|
+
ihr
|
53
|
+
ihre
|
54
|
+
im
|
55
|
+
in
|
56
|
+
ist
|
57
|
+
ja
|
58
|
+
jede
|
59
|
+
jedem
|
60
|
+
jeden
|
61
|
+
jeder
|
62
|
+
jedes
|
63
|
+
jener
|
64
|
+
jenes
|
65
|
+
jetzt
|
66
|
+
kann
|
67
|
+
kannst
|
68
|
+
können
|
69
|
+
könnt
|
70
|
+
machen
|
71
|
+
mein
|
72
|
+
meine
|
73
|
+
mit
|
74
|
+
muß
|
75
|
+
mußt
|
76
|
+
musst
|
77
|
+
müssen
|
78
|
+
müßt
|
79
|
+
nach
|
80
|
+
nachdem
|
81
|
+
nein
|
82
|
+
nicht
|
83
|
+
nun
|
84
|
+
oder
|
85
|
+
seid
|
86
|
+
sein
|
87
|
+
seine
|
88
|
+
sich
|
89
|
+
sie
|
90
|
+
sind
|
91
|
+
soll
|
92
|
+
sollen
|
93
|
+
sollst
|
94
|
+
sollt
|
95
|
+
sonst
|
96
|
+
soweit
|
97
|
+
sowie
|
98
|
+
und
|
99
|
+
unser unsere
|
100
|
+
unter
|
101
|
+
vom
|
102
|
+
von
|
103
|
+
vor
|
104
|
+
wann
|
105
|
+
warum
|
106
|
+
was
|
107
|
+
weiter
|
108
|
+
weitere
|
109
|
+
wenn
|
110
|
+
wer
|
111
|
+
werde
|
112
|
+
werden
|
113
|
+
werdet
|
114
|
+
weshalb
|
115
|
+
wie
|
116
|
+
wieder
|
117
|
+
wieso
|
118
|
+
wir
|
119
|
+
wird
|
120
|
+
wirst
|
121
|
+
wo
|
122
|
+
woher
|
123
|
+
wohin
|
124
|
+
zu
|
125
|
+
zum
|
126
|
+
zur
|
127
|
+
über
|
@@ -0,0 +1,119 @@
|
|
1
|
+
a
|
2
|
+
able
|
3
|
+
about
|
4
|
+
across
|
5
|
+
after
|
6
|
+
all
|
7
|
+
almost
|
8
|
+
also
|
9
|
+
am
|
10
|
+
among
|
11
|
+
an
|
12
|
+
and
|
13
|
+
any
|
14
|
+
are
|
15
|
+
as
|
16
|
+
at
|
17
|
+
be
|
18
|
+
because
|
19
|
+
been
|
20
|
+
but
|
21
|
+
by
|
22
|
+
can
|
23
|
+
cannot
|
24
|
+
could
|
25
|
+
dear
|
26
|
+
did
|
27
|
+
do
|
28
|
+
does
|
29
|
+
either
|
30
|
+
else
|
31
|
+
ever
|
32
|
+
every
|
33
|
+
for
|
34
|
+
from
|
35
|
+
get
|
36
|
+
got
|
37
|
+
had
|
38
|
+
has
|
39
|
+
have
|
40
|
+
he
|
41
|
+
her
|
42
|
+
hers
|
43
|
+
him
|
44
|
+
his
|
45
|
+
how
|
46
|
+
however
|
47
|
+
i
|
48
|
+
if
|
49
|
+
in
|
50
|
+
into
|
51
|
+
is
|
52
|
+
it
|
53
|
+
its
|
54
|
+
just
|
55
|
+
least
|
56
|
+
let
|
57
|
+
like
|
58
|
+
likely
|
59
|
+
may
|
60
|
+
me
|
61
|
+
might
|
62
|
+
most
|
63
|
+
must
|
64
|
+
my
|
65
|
+
neither
|
66
|
+
no
|
67
|
+
nor
|
68
|
+
not
|
69
|
+
of
|
70
|
+
off
|
71
|
+
often
|
72
|
+
on
|
73
|
+
only
|
74
|
+
or
|
75
|
+
other
|
76
|
+
our
|
77
|
+
own
|
78
|
+
rather
|
79
|
+
said
|
80
|
+
say
|
81
|
+
says
|
82
|
+
she
|
83
|
+
should
|
84
|
+
since
|
85
|
+
so
|
86
|
+
some
|
87
|
+
than
|
88
|
+
that
|
89
|
+
the
|
90
|
+
their
|
91
|
+
them
|
92
|
+
then
|
93
|
+
there
|
94
|
+
these
|
95
|
+
they
|
96
|
+
this
|
97
|
+
tis
|
98
|
+
to
|
99
|
+
too
|
100
|
+
twas
|
101
|
+
us
|
102
|
+
wants
|
103
|
+
was
|
104
|
+
we
|
105
|
+
were
|
106
|
+
what
|
107
|
+
when
|
108
|
+
where
|
109
|
+
which
|
110
|
+
while
|
111
|
+
who
|
112
|
+
whom
|
113
|
+
why
|
114
|
+
will
|
115
|
+
with
|
116
|
+
would
|
117
|
+
yet
|
118
|
+
you
|
119
|
+
your
|