svm_helper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.rspec +3 -0
- data/.ruby-version +1 -0
- data/.travis.yml +9 -0
- data/.versions.conf +4 -0
- data/.yardopts +3 -0
- data/Gemfile +24 -0
- data/Guardfile +17 -0
- data/LICENSE.txt +22 -0
- data/README.md +41 -0
- data/Rakefile +7 -0
- data/lib/svm_helper.rb +8 -0
- data/lib/svm_helper/feature_vector.rb +17 -0
- data/lib/svm_helper/interface_helper.rb +57 -0
- data/lib/svm_helper/preprocessed_data.rb +17 -0
- data/lib/svm_helper/preprocessors.rb +2 -0
- data/lib/svm_helper/preprocessors/simple.rb +111 -0
- data/lib/svm_helper/preprocessors/with_industry_map.rb +40 -0
- data/lib/svm_helper/selectors.rb +3 -0
- data/lib/svm_helper/selectors/n_gram.rb +31 -0
- data/lib/svm_helper/selectors/simple.rb +163 -0
- data/lib/svm_helper/selectors/with_binary_encoding.rb +42 -0
- data/lib/svm_helper/stopwords/de +127 -0
- data/lib/svm_helper/stopwords/en +119 -0
- data/lib/svm_helper/version.rb +3 -0
- data/spec/factories.rb +35 -0
- data/spec/factories/jobs/tmp.html +42 -0
- data/spec/factories/jobs/tmp2.html +20 -0
- data/spec/factories/jobs/tmp3.html +34 -0
- data/spec/factories/jobs_with_description.rb +20 -0
- data/spec/factories/jobs_with_title.rb +72 -0
- data/spec/preprocessors/simple_spec.rb +138 -0
- data/spec/preprocessors/with_industry_map_spec.rb +16 -0
- data/spec/selectors/n_gram_spec.rb +21 -0
- data/spec/selectors/simple_spec.rb +121 -0
- data/spec/selectors/with_binary_encoding_spec.rb +39 -0
- data/spec/spec_helper.rb +14 -0
- data/spec/support/preprocessor_spec.rb +21 -0
- data/spec/support/selector_spec.rb +21 -0
- data/svm_helper.gemspec +21 -0
- metadata +112 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Selector
  #
  # Selector that works on word n-grams: consecutive words of a data entry
  # are joined into one snippet, and those snippets take the place of the
  # single words used by Selector::Simple.
  #
  # @author Andreas Eger
  #
  class NGram < Selector::Simple
    attr_reader :gram_size

    #
    # @param args [Hash] options; `:gram_size` sets the number of words per
    #   snippet (2 when omitted), the hash is also handed to the superclass
    #
    def initialize(args = {})
      super
      @gram_size = args.fetch(:gram_size, 2)
    end

    # @return [String] short name of this selector
    def label
      'ngram'
    end

    #
    # collects the n-gram snippets of one data entry; stopwords and words of
    # three characters or less are dropped before the snippets are formed
    # @param data [PreprocessedData]
    # @param gram_size [Integer] number of consecutive words per snippet
    #
    # @return [Array<String>] snippets, words separated by a single space
    def extract_words_from_data(data, gram_size = @gram_size)
      tokens = data.data.flat_map(&:split) - stopwords
      long_tokens = tokens.reject { |token| token.size <= 3 }
      long_tokens.each_cons(gram_size).map { |gram| gram.join(' ') }
    end
  end
end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
require 'set'

module Selector
  #
  # Selector which uses a simple dictionary to generate feature vectors
  #
  # @author Andreas Eger
  #
  class Simple
    # worker count handed to Parallel, read from OMP_NUM_THREADS (2 if unset)
    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
    # folder holding the stopword files, one file per language code
    STOPWORD_LOCATION = File.expand_path(File.join('..', 'stopwords'), File.dirname(__FILE__))
    # default dictionary size
    DEFAULT_DICTIONARY_SIZE = 800

    # length of the classification array per classification; taken from the
    # Pjpp models when that constant is loaded, otherwise from fixed defaults
    CLASSIFICATIONS_SIZE = if defined?(Pjpp) == 'constant'
                             { function: Pjpp::Function.count,
                               industry: Pjpp::Industry.count,
                               career_level: Pjpp::CareerLevel.count }
                           else
                             { function: 19,       # 1..19
                               industry: 632,      # 1..14370 but not all ids used
                               career_level: 8 }   # 1..8
                           end

    attr_accessor :global_dictionary

    #
    # @param args [Hash] options: `:global_dictionary` (default `[]`),
    #   `:language` (stopword file name, default `'en'`) and
    #   `:parallel` (build vectors via Parallel, default `false`)
    #
    def initialize(args = {})
      @global_dictionary = args.fetch(:global_dictionary) { [] }
      @language = args.fetch(:language) { 'en' }
      @parallel = args.fetch(:parallel) { false }
    end

    # @return [String] short name of this selector
    def label
      'simple'
    end

    #
    # generates a list of feature vectors and their labels from preprocessed data
    # @param data_set [Array<PreprocessedData>] list of preprocessed data
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary_size [Integer] Size of a dictionary to create if none exists
    #
    # @return [Array<FeatureVector>] list of feature vectors and labels
    def generate_vectors(data_set, classification = :function, dictionary_size = DEFAULT_DICTIONARY_SIZE)
      words_per_data = extract_words(data_set)
      generate_global_dictionary(words_per_data, dictionary_size)

      make_vectors(words_per_data) do |words, index|
        # a Set keeps the per-dictionary-word lookup cheap, as in #generate_vector
        make_vector(Set.new(words), data_set[index], classification)
      end
    end

    #
    # generates a feature vector with its label
    # @param data [PreprocessedData]
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary [Array] dictionary to use for this selection
    #
    # @return [FeatureVector]
    def generate_vector(data, classification = :function, dictionary = global_dictionary)
      word_set = Set.new(extract_words_from_data(data))
      make_vector(word_set, data, classification, dictionary)
    end

    #
    # loads a txt file with stop words; the file is named after the language
    # given to the constructor. The list is memoized on first use, so a
    # different `location` passed on a later call is ignored.
    # @param location String folder with stopword lists
    #
    # @return [Array<String>] Array of stopwords
    def stopwords(location = STOPWORD_LOCATION)
      @stopwords ||= IO.read(File.join(location, @language)).split
    end

    #
    # generates a list of words used as dictionary, most frequent word first;
    # an already present (non-empty) dictionary is kept and returned unchanged
    # @param all_words (see #extract_words)
    # @param size dictionary size
    #
    # @return [Array<String>] list of words
    def generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE)
      return global_dictionary unless global_dictionary.empty?

      # groups of identical words, least frequent group first
      by_frequency = all_words.flatten.group_by { |word| word }.values.sort_by(&:size)
      @global_dictionary = by_frequency.last(size).map(&:first).reverse
    end

    #
    # extracts the words of all provided data entries
    # @param data_set [Array<PreprocessedData>] list of preprocessed data
    #
    # @return [Array<Array<String>>] list of words per data entry
    def extract_words(data_set)
      data_set.map { |data| extract_words_from_data(data) }
    end

    #
    # fetches all words from one data entry, removes stopwords and very short words
    # (three characters or less)
    # @param data [PreprocessedData] preprocessed data entry
    #
    # @return [Array<String>] list of words
    def extract_words_from_data(data)
      (data.data.flat_map(&:split) - stopwords).reject { |word| word.size <= 3 }
    end

    #
    # empties the dictionary so the next #generate_vectors call builds a new one
    #
    # @return [Array] the new, empty dictionary
    def reset
      @global_dictionary = []
    end

    private

    #
    # creates a feature vector for the given words, classification and dictionary
    # also adds the label
    # @param words [Array<String>, Set<String>] words of the data entry
    # @param data [PreprocessedData]
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary [Array<String>] words to test for, one vector slot each
    #
    # @return [FeatureVector]
    def make_vector(words, data, classification, dictionary = global_dictionary)
      FeatureVector.new(
        word_data: dictionary.map { |dic_word| words.include?(dic_word) ? 1 : 0 },
        classification_arrays: {
          function: classification_array(data.ids, :function),
          industry: classification_array(data.ids, :industry),
          career_level: classification_array(data.ids, :career_level)
        },
        labels: {
          function: data.labels[:function] ? 1 : 0,
          industry: data.labels[:industry] ? 1 : 0,
          career_level: data.labels[:career_level] ? 1 : 0
        }
      ).tap { |vector| vector.public_send("#{classification}!") }
    end

    #
    # maps every entry of `data` (together with its index) through the block;
    # goes through Parallel when the selector was created with `parallel: true`,
    # using threads on JRuby and processes on every other platform
    # @param data [Array] entries to convert
    #
    # @return [Array] block results
    def make_vectors(data, &block)
      return data.map.with_index(&block) unless @parallel

      mode = RUBY_PLATFORM == 'java' ? :in_threads : :in_processes
      Parallel.map_with_index(data, mode => THREAD_COUNT, &block)
    end

    #
    # creates the classification specific part of the feature vector
    # @param ids [Hash] hash with classification ids
    # @param classification [Symbol] key into `ids` and CLASSIFICATIONS_SIZE
    #
    # @return [Array<Integer>] list of size=count(classification_ids) with only one not zero item
    def classification_array(ids, classification)
      id = ids[classification]
      # ids start at 1, array slots at 0
      Array.new(CLASSIFICATIONS_SIZE[classification]) { |n| n == (id - 1) ? 1 : 0 }
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Selector
  #
  # Selector that behaves like Selector::Simple but represents every
  # classification id by its binary digits instead of a one-slot-per-id array
  #
  # @author Andreas Eger
  #
  class WithBinaryEncoding < Selector::Simple
    # number of bits reserved per classification
    CLASSIFICATIONS_SIZE = {
      function: 8,     # max id 255, currently 19
      industry: 16,    # max id 65535, currently 14370
      career_level: 4  # max id 15, currently 8
    }

    # @param args [Hash] options, passed on unchanged to Selector::Simple
    def initialize(args = {})
      super(args)
    end

    # @return [String] short name of this selector
    def label
      'simple-WithBinaryEncoding'
    end

    private

    #
    # creates the classification specific part of the feature vector
    # @param ids [Hash] hash with classification ids
    # @param classification [Symbol] key into `ids` and CLASSIFICATIONS_SIZE
    #
    # @return [Array<Integer>] binary encoded classification id
    def classification_array(ids, classification)
      bit_count = CLASSIFICATIONS_SIZE[classification]
      number_to_binary_array(ids[classification], bit_count)
    end

    #
    # lists the lowest `size` bits of `number`, most significant bit first
    # @param number [Integer] value to encode
    # @param size [Integer] number of bits to emit
    #
    # @return [Array<Integer>] bits as 0/1
    def number_to_binary_array(number, size = 8)
      (size - 1).downto(0).map { |bit| number[bit] }
    end
  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
aber
|
2
|
+
als
|
3
|
+
am
|
4
|
+
an
|
5
|
+
auch
|
6
|
+
auf
|
7
|
+
aus
|
8
|
+
bei
|
9
|
+
bin
|
10
|
+
bis
|
11
|
+
bist
|
12
|
+
da
|
13
|
+
dadurch
|
14
|
+
daher
|
15
|
+
darum
|
16
|
+
das
|
17
|
+
daß
|
18
|
+
dass
|
19
|
+
dein
|
20
|
+
deine
|
21
|
+
dem
|
22
|
+
den
|
23
|
+
der
|
24
|
+
des
|
25
|
+
dessen
|
26
|
+
deshalb
|
27
|
+
die
|
28
|
+
dies
|
29
|
+
dieser
|
30
|
+
dieses
|
31
|
+
doch
|
32
|
+
dort
|
33
|
+
du
|
34
|
+
durch
|
35
|
+
ein
|
36
|
+
eine
|
37
|
+
einem
|
38
|
+
einen
|
39
|
+
einer
|
40
|
+
eines
|
41
|
+
er
|
42
|
+
es
|
43
|
+
euer
|
44
|
+
eure
|
45
|
+
für
|
46
|
+
hatte
|
47
|
+
hatten
|
48
|
+
hattest
|
49
|
+
hattet
|
50
|
+
hier hinter
|
51
|
+
ich
|
52
|
+
ihr
|
53
|
+
ihre
|
54
|
+
im
|
55
|
+
in
|
56
|
+
ist
|
57
|
+
ja
|
58
|
+
jede
|
59
|
+
jedem
|
60
|
+
jeden
|
61
|
+
jeder
|
62
|
+
jedes
|
63
|
+
jener
|
64
|
+
jenes
|
65
|
+
jetzt
|
66
|
+
kann
|
67
|
+
kannst
|
68
|
+
können
|
69
|
+
könnt
|
70
|
+
machen
|
71
|
+
mein
|
72
|
+
meine
|
73
|
+
mit
|
74
|
+
muß
|
75
|
+
mußt
|
76
|
+
musst
|
77
|
+
müssen
|
78
|
+
müßt
|
79
|
+
nach
|
80
|
+
nachdem
|
81
|
+
nein
|
82
|
+
nicht
|
83
|
+
nun
|
84
|
+
oder
|
85
|
+
seid
|
86
|
+
sein
|
87
|
+
seine
|
88
|
+
sich
|
89
|
+
sie
|
90
|
+
sind
|
91
|
+
soll
|
92
|
+
sollen
|
93
|
+
sollst
|
94
|
+
sollt
|
95
|
+
sonst
|
96
|
+
soweit
|
97
|
+
sowie
|
98
|
+
und
|
99
|
+
unser unsere
|
100
|
+
unter
|
101
|
+
vom
|
102
|
+
von
|
103
|
+
vor
|
104
|
+
wann
|
105
|
+
warum
|
106
|
+
was
|
107
|
+
weiter
|
108
|
+
weitere
|
109
|
+
wenn
|
110
|
+
wer
|
111
|
+
werde
|
112
|
+
werden
|
113
|
+
werdet
|
114
|
+
weshalb
|
115
|
+
wie
|
116
|
+
wieder
|
117
|
+
wieso
|
118
|
+
wir
|
119
|
+
wird
|
120
|
+
wirst
|
121
|
+
wo
|
122
|
+
woher
|
123
|
+
wohin
|
124
|
+
zu
|
125
|
+
zum
|
126
|
+
zur
|
127
|
+
über
|
@@ -0,0 +1,119 @@
|
|
1
|
+
a
|
2
|
+
able
|
3
|
+
about
|
4
|
+
across
|
5
|
+
after
|
6
|
+
all
|
7
|
+
almost
|
8
|
+
also
|
9
|
+
am
|
10
|
+
among
|
11
|
+
an
|
12
|
+
and
|
13
|
+
any
|
14
|
+
are
|
15
|
+
as
|
16
|
+
at
|
17
|
+
be
|
18
|
+
because
|
19
|
+
been
|
20
|
+
but
|
21
|
+
by
|
22
|
+
can
|
23
|
+
cannot
|
24
|
+
could
|
25
|
+
dear
|
26
|
+
did
|
27
|
+
do
|
28
|
+
does
|
29
|
+
either
|
30
|
+
else
|
31
|
+
ever
|
32
|
+
every
|
33
|
+
for
|
34
|
+
from
|
35
|
+
get
|
36
|
+
got
|
37
|
+
had
|
38
|
+
has
|
39
|
+
have
|
40
|
+
he
|
41
|
+
her
|
42
|
+
hers
|
43
|
+
him
|
44
|
+
his
|
45
|
+
how
|
46
|
+
however
|
47
|
+
i
|
48
|
+
if
|
49
|
+
in
|
50
|
+
into
|
51
|
+
is
|
52
|
+
it
|
53
|
+
its
|
54
|
+
just
|
55
|
+
least
|
56
|
+
let
|
57
|
+
like
|
58
|
+
likely
|
59
|
+
may
|
60
|
+
me
|
61
|
+
might
|
62
|
+
most
|
63
|
+
must
|
64
|
+
my
|
65
|
+
neither
|
66
|
+
no
|
67
|
+
nor
|
68
|
+
not
|
69
|
+
of
|
70
|
+
off
|
71
|
+
often
|
72
|
+
on
|
73
|
+
only
|
74
|
+
or
|
75
|
+
other
|
76
|
+
our
|
77
|
+
own
|
78
|
+
rather
|
79
|
+
said
|
80
|
+
say
|
81
|
+
says
|
82
|
+
she
|
83
|
+
should
|
84
|
+
since
|
85
|
+
so
|
86
|
+
some
|
87
|
+
than
|
88
|
+
that
|
89
|
+
the
|
90
|
+
their
|
91
|
+
them
|
92
|
+
then
|
93
|
+
there
|
94
|
+
these
|
95
|
+
they
|
96
|
+
this
|
97
|
+
tis
|
98
|
+
to
|
99
|
+
too
|
100
|
+
twas
|
101
|
+
us
|
102
|
+
wants
|
103
|
+
was
|
104
|
+
we
|
105
|
+
were
|
106
|
+
what
|
107
|
+
when
|
108
|
+
where
|
109
|
+
which
|
110
|
+
while
|
111
|
+
who
|
112
|
+
whom
|
113
|
+
why
|
114
|
+
will
|
115
|
+
with
|
116
|
+
would
|
117
|
+
yet
|
118
|
+
you
|
119
|
+
your
|