svm_helper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.rspec +3 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +9 -0
  6. data/.versions.conf +4 -0
  7. data/.yardopts +3 -0
  8. data/Gemfile +24 -0
  9. data/Guardfile +17 -0
  10. data/LICENSE.txt +22 -0
  11. data/README.md +41 -0
  12. data/Rakefile +7 -0
  13. data/lib/svm_helper.rb +8 -0
  14. data/lib/svm_helper/feature_vector.rb +17 -0
  15. data/lib/svm_helper/interface_helper.rb +57 -0
  16. data/lib/svm_helper/preprocessed_data.rb +17 -0
  17. data/lib/svm_helper/preprocessors.rb +2 -0
  18. data/lib/svm_helper/preprocessors/simple.rb +111 -0
  19. data/lib/svm_helper/preprocessors/with_industry_map.rb +40 -0
  20. data/lib/svm_helper/selectors.rb +3 -0
  21. data/lib/svm_helper/selectors/n_gram.rb +31 -0
  22. data/lib/svm_helper/selectors/simple.rb +163 -0
  23. data/lib/svm_helper/selectors/with_binary_encoding.rb +42 -0
  24. data/lib/svm_helper/stopwords/de +127 -0
  25. data/lib/svm_helper/stopwords/en +119 -0
  26. data/lib/svm_helper/version.rb +3 -0
  27. data/spec/factories.rb +35 -0
  28. data/spec/factories/jobs/tmp.html +42 -0
  29. data/spec/factories/jobs/tmp2.html +20 -0
  30. data/spec/factories/jobs/tmp3.html +34 -0
  31. data/spec/factories/jobs_with_description.rb +20 -0
  32. data/spec/factories/jobs_with_title.rb +72 -0
  33. data/spec/preprocessors/simple_spec.rb +138 -0
  34. data/spec/preprocessors/with_industry_map_spec.rb +16 -0
  35. data/spec/selectors/n_gram_spec.rb +21 -0
  36. data/spec/selectors/simple_spec.rb +121 -0
  37. data/spec/selectors/with_binary_encoding_spec.rb +39 -0
  38. data/spec/spec_helper.rb +14 -0
  39. data/spec/support/preprocessor_spec.rb +21 -0
  40. data/spec/support/selector_spec.rb +21 -0
  41. data/svm_helper.gemspec +21 -0
  42. metadata +112 -0
@@ -0,0 +1,3 @@
1
+ require_relative 'selectors/simple'
2
+ require_relative 'selectors/n_gram'
3
+ require_relative 'selectors/with_binary_encoding'
@@ -0,0 +1,31 @@
1
+ require_relative 'simple'
2
module Selector
  #
  # Selector which uses an n-gram dictionary to generate feature vectors
  #
  # @author Andreas Eger
  #
  class NGram < Selector::Simple
    # @return [Integer] number of consecutive words joined into one n-gram
    attr_reader :gram_size

    #
    # @param args [Hash] options, passed on unchanged to Selector::Simple
    # @option args [Integer] :gram_size (2) number of words per n-gram
    def initialize(args = {})
      super
      @gram_size = args.fetch(:gram_size) { 2 }
    end

    #
    # @return [String] identifier of this selector
    def label
      "ngram"
    end

    #
    # fetches all word snippets (n-grams) from one data entry
    #
    # The word list comes from Selector::Simple#extract_words_from_data, so
    # stopwords and words of three characters or fewer are already removed
    # before the n-grams are built. Every run of `gram_size` consecutive
    # remaining words is joined with a single space.
    # @param data [PreprocessedData]
    # @param gram_size [Integer] gram size
    #
    # @return [Array<String>] n-grams; empty when fewer than `gram_size` words remain
    def extract_words_from_data(data, gram_size = @gram_size)
      # explicit argument list: the parent method only accepts `data`
      super(data).each_cons(gram_size).map { |gram| gram.join " " }
    end
  end
end
@@ -0,0 +1,163 @@
1
# Set is used for word lookups below; required here so this file does not
# depend on another file having loaded it first.
require 'set'

module Selector
  #
  # Selector which uses a simple dictionary to generate feature vectors
  #
  # @author Andreas Eger
  #
  class Simple
    # number of workers handed to Parallel (threads on JRuby, processes
    # otherwise); taken from OMP_NUM_THREADS, 2 when that variable is unset
    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
    # folder with one stopword file per language, as an absolute path
    STOPWORD_LOCATION = File.expand_path(File.join('..', 'stopwords'), File.dirname(__FILE__))
    # default dictionary size
    DEFAULT_DICTIONARY_SIZE = 800

    # length of the classification array built for each classification;
    # read from the Pjpp models when that constant is defined, otherwise the
    # hard-coded defaults below are used
    CLASSIFICATIONS_SIZE = if defined?(Pjpp) == 'constant'
      { function: Pjpp::Function.count,
        industry: Pjpp::Industry.count,
        career_level: Pjpp::CareerLevel.count }
    else
      { function: 19,       # 1..19
        industry: 632,      # 1..14370 but not all ids used
        career_level: 8 }   # 1..8
    end

    attr_accessor :global_dictionary

    #
    # @param args [Hash] options
    # @option args [Array<String>] :global_dictionary ([]) dictionary to start with
    # @option args [String] :language ('en') name of the stopword file to load
    # @option args [Boolean] :parallel (false) build vectors through Parallel
    def initialize(args = {})
      @global_dictionary = args.fetch(:global_dictionary) { [] }
      @language = args.fetch(:language) { 'en' }
      @parallel = args.fetch(:parallel) { false }
    end

    #
    # @return [String] identifier of this selector
    def label
      "simple"
    end

    #
    # generates a list of feature vectors and their labels from preprocessed data
    # @param data_set [Array<PreprocessedData>] list of preprocessed data
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary_size [Integer] size of the dictionary to create if none exists
    #
    # @return [Array<FeatureVector>] list of feature vectors and labels
    def generate_vectors(data_set, classification = :function, dictionary_size = DEFAULT_DICTIONARY_SIZE)
      words_per_data = extract_words data_set
      generate_global_dictionary words_per_data, dictionary_size

      make_vectors(words_per_data) do |words, index|
        # a Set keeps each dictionary lookup constant-time, as in #generate_vector
        make_vector Set.new(words), data_set[index], classification
      end
    end

    #
    # generates a feature vector with its label
    # @param data [PreprocessedData]
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary [Array] dictionary to use for this selection
    #
    # @return [FeatureVector]
    def generate_vector(data, classification = :function, dictionary = global_dictionary)
      word_set = Set.new extract_words_from_data(data)
      make_vector word_set, data, classification, dictionary
    end

    #
    # loads a txt file with stop words
    #
    # The file read is `<location>/<language>`. The list is memoized on the
    # first call, so a different `location` passed on a later call is ignored.
    # @param location [String] folder with stopword lists
    #
    # @return [Array<String>] Array of stopwords
    def stopwords(location = STOPWORD_LOCATION)
      @stopwords ||= File.read(File.join(location, @language)).split
    end

    #
    # generates a list of words used as dictionary
    #
    # Keeps the `size` most frequent words, most frequent first. Nothing is
    # generated when a dictionary is already present; the existing one is
    # returned instead.
    # @param all_words (see #extract_words)
    # @param size dictionary size
    #
    # @return [Array<String>] list of words
    def generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE)
      return global_dictionary unless global_dictionary.empty?

      counts = all_words.flatten.each_with_object(Hash.new(0)) { |word, acc| acc[word] += 1 }
      # ascending by count, so the most frequent words are at the end
      by_frequency = counts.sort_by { |_word, count| count }
      @global_dictionary = by_frequency.last(size).map(&:first).reverse
    end

    #
    # extracts the words of all provided data entries
    # @param data_set [Array<PreprocessedData>] list of preprocessed data
    #
    # @return [Array<Array<String>>] list of words per data entry
    def extract_words(data_set)
      data_set.map { |data| extract_words_from_data data }
    end

    #
    # fetches all words from one data entry, removes stopwords and very short
    # words (three characters or fewer)
    # @param data [PreprocessedData] preprocessed data entry
    #
    # @return [Array<String>] list of words
    def extract_words_from_data(data)
      (data.data.flat_map(&:split) - stopwords).reject { |word| word.size <= 3 }
    end

    #
    # discards the current dictionary so the next #generate_vectors call
    # builds a new one
    def reset
      @global_dictionary = []
    end

    private

    #
    # creates a feature vector for the given words, classification and dictionary
    # also adds the label
    # @param words [#include?] words of the data entry
    # @param data [PreprocessedData]
    # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @param dictionary [Array<String>] words that make up the vector positions
    #
    # @return [FeatureVector]
    def make_vector(words, data, classification, dictionary = global_dictionary)
      vector = FeatureVector.new(
        # one entry per dictionary word: 1 when the data entry contains it
        word_data: dictionary.map { |dic_word| words.include?(dic_word) ? 1 : 0 },
        classification_arrays: {
          function: classification_array(data.ids, :function),
          industry: classification_array(data.ids, :industry),
          career_level: classification_array(data.ids, :career_level) },
        labels: {
          function: data.labels[:function] ? 1 : 0,
          industry: data.labels[:industry] ? 1 : 0,
          career_level: data.labels[:career_level] ? 1 : 0 }
      )
      # calls `function!`, `industry!` or `career_level!` on the new vector
      vector.send("#{classification}!")
      vector
    end

    #
    # maps every entry together with its index through the given block,
    # using Parallel when the selector was created with `parallel: true`
    # @param data [Array] entries to map
    # @yield [entry, index]
    #
    # @return [Array] block results in input order
    def make_vectors(data, &block)
      if @parallel && RUBY_PLATFORM == 'java'
        Parallel.map_with_index(data, in_threads: THREAD_COUNT, &block)
      elsif @parallel
        Parallel.map_with_index(data, in_processes: THREAD_COUNT, &block)
      else
        data.map.with_index(&block)
      end
    end

    #
    # creates the classification specific part of the feature vector
    # @param ids [Hash] hash with classification ids
    # @param classification [Symbol] key into `ids` and CLASSIFICATIONS_SIZE
    #
    # @return [Array<Integer>] list of size CLASSIFICATIONS_SIZE[classification]
    #   with a 1 at position `id - 1` and 0 elsewhere (all 0 when that
    #   position is outside the list)
    def classification_array(ids, classification)
      id = ids[classification]
      Array.new(CLASSIFICATIONS_SIZE[classification]) { |n| n == (id - 1) ? 1 : 0 }
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require_relative 'simple'
2
module Selector
  #
  # Selector which builds the word part of the feature vector like
  # Selector::Simple but encodes every classification id as a fixed-width
  # binary number
  #
  # @author Andreas Eger
  #
  class WithBinaryEncoding < Selector::Simple

    # number of bits used to encode the id of each classification
    CLASSIFICATIONS_SIZE = {
      function: 8,       # max id 255, currently 19
      industry: 16,      # max id 65535, currently 14370
      career_level: 4 }  # max id 15, currently 8

    #
    # @return [String] identifier of this selector
    def label
      "simple-WithBinaryEncoding"
    end

    private

    #
    # creates the classification specific part of the feature vector
    # @param ids [Hash] hash with classification ids
    # @param classification [Symbol] key into `ids` and CLASSIFICATIONS_SIZE
    #
    # @return [Array<Integer>] binary encoded classification id
    def classification_array(ids, classification)
      number_to_binary_array(ids[classification], CLASSIFICATIONS_SIZE[classification])
    end

    #
    # converts an integer into its bits, most significant bit first
    # @param number [Integer] value to encode
    # @param size [Integer] number of bits to emit; higher bits of `number` are dropped
    #
    # @return [Array<Integer>] `size` entries, each 0 or 1
    def number_to_binary_array(number, size = 8)
      (size - 1).downto(0).map { |bit| number[bit] }
    end
  end
end
@@ -0,0 +1,127 @@
1
+ aber
2
+ als
3
+ am
4
+ an
5
+ auch
6
+ auf
7
+ aus
8
+ bei
9
+ bin
10
+ bis
11
+ bist
12
+ da
13
+ dadurch
14
+ daher
15
+ darum
16
+ das
17
+ daß
18
+ dass
19
+ dein
20
+ deine
21
+ dem
22
+ den
23
+ der
24
+ des
25
+ dessen
26
+ deshalb
27
+ die
28
+ dies
29
+ dieser
30
+ dieses
31
+ doch
32
+ dort
33
+ du
34
+ durch
35
+ ein
36
+ eine
37
+ einem
38
+ einen
39
+ einer
40
+ eines
41
+ er
42
+ es
43
+ euer
44
+ eure
45
+ für
46
+ hatte
47
+ hatten
48
+ hattest
49
+ hattet
50
+ hier hinter
51
+ ich
52
+ ihr
53
+ ihre
54
+ im
55
+ in
56
+ ist
57
+ ja
58
+ jede
59
+ jedem
60
+ jeden
61
+ jeder
62
+ jedes
63
+ jener
64
+ jenes
65
+ jetzt
66
+ kann
67
+ kannst
68
+ können
69
+ könnt
70
+ machen
71
+ mein
72
+ meine
73
+ mit
74
+ muß
75
+ mußt
76
+ musst
77
+ müssen
78
+ müßt
79
+ nach
80
+ nachdem
81
+ nein
82
+ nicht
83
+ nun
84
+ oder
85
+ seid
86
+ sein
87
+ seine
88
+ sich
89
+ sie
90
+ sind
91
+ soll
92
+ sollen
93
+ sollst
94
+ sollt
95
+ sonst
96
+ soweit
97
+ sowie
98
+ und
99
+ unser unsere
100
+ unter
101
+ vom
102
+ von
103
+ vor
104
+ wann
105
+ warum
106
+ was
107
+ weiter
108
+ weitere
109
+ wenn
110
+ wer
111
+ werde
112
+ werden
113
+ werdet
114
+ weshalb
115
+ wie
116
+ wieder
117
+ wieso
118
+ wir
119
+ wird
120
+ wirst
121
+ wo
122
+ woher
123
+ wohin
124
+ zu
125
+ zum
126
+ zur
127
+ über
@@ -0,0 +1,119 @@
1
+ a
2
+ able
3
+ about
4
+ across
5
+ after
6
+ all
7
+ almost
8
+ also
9
+ am
10
+ among
11
+ an
12
+ and
13
+ any
14
+ are
15
+ as
16
+ at
17
+ be
18
+ because
19
+ been
20
+ but
21
+ by
22
+ can
23
+ cannot
24
+ could
25
+ dear
26
+ did
27
+ do
28
+ does
29
+ either
30
+ else
31
+ ever
32
+ every
33
+ for
34
+ from
35
+ get
36
+ got
37
+ had
38
+ has
39
+ have
40
+ he
41
+ her
42
+ hers
43
+ him
44
+ his
45
+ how
46
+ however
47
+ i
48
+ if
49
+ in
50
+ into
51
+ is
52
+ it
53
+ its
54
+ just
55
+ least
56
+ let
57
+ like
58
+ likely
59
+ may
60
+ me
61
+ might
62
+ most
63
+ must
64
+ my
65
+ neither
66
+ no
67
+ nor
68
+ not
69
+ of
70
+ off
71
+ often
72
+ on
73
+ only
74
+ or
75
+ other
76
+ our
77
+ own
78
+ rather
79
+ said
80
+ say
81
+ says
82
+ she
83
+ should
84
+ since
85
+ so
86
+ some
87
+ than
88
+ that
89
+ the
90
+ their
91
+ them
92
+ then
93
+ there
94
+ these
95
+ they
96
+ this
97
+ tis
98
+ to
99
+ too
100
+ twas
101
+ us
102
+ wants
103
+ was
104
+ we
105
+ were
106
+ what
107
+ when
108
+ where
109
+ which
110
+ while
111
+ who
112
+ whom
113
+ why
114
+ will
115
+ with
116
+ would
117
+ yet
118
+ you
119
+ your