svm_helper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.rspec +3 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +9 -0
  6. data/.versions.conf +4 -0
  7. data/.yardopts +3 -0
  8. data/Gemfile +24 -0
  9. data/Guardfile +17 -0
  10. data/LICENSE.txt +22 -0
  11. data/README.md +41 -0
  12. data/Rakefile +7 -0
  13. data/lib/svm_helper.rb +8 -0
  14. data/lib/svm_helper/feature_vector.rb +17 -0
  15. data/lib/svm_helper/interface_helper.rb +57 -0
  16. data/lib/svm_helper/preprocessed_data.rb +17 -0
  17. data/lib/svm_helper/preprocessors.rb +2 -0
  18. data/lib/svm_helper/preprocessors/simple.rb +111 -0
  19. data/lib/svm_helper/preprocessors/with_industry_map.rb +40 -0
  20. data/lib/svm_helper/selectors.rb +3 -0
  21. data/lib/svm_helper/selectors/n_gram.rb +31 -0
  22. data/lib/svm_helper/selectors/simple.rb +163 -0
  23. data/lib/svm_helper/selectors/with_binary_encoding.rb +42 -0
  24. data/lib/svm_helper/stopwords/de +127 -0
  25. data/lib/svm_helper/stopwords/en +119 -0
  26. data/lib/svm_helper/version.rb +3 -0
  27. data/spec/factories.rb +35 -0
  28. data/spec/factories/jobs/tmp.html +42 -0
  29. data/spec/factories/jobs/tmp2.html +20 -0
  30. data/spec/factories/jobs/tmp3.html +34 -0
  31. data/spec/factories/jobs_with_description.rb +20 -0
  32. data/spec/factories/jobs_with_title.rb +72 -0
  33. data/spec/preprocessors/simple_spec.rb +138 -0
  34. data/spec/preprocessors/with_industry_map_spec.rb +16 -0
  35. data/spec/selectors/n_gram_spec.rb +21 -0
  36. data/spec/selectors/simple_spec.rb +121 -0
  37. data/spec/selectors/with_binary_encoding_spec.rb +39 -0
  38. data/spec/spec_helper.rb +14 -0
  39. data/spec/support/preprocessor_spec.rb +21 -0
  40. data/spec/support/selector_spec.rb +21 -0
  41. data/svm_helper.gemspec +21 -0
  42. metadata +112 -0
data/lib/svm_helper/selectors.rb
@@ -0,0 +1,3 @@
+ require_relative 'selectors/simple'
+ require_relative 'selectors/n_gram'
+ require_relative 'selectors/with_binary_encoding'
data/lib/svm_helper/selectors/n_gram.rb
@@ -0,0 +1,31 @@
+ require_relative 'simple'
+ module Selector
+   #
+   # Selector which uses an n-gram dictionary to generate feature vectors
+   #
+   # @author Andreas Eger
+   #
+   class NGram < Selector::Simple
+     attr_reader :gram_size
+
+     def initialize args={}
+       super
+       @gram_size = args.fetch(:gram_size) { 2 }
+     end
+
+     def label
+       "ngram"
+     end
+     #
+     # fetches all word snippets from one data entry, removes stopwords and very short words
+     # @param data [PreprocessedData]
+     # @param gram_size [Integer] gram size
+     #
+     # @return [Array<String>]
+     def extract_words_from_data data, gram_size=@gram_size
+       (data.data.flat_map(&:split) - stopwords)
+         .delete_if { |e| e.size <= 3 }
+         .each_cons(gram_size).map{|e| e.join " " }
+     end
+   end
+ end
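
For orientation, a minimal usage sketch of the NGram selector follows. It is not part of the package contents: it assumes that require 'svm_helper' loads the selector classes, the OpenStruct merely stands in for the gem's PreprocessedData, and the sample strings and the shown result are invented for illustration.

    require 'svm_helper'
    require 'ostruct'

    selector = Selector::NGram.new(gram_size: 2, language: 'en')
    # stand-in for a PreprocessedData entry; only #data is needed here
    entry = OpenStruct.new(data: ['senior ruby developer wanted', 'backend team'])

    # stopwords and words of three characters or fewer are dropped,
    # then the remaining words are joined into overlapping 2-grams
    selector.extract_words_from_data(entry)
    # => ["senior ruby", "ruby developer", "developer wanted", "wanted backend", "backend team"]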
data/lib/svm_helper/selectors/simple.rb
@@ -0,0 +1,163 @@
+ module Selector
+   #
+   # Selector which uses a simple dictionary to generate feature vectors
+   #
+   # @author Andreas Eger
+   #
+   class Simple
+     THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
+     # stopword file
+     #TODO use File.expand_path
+     STOPWORD_LOCATION = File.join(File.dirname(__FILE__),'..','stopwords')
+     # default dictionary size
+     DEFAULT_DICTIONARY_SIZE = 800
+
+     CLASSIFICATIONS_SIZE = if defined?(Pjpp) == 'constant'
+       { function: Pjpp::Function.count,
+         industry: Pjpp::Industry.count,
+         career_level: Pjpp::CareerLevel.count }
+     else
+       { function: 19,     # 1..19
+         industry: 632,    # 1..14370 but not all ids used
+         career_level: 8 } # 1..8
+     end
+
+
+
+     attr_accessor :global_dictionary
+
+     def initialize args={}
+       @global_dictionary = args.fetch(:global_dictionary) {[]}
+       @language = args.fetch(:language){'en'}
+       @parallel = args.fetch(:parallel){false}
+     end
+
+     def label
+       "simple"
+     end
+
+     #
+     # generates a list of feature vectors and their labels from preprocessed data
+     # @param data_set [Array<PreprocessedData>] list of preprocessed data
+     # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
+     # @param dictionary_size [Integer] size of the dictionary to create if none exists
+     #
+     # @return [Array<FeatureVector>] list of feature vectors and labels
+     def generate_vectors data_set, classification=:function, dictionary_size=DEFAULT_DICTIONARY_SIZE
+       words_per_data = extract_words data_set
+       generate_global_dictionary words_per_data, dictionary_size
+
+       make_vectors(words_per_data) do |words,index|
+         word_set = words.uniq
+         make_vector word_set, data_set[index], classification
+       end
+     end
+
+     #
+     # generates a feature vector with its label
+     # @param data [PreprocessedData]
+     # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
+     # @param dictionary [Array] dictionary to use for this selection
+     #
+     # @return [FeatureVector]
+     def generate_vector data, classification=:function, dictionary=global_dictionary
+       word_set = Set.new extract_words_from_data(data)
+       make_vector word_set, data, classification, dictionary
+     end
+
+     #
+     # loads a txt file with stop words
+     # @param location [String] folder with stopword lists
+     #
+     # @return [Array<String>] array of stopwords
+     def stopwords(location=STOPWORD_LOCATION)
+       @stopwords ||= IO.read(File.join(location,@language)).split
+     end
+
+     #
+     # generates a list of words used as dictionary
+     # @param all_words (see #extract_words)
+     # @param size dictionary size
+     #
+     # @return [Array<String>] list of words
+     def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
+       return unless global_dictionary.empty?
+
+       words = all_words.flatten.group_by{|e| e}.values
+                        .sort_by{|e| e.size}
+                        .map{|e| [e[0],e.size]}
+       @global_dictionary = words.last(size).map(&:first).reverse
+     end
+
+     #
+     # extracts the words of all provided data entries
+     # @param data_set [Array<PreprocessedData>] list of preprocessed data
+     #
+     # @return [Array<Array<String>>] list of words per data entry
+     def extract_words data_set
+       data_set.map do |data|
+         extract_words_from_data data
+       end
+     end
+
+     #
+     # fetches all words from one data entry, removes stopwords and very short words
+     # @param data [PreprocessedData] preprocessed data entry
+     #
+     # @return [Array<String>] list of words
+     def extract_words_from_data data
+       (data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
+     end
+
+     def reset
+       @global_dictionary = []
+     end
+
+     private
+
+     #
+     # creates a feature vector for the given words, classification and dictionary;
+     # also adds the label
+     # @param words [Array<String>] list of words
+     # @param data [PreprocessedData]
+     # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
+     # @param dictionary
+     #
+     # @return [FeatureVector]
+     def make_vector words, data, classification, dictionary=global_dictionary
+       FeatureVector.new(
+         word_data: dictionary.map{|dic_word|
+           words.include?(dic_word) ? 1 : 0
+         },
+         classification_arrays: {
+           function: classification_array(data.ids, :function),
+           industry: classification_array(data.ids, :industry),
+           career_level: classification_array(data.ids, :career_level) },
+         labels: {
+           function: data.labels[:function] ? 1 : 0,
+           industry: data.labels[:industry] ? 1 : 0,
+           career_level: data.labels[:career_level] ? 1 : 0 }
+       ).tap{|e| e.send("#{classification}!")}
+     end
+
+     def make_vectors data, &block
+       if @parallel && RUBY_PLATFORM == 'java'
+         Parallel.map_with_index(data, in_threads: THREAD_COUNT ){|e,i| yield e,i }
+       elsif @parallel
+         Parallel.map_with_index(data, in_processes: THREAD_COUNT ){|e,i| yield e,i }
+       else
+         data.map.with_index {|e,i| yield e,i }
+       end
+     end
+
+     #
+     # creates the classification specific part of the feature vector
+     # @param ids [Hash] hash with classification ids
+     #
+     # @return [Array<Integer>] list of size CLASSIFICATIONS_SIZE[classification] with exactly one non-zero item
+     def classification_array(ids, classification)
+       id = ids[classification]
+       Array.new(CLASSIFICATIONS_SIZE[classification]){|n| n==(id-1) ? 1 : 0}
+     end
+   end
+ end
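
A rough end-to-end sketch of Selector::Simple, again not part of the package: the OpenStruct entries mimic PreprocessedData (data, ids, labels), all id and label values are invented, and it assumes require 'svm_helper' makes the selector and FeatureVector classes available.

    require 'svm_helper'
    require 'ostruct'

    entries = [
      OpenStruct.new(data: ['ruby developer backend'],
                     ids: { function: 3, industry: 12, career_level: 4 },
                     labels: { function: true,  industry: false, career_level: false }),
      OpenStruct.new(data: ['senior ruby engineer'],
                     ids: { function: 5, industry: 7, career_level: 6 },
                     labels: { function: false, industry: false, career_level: true })
    ]

    selector = Selector::Simple.new(language: 'en')
    vectors  = selector.generate_vectors(entries, :function, 100)

    # the dictionary is built once from the most frequent non-stopwords;
    # here "ruby" ranks first because it occurs in both entries
    selector.global_dictionary
    # each returned FeatureVector carries 0/1 word occurrences for the dictionary,
    # one-hot classification arrays (19/632/8 slots) and the 0/1 labels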
data/lib/svm_helper/selectors/with_binary_encoding.rb
@@ -0,0 +1,42 @@
+ require_relative 'simple'
+ module Selector
+   #
+   # Selector which binary-encodes the classification ids in the generated feature vectors
+   #
+   # @author Andreas Eger
+   #
+   class WithBinaryEncoding < Selector::Simple
+
+     CLASSIFICATIONS_SIZE = {
+       function: 8,      # max id 255, currently 19
+       industry: 16,     # max id 65535, currently 14370
+       career_level: 4 } # max id 15, currently 8
+
+     def initialize args={}
+       super
+     end
+
+     def label
+       "simple-WithBinaryEncoding"
+     end
+
+     private
+     #
+     # creates the classification specific part of the feature vector
+     # @param ids [Hash] hash with classification ids
+     #
+     # @return [Array<Integer>] binary encoded classification id
+     def classification_array(ids, classification)
+       id = ids[classification]
+       number_to_binary_array(id, CLASSIFICATIONS_SIZE[classification])
+     end
+
+     def number_to_binary_array(number, size=8)
+       a=[]
+       (size-1).downto(0) do |i|
+         a<<number[i]
+       end
+       a
+     end
+   end
+ end
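
The effect of the binary encoding can be illustrated directly (a sketch, not from the package: number_to_binary_array is private, hence the send, and the id 19 is just an example value):

    require 'svm_helper'

    selector = Selector::WithBinaryEncoding.new(language: 'en')
    # Integer#[i] returns bit i, and the loop walks from the most significant
    # bit down, so a function id of 19 in an 8-bit field becomes:
    selector.send(:number_to_binary_array, 19, 8)
    # => [0, 0, 0, 1, 0, 0, 1, 1]
    # compared to Selector::Simple this shrinks the classification part of the
    # vector from 19/632/8 one-hot slots to 8/16/4 bits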
data/lib/svm_helper/stopwords/de
@@ -0,0 +1,127 @@
+ aber
+ als
+ am
+ an
+ auch
+ auf
+ aus
+ bei
+ bin
+ bis
+ bist
+ da
+ dadurch
+ daher
+ darum
+ das
+ daß
+ dass
+ dein
+ deine
+ dem
+ den
+ der
+ des
+ dessen
+ deshalb
+ die
+ dies
+ dieser
+ dieses
+ doch
+ dort
+ du
+ durch
+ ein
+ eine
+ einem
+ einen
+ einer
+ eines
+ er
+ es
+ euer
+ eure
+ für
+ hatte
+ hatten
+ hattest
+ hattet
+ hier hinter
+ ich
+ ihr
+ ihre
+ im
+ in
+ ist
+ ja
+ jede
+ jedem
+ jeden
+ jeder
+ jedes
+ jener
+ jenes
+ jetzt
+ kann
+ kannst
+ können
+ könnt
+ machen
+ mein
+ meine
+ mit
+ muß
+ mußt
+ musst
+ müssen
+ müßt
+ nach
+ nachdem
+ nein
+ nicht
+ nun
+ oder
+ seid
+ sein
+ seine
+ sich
+ sie
+ sind
+ soll
+ sollen
+ sollst
+ sollt
+ sonst
+ soweit
+ sowie
+ und
+ unser unsere
+ unter
+ vom
+ von
+ vor
+ wann
+ warum
+ was
+ weiter
+ weitere
+ wenn
+ wer
+ werde
+ werden
+ werdet
+ weshalb
+ wie
+ wieder
+ wieso
+ wir
+ wird
+ wirst
+ wo
+ woher
+ wohin
+ zu
+ zum
+ zur
+ über
data/lib/svm_helper/stopwords/en
@@ -0,0 +1,119 @@
+ a
+ able
+ about
+ across
+ after
+ all
+ almost
+ also
+ am
+ among
+ an
+ and
+ any
+ are
+ as
+ at
+ be
+ because
+ been
+ but
+ by
+ can
+ cannot
+ could
+ dear
+ did
+ do
+ does
+ either
+ else
+ ever
+ every
+ for
+ from
+ get
+ got
+ had
+ has
+ have
+ he
+ her
+ hers
+ him
+ his
+ how
+ however
+ i
+ if
+ in
+ into
+ is
+ it
+ its
+ just
+ least
+ let
+ like
+ likely
+ may
+ me
+ might
+ most
+ must
+ my
+ neither
+ no
+ nor
+ not
+ of
+ off
+ often
+ on
+ only
+ or
+ other
+ our
+ own
+ rather
+ said
+ say
+ says
+ she
+ should
+ since
+ so
+ some
+ than
+ that
+ the
+ their
+ them
+ then
+ there
+ these
+ they
+ this
+ tis
+ to
+ too
+ twas
+ us
+ wants
+ was
+ we
+ were
+ what
+ when
+ where
+ which
+ while
+ who
+ whom
+ why
+ will
+ with
+ would
+ yet
+ you
+ your