svm_helper 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Guardfile +2 -2
- data/lib/svm_helper/feature_vector.rb +3 -6
- data/lib/svm_helper/interface_helper.rb +0 -14
- data/lib/svm_helper/preprocessed_data.rb +2 -9
- data/lib/svm_helper/preprocessors/id_mapping.rb +35 -0
- data/lib/svm_helper/preprocessors/simple.rb +20 -21
- data/lib/svm_helper/preprocessors.rb +1 -1
- data/lib/svm_helper/selectors/n_gram.rb +1 -1
- data/lib/svm_helper/selectors/simple.rb +16 -22
- data/lib/svm_helper/selectors/with_binary_encoding.rb +3 -4
- data/lib/svm_helper/version.rb +1 -1
- data/spec/factories.rb +7 -11
- data/spec/preprocessors/id_mapping_spec.rb +12 -0
- data/spec/preprocessors/simple_spec.rb +18 -37
- data/spec/selectors/n_gram_spec.rb +1 -1
- data/spec/selectors/simple_spec.rb +6 -5
- data/spec/selectors/with_binary_encoding_spec.rb +3 -3
- data/spec/support/preprocessor_spec.rb +1 -5
- data/spec/support/selector_spec.rb +1 -1
- metadata +18 -8
- checksums.yaml +0 -7
- data/lib/svm_helper/preprocessors/with_industry_map.rb +0 -40
- data/spec/preprocessors/with_industry_map_spec.rb +0 -16
data/Guardfile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
|
1
|
+
guard 'rspec', cli: "--color --format p", all_after_pass: false do
|
2
|
+
# guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
|
3
3
|
watch(%r{^spec/.+_spec\.rb$})
|
4
4
|
watch(%r{^lib/svm_helper/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
|
5
5
|
watch('spec/spec_helper.rb') { 'spec' }
|
@@ -5,13 +5,10 @@ require_relative 'interface_helper'
|
|
5
5
|
# @author Andreas Eger
|
6
6
|
class FeatureVector < InterfaceHelper
|
7
7
|
attribute :word_data
|
8
|
-
attribute :
|
9
|
-
attribute :
|
8
|
+
attribute :classification
|
9
|
+
attribute :label
|
10
10
|
|
11
|
-
def label
|
12
|
-
labels[classification]
|
13
|
-
end
|
14
11
|
def data
|
15
|
-
word_data +
|
12
|
+
word_data + classification
|
16
13
|
end
|
17
14
|
end
|
@@ -30,7 +30,6 @@ class InterfaceHelper
|
|
30
30
|
params.each do |key, value|
|
31
31
|
send("#{key}=", value)
|
32
32
|
end
|
33
|
-
@_attributes[:classification] ||= :function
|
34
33
|
end
|
35
34
|
|
36
35
|
#
|
@@ -41,17 +40,4 @@ class InterfaceHelper
|
|
41
40
|
def == anOther
|
42
41
|
@_attributes.keys.map{ |sym| self.send(sym) == anOther.send(sym)}.reduce(true){|a,e| a && e }
|
43
42
|
end
|
44
|
-
|
45
|
-
def industry!
|
46
|
-
@_attributes[:classification] = :industry
|
47
|
-
end
|
48
|
-
def function!
|
49
|
-
@_attributes[:classification] = :function
|
50
|
-
end
|
51
|
-
def career_level!
|
52
|
-
@_attributes[:classification] = :career_level
|
53
|
-
end
|
54
|
-
def classification
|
55
|
-
@_attributes[:classification]
|
56
|
-
end
|
57
43
|
end
|
@@ -5,13 +5,6 @@ require_relative 'interface_helper'
|
|
5
5
|
# @author Andreas Eger
|
6
6
|
class PreprocessedData < InterfaceHelper
|
7
7
|
attribute :data
|
8
|
-
attribute :
|
9
|
-
attribute :
|
10
|
-
|
11
|
-
def id
|
12
|
-
ids[classification]
|
13
|
-
end
|
14
|
-
def label
|
15
|
-
labels[classification]
|
16
|
-
end
|
8
|
+
attribute :id
|
9
|
+
attribute :label
|
17
10
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Preprocessor
|
3
|
+
#
|
4
|
+
# Preprocessor Base Class
|
5
|
+
#
|
6
|
+
# @author Andreas Eger
|
7
|
+
#
|
8
|
+
class IDMapping < Simple
|
9
|
+
attr_reader :id_map
|
10
|
+
|
11
|
+
#
|
12
|
+
# @param args [Hash] options hash
|
13
|
+
# @option args [Hash] :industry_map mapping for the tree like industry ids to continuous ones
|
14
|
+
def initialize id_map, args={}
|
15
|
+
super(args)
|
16
|
+
@id_map = id_map
|
17
|
+
end
|
18
|
+
|
19
|
+
def map_id(id)
|
20
|
+
@id_map[id]
|
21
|
+
end
|
22
|
+
def label
|
23
|
+
"with_id_mapping"
|
24
|
+
end
|
25
|
+
|
26
|
+
private
|
27
|
+
def process_job job
|
28
|
+
PreprocessedData.new(
|
29
|
+
data: [clean_title(job[:title]), clean_description(job[:description])],
|
30
|
+
id: map_id(job[:id]),
|
31
|
+
label: job[:label]
|
32
|
+
)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -35,21 +35,26 @@ module Preprocessor
|
|
35
35
|
#
|
36
36
|
# cleans provided jobs
|
37
37
|
# @overload process(jobs, classification)
|
38
|
-
# @param jobs [
|
38
|
+
# @param jobs [Hash] single Job
|
39
|
+
# @option title
|
40
|
+
# @option description
|
41
|
+
# @option id
|
42
|
+
# @option label
|
39
43
|
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
40
44
|
# @overload process(jobs, classification)
|
41
|
-
# @param jobs [Array<
|
45
|
+
# @param jobs [Array<Hash>] list of Jobs
|
42
46
|
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
43
47
|
#
|
44
48
|
# @return [Array<PreprocessedData>] list of processed job data - or singe job data
|
45
|
-
def process jobs
|
46
|
-
if jobs.
|
47
|
-
process_jobs jobs
|
49
|
+
def process jobs
|
50
|
+
if jobs.is_a? Array
|
51
|
+
process_jobs jobs
|
48
52
|
else
|
49
|
-
process_job jobs
|
53
|
+
process_job jobs
|
50
54
|
end
|
51
55
|
end
|
52
56
|
|
57
|
+
|
53
58
|
#
|
54
59
|
# converts string into a cleaner version
|
55
60
|
# @param title [String] job title
|
@@ -84,28 +89,22 @@ module Preprocessor
|
|
84
89
|
end
|
85
90
|
|
86
91
|
private
|
87
|
-
def process_jobs jobs
|
92
|
+
def process_jobs jobs
|
88
93
|
if @parallel && RUBY_PLATFORM == 'java'
|
89
|
-
Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job
|
94
|
+
Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
|
90
95
|
elsif @parallel
|
91
|
-
Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job
|
96
|
+
Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
|
92
97
|
else
|
93
|
-
jobs.map {|job| process_job job
|
98
|
+
jobs.map {|job| process_job job }
|
94
99
|
end
|
95
100
|
end
|
96
101
|
|
97
|
-
def process_job job
|
102
|
+
def process_job job
|
98
103
|
PreprocessedData.new(
|
99
|
-
data: [
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
career_level: job.classification_id(:career_level) },
|
104
|
-
labels: {
|
105
|
-
industry: job.label(:industry),
|
106
|
-
function: job.label(:function),
|
107
|
-
career_level: job.label(:career_level) }
|
108
|
-
).tap{|e| e.send("#{classification}!")}
|
104
|
+
data: [clean_title(job[:title]), clean_description(job[:description])],
|
105
|
+
id: job[:id],
|
106
|
+
label: job[:label]
|
107
|
+
)
|
109
108
|
end
|
110
109
|
end
|
111
110
|
end
|
@@ -1,2 +1,2 @@
|
|
1
1
|
require_relative 'preprocessors/simple'
|
2
|
-
require_relative 'preprocessors/
|
2
|
+
require_relative 'preprocessors/id_mapping'
|
@@ -26,7 +26,8 @@ module Selector
|
|
26
26
|
|
27
27
|
attr_accessor :global_dictionary
|
28
28
|
|
29
|
-
def initialize args={}
|
29
|
+
def initialize classification, args={}
|
30
|
+
@classification = classification
|
30
31
|
@global_dictionary = args.fetch(:global_dictionary) {[]}
|
31
32
|
@language = args.fetch(:language){'en'}
|
32
33
|
@parallel = args.fetch(:parallel){false}
|
@@ -43,13 +44,13 @@ module Selector
|
|
43
44
|
# @param dictionary_size [Integer] Size of a dictionary to create if non exists
|
44
45
|
#
|
45
46
|
# @return [Array<FeatureVector>] list of feature vectors and labels
|
46
|
-
def generate_vectors data_set,
|
47
|
+
def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
|
47
48
|
words_per_data = extract_words data_set
|
48
49
|
generate_global_dictionary words_per_data, dictionary_size
|
49
50
|
|
50
51
|
make_vectors(words_per_data) do |words,index|
|
51
52
|
word_set = words.uniq
|
52
|
-
make_vector word_set, data_set[index]
|
53
|
+
make_vector word_set, data_set[index]
|
53
54
|
end
|
54
55
|
end
|
55
56
|
|
@@ -60,9 +61,9 @@ module Selector
|
|
60
61
|
# @param dictionary [Array] dictionary to use for this selection
|
61
62
|
#
|
62
63
|
# @return [FeatureVector]
|
63
|
-
def generate_vector data,
|
64
|
+
def generate_vector data, dictionary=global_dictionary
|
64
65
|
word_set = Set.new extract_words_from_data(data)
|
65
|
-
make_vector word_set, data,
|
66
|
+
make_vector word_set, data, dictionary
|
66
67
|
end
|
67
68
|
|
68
69
|
#
|
@@ -109,8 +110,9 @@ module Selector
|
|
109
110
|
(data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
|
110
111
|
end
|
111
112
|
|
112
|
-
def reset
|
113
|
+
def reset classification
|
113
114
|
@global_dictionary = []
|
115
|
+
@classification = classification
|
114
116
|
end
|
115
117
|
|
116
118
|
private
|
@@ -120,24 +122,17 @@ module Selector
|
|
120
122
|
# also adds the label
|
121
123
|
# @param words [Array<String>] list of words
|
122
124
|
# @param data [PreprocessedData]
|
123
|
-
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
124
125
|
# @param dictionary
|
125
126
|
#
|
126
127
|
# @return [FeatureVector]
|
127
|
-
def make_vector words, data,
|
128
|
+
def make_vector words, data, dictionary=global_dictionary
|
128
129
|
FeatureVector.new(
|
129
130
|
word_data: dictionary.map{|dic_word|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
career_level: classification_array(data.ids, :career_level) },
|
136
|
-
labels: {
|
137
|
-
function: data.labels[:function] ? 1 : 0,
|
138
|
-
industry: data.labels[:industry] ? 1 : 0,
|
139
|
-
career_level: data.labels[:career_level] ? 1 : 0 }
|
140
|
-
).tap{|e| e.send("#{classification}!")}
|
131
|
+
words.include?(dic_word) ? 1 : 0
|
132
|
+
},
|
133
|
+
classification: classification_array(data.id),
|
134
|
+
label: data.label ? 1 : 0
|
135
|
+
)
|
141
136
|
end
|
142
137
|
|
143
138
|
def make_vectors data, &block
|
@@ -155,9 +150,8 @@ module Selector
|
|
155
150
|
# @param ids [Hash] hash with classification ids
|
156
151
|
#
|
157
152
|
# @return [Array<Integer>] list of size=count(classifcation_ids) with only one not zero item
|
158
|
-
def classification_array(
|
159
|
-
id
|
160
|
-
Array.new(CLASSIFICATIONS_SIZE[classification]){|n| n==(id-1) ? 1 : 0}
|
153
|
+
def classification_array(id)
|
154
|
+
Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0}
|
161
155
|
end
|
162
156
|
end
|
163
157
|
end
|
@@ -12,7 +12,7 @@ module Selector
|
|
12
12
|
industry: 16, # max id 65535, currently 14370
|
13
13
|
career_level: 4 } # max id 15, currently 8
|
14
14
|
|
15
|
-
def initialize args
|
15
|
+
def initialize *args
|
16
16
|
super
|
17
17
|
end
|
18
18
|
|
@@ -26,9 +26,8 @@ module Selector
|
|
26
26
|
# @param ids [Hash] hash with classification ids
|
27
27
|
#
|
28
28
|
# @return [Array<Integer>] binary encoded classification id
|
29
|
-
def classification_array(
|
30
|
-
id
|
31
|
-
number_to_binary_array(id, CLASSIFICATIONS_SIZE[classification])
|
29
|
+
def classification_array(id)
|
30
|
+
number_to_binary_array(id, CLASSIFICATIONS_SIZE[@classification])
|
32
31
|
end
|
33
32
|
|
34
33
|
def number_to_binary_array(number, size=8)
|
data/lib/svm_helper/version.rb
CHANGED
data/spec/factories.rb
CHANGED
@@ -7,24 +7,20 @@ FactoryGirl.define do
|
|
7
7
|
wrong_function_id 4
|
8
8
|
wrong_career_level nil
|
9
9
|
end
|
10
|
-
factory :job, class:
|
10
|
+
factory :job, class: Hash do
|
11
11
|
title "Meh"
|
12
12
|
description "Foo Bar"
|
13
|
-
|
14
|
-
|
15
|
-
end
|
13
|
+
id 4
|
14
|
+
label true
|
16
15
|
|
17
|
-
|
18
|
-
title "Meh"
|
19
|
-
description "Foo Bar"
|
20
|
-
summary "Really lot of work to do"
|
21
|
-
original_industry_id 1423
|
16
|
+
initialize_with { attributes }
|
22
17
|
end
|
23
18
|
|
19
|
+
|
24
20
|
factory :data, class: PreprocessedData do
|
25
21
|
data ["haus fooo garten baaz pferd fooo"]
|
26
|
-
|
27
|
-
|
22
|
+
id 7
|
23
|
+
label true
|
28
24
|
end
|
29
25
|
factory :data_w_short_words, parent: :data do
|
30
26
|
data ["auto foo pferd bz gooo fooo 2"]
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Preprocessor::IDMapping do
|
4
|
+
it_behaves_like 'a preprocessor'
|
5
|
+
let(:preprocessor) { Preprocessor::IDMapping.new(1423=>3, 523=>54) }
|
6
|
+
let(:job) { FactoryGirl.build(:job) }
|
7
|
+
let(:jobs) { [job] }
|
8
|
+
it "should make use of a industry_map" do
|
9
|
+
preprocessor.expects(:map_id)
|
10
|
+
preprocessor.process(jobs)
|
11
|
+
end
|
12
|
+
end
|
@@ -11,31 +11,18 @@ describe Preprocessor::Simple do
|
|
11
11
|
context do
|
12
12
|
before(:each) do
|
13
13
|
@jobs = FactoryGirl.build_list :job, 3
|
14
|
-
@jobs.each{|e| e.stubs(:classification_id)}
|
15
|
-
@jobs.each{|e| e.stubs(:label)}
|
16
14
|
end
|
17
15
|
it "should work with jobs with quality check" do
|
18
16
|
-> {simple.process(@jobs) }.should_not raise_error
|
19
17
|
end
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
18
|
+
it "should set labels to true if quality check exists and label was true" do
|
19
|
+
@jobs.map!{|e| e[:label] = true;e }
|
20
|
+
simple.process(@jobs).each{|e| e.label.should be_true}
|
21
|
+
end
|
22
|
+
it "should set labels to false if quality check exists and label false" do
|
23
|
+
@jobs.map!{|e| e[:label] = false;e }
|
24
|
+
simple.process(@jobs).each{|e| e.label.should be_false}
|
25
25
|
end
|
26
|
-
end
|
27
|
-
|
28
|
-
it "should work with jobs without quality check" do
|
29
|
-
jobs = FactoryGirl.build_list :job_without_job_check, 3
|
30
|
-
jobs.each{|e| e.stubs(:classification_id)}
|
31
|
-
jobs.each{|e| e.stubs(:label)}
|
32
|
-
-> {simple.process(jobs) }.should_not raise_error
|
33
|
-
end
|
34
|
-
it "should set labels to false if no quality check" do
|
35
|
-
jobs = FactoryGirl.build_list :job_without_job_check, 3
|
36
|
-
jobs.each{|e| e.stubs(:classification_id)}
|
37
|
-
jobs.each{|e| e.stubs(:label)}
|
38
|
-
simple.process(jobs).each{|e| e.career_level!; e.label.should be_false}
|
39
26
|
end
|
40
27
|
|
41
28
|
context "processing" do
|
@@ -43,8 +30,6 @@ describe Preprocessor::Simple do
|
|
43
30
|
before(:each) do
|
44
31
|
simple.stubs(:clean_title)
|
45
32
|
simple.stubs(:clean_description)
|
46
|
-
jobs.each{|e| e.stubs(:classification_id)}
|
47
|
-
jobs.each{|e| e.stubs(:label)}
|
48
33
|
end
|
49
34
|
it "should call clean_title on each job" do
|
50
35
|
simple.expects(:clean_title).times(3)
|
@@ -59,7 +44,7 @@ describe Preprocessor::Simple do
|
|
59
44
|
context "#clean_title" do
|
60
45
|
it "should be downcased" do
|
61
46
|
job = FactoryGirl.build(:job_title_downcasing)
|
62
|
-
simple.clean_title(job
|
47
|
+
simple.clean_title(job[:title]).should eq(job[:clean_title])
|
63
48
|
end
|
64
49
|
[ FactoryGirl.build(:job_title_w_gender),
|
65
50
|
FactoryGirl.build(:job_title_w_gender_brackets),
|
@@ -77,8 +62,8 @@ describe Preprocessor::Simple do
|
|
77
62
|
FactoryGirl.build(:job_title_var_0),
|
78
63
|
FactoryGirl.build(:job_title_w_special),
|
79
64
|
FactoryGirl.build(:job_title_w_percent)].each do |job|
|
80
|
-
it "should cleanup '#{job
|
81
|
-
simple.clean_title(job
|
65
|
+
it "should cleanup '#{job[:title]}'" do
|
66
|
+
simple.clean_title(job[:title]).should eq(job[:clean_title])
|
82
67
|
end
|
83
68
|
end
|
84
69
|
end
|
@@ -91,27 +76,27 @@ describe Preprocessor::Simple do
|
|
91
76
|
FactoryGirl.build(:job_description_w_gender) ]
|
92
77
|
}
|
93
78
|
it "should remove html/xml tags" do
|
94
|
-
desc = simple.clean_description(jobs[0]
|
79
|
+
desc = simple.clean_description(jobs[0][:description])
|
95
80
|
desc.should_not match(/<(.*?)>/)
|
96
81
|
end
|
97
82
|
it "should remove new lines" do
|
98
|
-
desc = simple.clean_description(jobs[0]
|
83
|
+
desc = simple.clean_description(jobs[0][:description])
|
99
84
|
desc.should_not match(/\r\n|\n|\r/)
|
100
85
|
end
|
101
86
|
it "should remove all special characters" do
|
102
|
-
desc = simple.clean_description(jobs[2]
|
87
|
+
desc = simple.clean_description(jobs[2][:description])
|
103
88
|
desc.should_not match(/[^a-z öäü]/i)
|
104
89
|
end
|
105
90
|
it "should remove gender tokens" do
|
106
|
-
desc = simple.clean_description(jobs[3]
|
91
|
+
desc = simple.clean_description(jobs[3][:description])
|
107
92
|
desc.should_not match(%r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)})
|
108
93
|
end
|
109
94
|
it "should remove job code token" do
|
110
|
-
desc = simple.clean_description(jobs[4]
|
95
|
+
desc = simple.clean_description(jobs[4][:description])
|
111
96
|
desc.should_not match(/\[.*\]|\(.*\)|\{.*\}|\d+\w+/)
|
112
97
|
end
|
113
98
|
it "should be downcased" do
|
114
|
-
desc = simple.clean_description(jobs[2]
|
99
|
+
desc = simple.clean_description(jobs[2][:description])
|
115
100
|
desc.should_not match(/[^a-z öäü]/)
|
116
101
|
end
|
117
102
|
end
|
@@ -125,13 +110,9 @@ describe Preprocessor::Simple do
|
|
125
110
|
FactoryGirl.build(:job_description_w_code_token),
|
126
111
|
FactoryGirl.build(:job_description_w_gender) ]
|
127
112
|
}
|
128
|
-
before(:each) do
|
129
|
-
jobs.each{|e| e.stubs(:classification_id)}
|
130
|
-
jobs.each{|e| e.stubs(:label)}
|
131
|
-
end
|
132
113
|
it "should be the same parallelized" do
|
133
|
-
single = simple.process(jobs
|
134
|
-
p_data = parallel.process(jobs
|
114
|
+
single = simple.process(jobs)
|
115
|
+
p_data = parallel.process(jobs)
|
135
116
|
single.each.with_index { |e,i| e.data.should == p_data[i].data }
|
136
117
|
end
|
137
118
|
end
|
@@ -3,7 +3,7 @@ require "spec_helper"
|
|
3
3
|
describe Selector::NGram do
|
4
4
|
it_behaves_like 'a selector'
|
5
5
|
|
6
|
-
let(:ngram) { Selector::NGram.new(gram_size: 3) }
|
6
|
+
let(:ngram) { Selector::NGram.new(:function, gram_size: 3) }
|
7
7
|
context "#extract_words_from_data" do
|
8
8
|
it "should generate a list of words from the data" do
|
9
9
|
words = ngram.extract_words_from_data(FactoryGirl.build(:data))
|
@@ -2,8 +2,8 @@ require "spec_helper"
|
|
2
2
|
|
3
3
|
describe Selector::Simple do
|
4
4
|
it_behaves_like 'a selector'
|
5
|
-
|
6
|
-
let(:simple) { Selector::Simple.new }
|
5
|
+
|
6
|
+
let(:simple) { Selector::Simple.new(:function) }
|
7
7
|
it "should have select_feature_vector implemented" do
|
8
8
|
expect { simple.generate_vectors([]) }.to_not raise_error
|
9
9
|
end
|
@@ -53,7 +53,8 @@ describe Selector::Simple do
|
|
53
53
|
context "#generate_vector" do
|
54
54
|
let(:dictionary) { %w(auto pferd haus hase garten) }
|
55
55
|
let(:data) { FactoryGirl.build(:data) }
|
56
|
-
let(:
|
56
|
+
let(:simple) { Selector::Simple.new(:career_level) }
|
57
|
+
let(:vector) { simple.generate_vector(data) }
|
57
58
|
|
58
59
|
before(:each) do
|
59
60
|
simple.stubs(:global_dictionary).returns(dictionary)
|
@@ -79,7 +80,7 @@ describe Selector::Simple do
|
|
79
80
|
end
|
80
81
|
context "custom dictionary" do
|
81
82
|
it "should accept a custom dictionary" do
|
82
|
-
vector = simple.generate_vector(data,
|
83
|
+
vector = simple.generate_vector(data, %w(pferd flasche glas))
|
83
84
|
vector.data.should eq([[1,0,0],[0,0,0,0,0,0,1,0]].flatten)
|
84
85
|
end
|
85
86
|
end
|
@@ -106,7 +107,7 @@ describe Selector::Simple do
|
|
106
107
|
simple.generate_vectors(data)
|
107
108
|
end
|
108
109
|
context "parallel" do
|
109
|
-
let(:parallel) { Selector::Simple.new(parallel: true) }
|
110
|
+
let(:parallel) { Selector::Simple.new(:function, parallel: true) }
|
110
111
|
before(:each) do
|
111
112
|
simple.stubs(:global_dictionary).returns(dictionary)
|
112
113
|
parallel.stubs(:global_dictionary).returns(dictionary)
|
@@ -2,11 +2,11 @@ require "spec_helper"
|
|
2
2
|
|
3
3
|
describe Selector::WithBinaryEncoding do
|
4
4
|
it_behaves_like 'a selector'
|
5
|
-
let(:simple) { Selector::WithBinaryEncoding.new }
|
5
|
+
let(:simple) { Selector::WithBinaryEncoding.new(:career_level) }
|
6
6
|
|
7
7
|
let(:dictionary) { %w(auto pferd haus hase garten) }
|
8
8
|
let(:data) { FactoryGirl.build(:data) }
|
9
|
-
let(:vector) { simple.generate_vector(data)
|
9
|
+
let(:vector) { simple.generate_vector(data) }
|
10
10
|
|
11
11
|
before(:each) do
|
12
12
|
simple.stubs(:global_dictionary).returns(dictionary)
|
@@ -32,7 +32,7 @@ describe Selector::WithBinaryEncoding do
|
|
32
32
|
end
|
33
33
|
context "custom dictionary" do
|
34
34
|
it "should accept a custom dictionary" do
|
35
|
-
vector = simple.generate_vector(data,
|
35
|
+
vector = simple.generate_vector(data, %w(pferd flasche glas))
|
36
36
|
vector.data.should eq([[1,0,0],[0,1,1,1]].flatten)
|
37
37
|
end
|
38
38
|
end
|
@@ -1,14 +1,10 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
shared_examples_for 'a preprocessor' do
|
4
|
-
let(:preprocessor) { described_class.new(
|
4
|
+
let(:preprocessor) { described_class.new(1423=>3, 523=>54) }
|
5
5
|
let(:job) { FactoryGirl.build(:job) }
|
6
6
|
let(:jobs) { [job] }
|
7
7
|
|
8
|
-
before(:each) do
|
9
|
-
job.stubs(:classification_id)
|
10
|
-
job.stubs(:label)
|
11
|
-
end
|
12
8
|
it { preprocessor.should respond_to :process }
|
13
9
|
it "should return a PreprocessedData object" do
|
14
10
|
preprocessor.process(job).should be_a(PreprocessedData)
|
metadata
CHANGED
@@ -1,18 +1,20 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: svm_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Andreas Eger
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-15 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: parallel
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
16
18
|
requirements:
|
17
19
|
- - ~>
|
18
20
|
- !ruby/object:Gem::Version
|
@@ -20,6 +22,7 @@ dependencies:
|
|
20
22
|
type: :runtime
|
21
23
|
prerelease: false
|
22
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
23
26
|
requirements:
|
24
27
|
- - ~>
|
25
28
|
- !ruby/object:Gem::Version
|
@@ -47,8 +50,8 @@ files:
|
|
47
50
|
- lib/svm_helper/interface_helper.rb
|
48
51
|
- lib/svm_helper/preprocessed_data.rb
|
49
52
|
- lib/svm_helper/preprocessors.rb
|
53
|
+
- lib/svm_helper/preprocessors/id_mapping.rb
|
50
54
|
- lib/svm_helper/preprocessors/simple.rb
|
51
|
-
- lib/svm_helper/preprocessors/with_industry_map.rb
|
52
55
|
- lib/svm_helper/selectors.rb
|
53
56
|
- lib/svm_helper/selectors/n_gram.rb
|
54
57
|
- lib/svm_helper/selectors/simple.rb
|
@@ -62,8 +65,8 @@ files:
|
|
62
65
|
- spec/factories/jobs/tmp3.html
|
63
66
|
- spec/factories/jobs_with_description.rb
|
64
67
|
- spec/factories/jobs_with_title.rb
|
68
|
+
- spec/preprocessors/id_mapping_spec.rb
|
65
69
|
- spec/preprocessors/simple_spec.rb
|
66
|
-
- spec/preprocessors/with_industry_map_spec.rb
|
67
70
|
- spec/selectors/n_gram_spec.rb
|
68
71
|
- spec/selectors/simple_spec.rb
|
69
72
|
- spec/selectors/with_binary_encoding_spec.rb
|
@@ -73,26 +76,33 @@ files:
|
|
73
76
|
- svm_helper.gemspec
|
74
77
|
homepage: https://github.com/sch1zo/svm_helper
|
75
78
|
licenses: []
|
76
|
-
metadata: {}
|
77
79
|
post_install_message:
|
78
80
|
rdoc_options: []
|
79
81
|
require_paths:
|
80
82
|
- lib
|
81
83
|
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
82
85
|
requirements:
|
83
86
|
- - '>='
|
84
87
|
- !ruby/object:Gem::Version
|
85
88
|
version: '0'
|
89
|
+
segments:
|
90
|
+
- 0
|
91
|
+
hash: 2037039748537332986
|
86
92
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
87
94
|
requirements:
|
88
95
|
- - '>='
|
89
96
|
- !ruby/object:Gem::Version
|
90
97
|
version: '0'
|
98
|
+
segments:
|
99
|
+
- 0
|
100
|
+
hash: 2037039748537332986
|
91
101
|
requirements: []
|
92
102
|
rubyforge_project:
|
93
|
-
rubygems_version:
|
103
|
+
rubygems_version: 1.8.25
|
94
104
|
signing_key:
|
95
|
-
specification_version:
|
105
|
+
specification_version: 3
|
96
106
|
summary: Preprocessor and Selector classes to generate FeatureVectors from Job data
|
97
107
|
test_files:
|
98
108
|
- spec/factories.rb
|
@@ -101,8 +111,8 @@ test_files:
|
|
101
111
|
- spec/factories/jobs/tmp3.html
|
102
112
|
- spec/factories/jobs_with_description.rb
|
103
113
|
- spec/factories/jobs_with_title.rb
|
114
|
+
- spec/preprocessors/id_mapping_spec.rb
|
104
115
|
- spec/preprocessors/simple_spec.rb
|
105
|
-
- spec/preprocessors/with_industry_map_spec.rb
|
106
116
|
- spec/selectors/n_gram_spec.rb
|
107
117
|
- spec/selectors/simple_spec.rb
|
108
118
|
- spec/selectors/with_binary_encoding_spec.rb
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: bbfc13983f715f2f0ab66d2a9dbfed543896329e
|
4
|
-
data.tar.gz: 2e6f5c32898cc01a2468ebaa2e82470e01012586
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: 8c616e62ff4717808e0b9f29e3d6773c03096934194a1fb706e3eb0ded7527c455ea00039e0a4d39fc08e13f735d4f49cd984d26128e624fbe0b956851893c21
|
7
|
-
data.tar.gz: f28756b0f3f9539f69bcda6ce3f79bdf3ef226ee813f0e5ed7b6f38595f1760e05bbd1abe7ec8ccc5c3a8db8b666871a0d33a755c19c4842a85a600e4749498d
|
@@ -1,40 +0,0 @@
|
|
1
|
-
require_relative 'simple'
|
2
|
-
module Preprocessor
|
3
|
-
#
|
4
|
-
# Preprocessor Base Class
|
5
|
-
#
|
6
|
-
# @author Andreas Eger
|
7
|
-
#
|
8
|
-
class WithIndustryMap < Simple
|
9
|
-
attr_reader :industry_map
|
10
|
-
|
11
|
-
#
|
12
|
-
# @param args [Hash] options hash
|
13
|
-
# @option args [Hash] :industry_map mapping for the tree like industry ids to continuous ones
|
14
|
-
def initialize args={}
|
15
|
-
@industry_map = args.fetch(:industry_map){ Hash[Pjpp::Industry.select(:id).all.map(&:id).sort.map.with_index{|e,i| [e,i]}] }
|
16
|
-
end
|
17
|
-
|
18
|
-
def map_industry_id(id)
|
19
|
-
@industry_map[id]
|
20
|
-
end
|
21
|
-
def label
|
22
|
-
"with_industry_map"
|
23
|
-
end
|
24
|
-
|
25
|
-
private
|
26
|
-
def process_job job, classification
|
27
|
-
PreprocessedData.new(
|
28
|
-
data: [ clean_title(job.title), clean_description(job.description) ],
|
29
|
-
ids: {
|
30
|
-
industry: map_industry_id(job.classification_id(:industry)),
|
31
|
-
function: job.classification_id(:function),
|
32
|
-
career_level: job.classification_id(:career_level) },
|
33
|
-
labels: {
|
34
|
-
industry: job.label(:industry),
|
35
|
-
function: job.label(:function),
|
36
|
-
career_level: job.label(:career_level) }
|
37
|
-
).tap{|e| e.send("#{classification}!")}
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Preprocessor::WithIndustryMap do
|
4
|
-
it_behaves_like 'a preprocessor'
|
5
|
-
let(:preprocessor) { Preprocessor::WithIndustryMap.new(industry_map: {1423=>3, 523=>54}) }
|
6
|
-
let(:job) { FactoryGirl.build(:job) }
|
7
|
-
let(:jobs) { [job] }
|
8
|
-
before(:each) do
|
9
|
-
job.stubs(:classification_id)
|
10
|
-
job.stubs(:label)
|
11
|
-
end
|
12
|
-
it "should make use of a industry_map" do
|
13
|
-
preprocessor.expects(:map_industry_id)
|
14
|
-
preprocessor.process(jobs)
|
15
|
-
end
|
16
|
-
end
|