svm_helper 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Guardfile +2 -2
- data/lib/svm_helper/feature_vector.rb +3 -6
- data/lib/svm_helper/interface_helper.rb +0 -14
- data/lib/svm_helper/preprocessed_data.rb +2 -9
- data/lib/svm_helper/preprocessors/id_mapping.rb +35 -0
- data/lib/svm_helper/preprocessors/simple.rb +20 -21
- data/lib/svm_helper/preprocessors.rb +1 -1
- data/lib/svm_helper/selectors/n_gram.rb +1 -1
- data/lib/svm_helper/selectors/simple.rb +16 -22
- data/lib/svm_helper/selectors/with_binary_encoding.rb +3 -4
- data/lib/svm_helper/version.rb +1 -1
- data/spec/factories.rb +7 -11
- data/spec/preprocessors/id_mapping_spec.rb +12 -0
- data/spec/preprocessors/simple_spec.rb +18 -37
- data/spec/selectors/n_gram_spec.rb +1 -1
- data/spec/selectors/simple_spec.rb +6 -5
- data/spec/selectors/with_binary_encoding_spec.rb +3 -3
- data/spec/support/preprocessor_spec.rb +1 -5
- data/spec/support/selector_spec.rb +1 -1
- metadata +18 -8
- checksums.yaml +0 -7
- data/lib/svm_helper/preprocessors/with_industry_map.rb +0 -40
- data/spec/preprocessors/with_industry_map_spec.rb +0 -16
data/Guardfile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
|
1
|
+
guard 'rspec', cli: "--color --format p", all_after_pass: false do
|
2
|
+
# guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
|
3
3
|
watch(%r{^spec/.+_spec\.rb$})
|
4
4
|
watch(%r{^lib/svm_helper/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
|
5
5
|
watch('spec/spec_helper.rb') { 'spec' }
|
@@ -5,13 +5,10 @@ require_relative 'interface_helper'
|
|
5
5
|
# @author Andreas Eger
|
6
6
|
class FeatureVector < InterfaceHelper
|
7
7
|
attribute :word_data
|
8
|
-
attribute :
|
9
|
-
attribute :
|
8
|
+
attribute :classification
|
9
|
+
attribute :label
|
10
10
|
|
11
|
-
def label
|
12
|
-
labels[classification]
|
13
|
-
end
|
14
11
|
def data
|
15
|
-
word_data +
|
12
|
+
word_data + classification
|
16
13
|
end
|
17
14
|
end
|
@@ -30,7 +30,6 @@ class InterfaceHelper
|
|
30
30
|
params.each do |key, value|
|
31
31
|
send("#{key}=", value)
|
32
32
|
end
|
33
|
-
@_attributes[:classification] ||= :function
|
34
33
|
end
|
35
34
|
|
36
35
|
#
|
@@ -41,17 +40,4 @@ class InterfaceHelper
|
|
41
40
|
def == anOther
|
42
41
|
@_attributes.keys.map{ |sym| self.send(sym) == anOther.send(sym)}.reduce(true){|a,e| a && e }
|
43
42
|
end
|
44
|
-
|
45
|
-
def industry!
|
46
|
-
@_attributes[:classification] = :industry
|
47
|
-
end
|
48
|
-
def function!
|
49
|
-
@_attributes[:classification] = :function
|
50
|
-
end
|
51
|
-
def career_level!
|
52
|
-
@_attributes[:classification] = :career_level
|
53
|
-
end
|
54
|
-
def classification
|
55
|
-
@_attributes[:classification]
|
56
|
-
end
|
57
43
|
end
|
@@ -5,13 +5,6 @@ require_relative 'interface_helper'
|
|
5
5
|
# @author Andreas Eger
|
6
6
|
class PreprocessedData < InterfaceHelper
|
7
7
|
attribute :data
|
8
|
-
attribute :
|
9
|
-
attribute :
|
10
|
-
|
11
|
-
def id
|
12
|
-
ids[classification]
|
13
|
-
end
|
14
|
-
def label
|
15
|
-
labels[classification]
|
16
|
-
end
|
8
|
+
attribute :id
|
9
|
+
attribute :label
|
17
10
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Preprocessor
|
3
|
+
#
|
4
|
+
# Preprocessor Base Class
|
5
|
+
#
|
6
|
+
# @author Andreas Eger
|
7
|
+
#
|
8
|
+
class IDMapping < Simple
|
9
|
+
attr_reader :id_map
|
10
|
+
|
11
|
+
#
|
12
|
+
# @param args [Hash] options hash
|
13
|
+
# @option args [Hash] :industry_map mapping for the tree like industry ids to continuous ones
|
14
|
+
def initialize id_map, args={}
|
15
|
+
super(args)
|
16
|
+
@id_map = id_map
|
17
|
+
end
|
18
|
+
|
19
|
+
def map_id(id)
|
20
|
+
@id_map[id]
|
21
|
+
end
|
22
|
+
def label
|
23
|
+
"with_id_mapping"
|
24
|
+
end
|
25
|
+
|
26
|
+
private
|
27
|
+
def process_job job
|
28
|
+
PreprocessedData.new(
|
29
|
+
data: [clean_title(job[:title]), clean_description(job[:description])],
|
30
|
+
id: map_id(job[:id]),
|
31
|
+
label: job[:label]
|
32
|
+
)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -35,21 +35,26 @@ module Preprocessor
|
|
35
35
|
#
|
36
36
|
# cleans provided jobs
|
37
37
|
# @overload process(jobs, classification)
|
38
|
-
# @param jobs [
|
38
|
+
# @param jobs [Hash] single Job
|
39
|
+
# @option title
|
40
|
+
# @option description
|
41
|
+
# @option id
|
42
|
+
# @option label
|
39
43
|
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
40
44
|
# @overload process(jobs, classification)
|
41
|
-
# @param jobs [Array<
|
45
|
+
# @param jobs [Array<Hash>] list of Jobs
|
42
46
|
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
43
47
|
#
|
44
48
|
# @return [Array<PreprocessedData>] list of processed job data - or single job data
|
45
|
-
def process jobs
|
46
|
-
if jobs.
|
47
|
-
process_jobs jobs
|
49
|
+
def process jobs
|
50
|
+
if jobs.is_a? Array
|
51
|
+
process_jobs jobs
|
48
52
|
else
|
49
|
-
process_job jobs
|
53
|
+
process_job jobs
|
50
54
|
end
|
51
55
|
end
|
52
56
|
|
57
|
+
|
53
58
|
#
|
54
59
|
# converts string into a cleaner version
|
55
60
|
# @param title [String] job title
|
@@ -84,28 +89,22 @@ module Preprocessor
|
|
84
89
|
end
|
85
90
|
|
86
91
|
private
|
87
|
-
def process_jobs jobs
|
92
|
+
def process_jobs jobs
|
88
93
|
if @parallel && RUBY_PLATFORM == 'java'
|
89
|
-
Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job
|
94
|
+
Parallel.map(jobs, in_threads: THREAD_COUNT ) {|job| process_job job }
|
90
95
|
elsif @parallel
|
91
|
-
Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job
|
96
|
+
Parallel.map(jobs, in_processes: THREAD_COUNT ) {|job| process_job job }
|
92
97
|
else
|
93
|
-
jobs.map {|job| process_job job
|
98
|
+
jobs.map {|job| process_job job }
|
94
99
|
end
|
95
100
|
end
|
96
101
|
|
97
|
-
def process_job job
|
102
|
+
def process_job job
|
98
103
|
PreprocessedData.new(
|
99
|
-
data: [
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
career_level: job.classification_id(:career_level) },
|
104
|
-
labels: {
|
105
|
-
industry: job.label(:industry),
|
106
|
-
function: job.label(:function),
|
107
|
-
career_level: job.label(:career_level) }
|
108
|
-
).tap{|e| e.send("#{classification}!")}
|
104
|
+
data: [clean_title(job[:title]), clean_description(job[:description])],
|
105
|
+
id: job[:id],
|
106
|
+
label: job[:label]
|
107
|
+
)
|
109
108
|
end
|
110
109
|
end
|
111
110
|
end
|
@@ -1,2 +1,2 @@
|
|
1
1
|
require_relative 'preprocessors/simple'
|
2
|
-
require_relative 'preprocessors/
|
2
|
+
require_relative 'preprocessors/id_mapping'
|
@@ -26,7 +26,8 @@ module Selector
|
|
26
26
|
|
27
27
|
attr_accessor :global_dictionary
|
28
28
|
|
29
|
-
def initialize args={}
|
29
|
+
def initialize classification, args={}
|
30
|
+
@classification = classification
|
30
31
|
@global_dictionary = args.fetch(:global_dictionary) {[]}
|
31
32
|
@language = args.fetch(:language){'en'}
|
32
33
|
@parallel = args.fetch(:parallel){false}
|
@@ -43,13 +44,13 @@ module Selector
|
|
43
44
|
# @param dictionary_size [Integer] Size of a dictionary to create if none exists
|
44
45
|
#
|
45
46
|
# @return [Array<FeatureVector>] list of feature vectors and labels
|
46
|
-
def generate_vectors data_set,
|
47
|
+
def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
|
47
48
|
words_per_data = extract_words data_set
|
48
49
|
generate_global_dictionary words_per_data, dictionary_size
|
49
50
|
|
50
51
|
make_vectors(words_per_data) do |words,index|
|
51
52
|
word_set = words.uniq
|
52
|
-
make_vector word_set, data_set[index]
|
53
|
+
make_vector word_set, data_set[index]
|
53
54
|
end
|
54
55
|
end
|
55
56
|
|
@@ -60,9 +61,9 @@ module Selector
|
|
60
61
|
# @param dictionary [Array] dictionary to use for this selection
|
61
62
|
#
|
62
63
|
# @return [FeatureVector]
|
63
|
-
def generate_vector data,
|
64
|
+
def generate_vector data, dictionary=global_dictionary
|
64
65
|
word_set = Set.new extract_words_from_data(data)
|
65
|
-
make_vector word_set, data,
|
66
|
+
make_vector word_set, data, dictionary
|
66
67
|
end
|
67
68
|
|
68
69
|
#
|
@@ -109,8 +110,9 @@ module Selector
|
|
109
110
|
(data.data.flat_map(&:split) - stopwords).delete_if { |e| e.size <= 3 }
|
110
111
|
end
|
111
112
|
|
112
|
-
def reset
|
113
|
+
def reset classification
|
113
114
|
@global_dictionary = []
|
115
|
+
@classification = classification
|
114
116
|
end
|
115
117
|
|
116
118
|
private
|
@@ -120,24 +122,17 @@ module Selector
|
|
120
122
|
# also adds the label
|
121
123
|
# @param words [Array<String>] list of words
|
122
124
|
# @param data [PreprocessedData]
|
123
|
-
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
124
125
|
# @param dictionary
|
125
126
|
#
|
126
127
|
# @return [FeatureVector]
|
127
|
-
def make_vector words, data,
|
128
|
+
def make_vector words, data, dictionary=global_dictionary
|
128
129
|
FeatureVector.new(
|
129
130
|
word_data: dictionary.map{|dic_word|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
career_level: classification_array(data.ids, :career_level) },
|
136
|
-
labels: {
|
137
|
-
function: data.labels[:function] ? 1 : 0,
|
138
|
-
industry: data.labels[:industry] ? 1 : 0,
|
139
|
-
career_level: data.labels[:career_level] ? 1 : 0 }
|
140
|
-
).tap{|e| e.send("#{classification}!")}
|
131
|
+
words.include?(dic_word) ? 1 : 0
|
132
|
+
},
|
133
|
+
classification: classification_array(data.id),
|
134
|
+
label: data.label ? 1 : 0
|
135
|
+
)
|
141
136
|
end
|
142
137
|
|
143
138
|
def make_vectors data, &block
|
@@ -155,9 +150,8 @@ module Selector
|
|
155
150
|
# @param ids [Hash] hash with classification ids
|
156
151
|
#
|
157
152
|
# @return [Array<Integer>] list of size=count(classification_ids) with only one non-zero item
|
158
|
-
def classification_array(
|
159
|
-
id
|
160
|
-
Array.new(CLASSIFICATIONS_SIZE[classification]){|n| n==(id-1) ? 1 : 0}
|
153
|
+
def classification_array(id)
|
154
|
+
Array.new(CLASSIFICATIONS_SIZE[@classification]){|n| n==(id-1) ? 1 : 0}
|
161
155
|
end
|
162
156
|
end
|
163
157
|
end
|
@@ -12,7 +12,7 @@ module Selector
|
|
12
12
|
industry: 16, # max id 65535, currently 14370
|
13
13
|
career_level: 4 } # max id 15, currently 8
|
14
14
|
|
15
|
-
def initialize args
|
15
|
+
def initialize *args
|
16
16
|
super
|
17
17
|
end
|
18
18
|
|
@@ -26,9 +26,8 @@ module Selector
|
|
26
26
|
# @param ids [Hash] hash with classification ids
|
27
27
|
#
|
28
28
|
# @return [Array<Integer>] binary encoded classification id
|
29
|
-
def classification_array(
|
30
|
-
id
|
31
|
-
number_to_binary_array(id, CLASSIFICATIONS_SIZE[classification])
|
29
|
+
def classification_array(id)
|
30
|
+
number_to_binary_array(id, CLASSIFICATIONS_SIZE[@classification])
|
32
31
|
end
|
33
32
|
|
34
33
|
def number_to_binary_array(number, size=8)
|
data/lib/svm_helper/version.rb
CHANGED
data/spec/factories.rb
CHANGED
@@ -7,24 +7,20 @@ FactoryGirl.define do
|
|
7
7
|
wrong_function_id 4
|
8
8
|
wrong_career_level nil
|
9
9
|
end
|
10
|
-
factory :job, class:
|
10
|
+
factory :job, class: Hash do
|
11
11
|
title "Meh"
|
12
12
|
description "Foo Bar"
|
13
|
-
|
14
|
-
|
15
|
-
end
|
13
|
+
id 4
|
14
|
+
label true
|
16
15
|
|
17
|
-
|
18
|
-
title "Meh"
|
19
|
-
description "Foo Bar"
|
20
|
-
summary "Really lot of work to do"
|
21
|
-
original_industry_id 1423
|
16
|
+
initialize_with { attributes }
|
22
17
|
end
|
23
18
|
|
19
|
+
|
24
20
|
factory :data, class: PreprocessedData do
|
25
21
|
data ["haus fooo garten baaz pferd fooo"]
|
26
|
-
|
27
|
-
|
22
|
+
id 7
|
23
|
+
label true
|
28
24
|
end
|
29
25
|
factory :data_w_short_words, parent: :data do
|
30
26
|
data ["auto foo pferd bz gooo fooo 2"]
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Preprocessor::IDMapping do
|
4
|
+
it_behaves_like 'a preprocessor'
|
5
|
+
let(:preprocessor) { Preprocessor::IDMapping.new(1423=>3, 523=>54) }
|
6
|
+
let(:job) { FactoryGirl.build(:job) }
|
7
|
+
let(:jobs) { [job] }
|
8
|
+
it "should make use of a industry_map" do
|
9
|
+
preprocessor.expects(:map_id)
|
10
|
+
preprocessor.process(jobs)
|
11
|
+
end
|
12
|
+
end
|
@@ -11,31 +11,18 @@ describe Preprocessor::Simple do
|
|
11
11
|
context do
|
12
12
|
before(:each) do
|
13
13
|
@jobs = FactoryGirl.build_list :job, 3
|
14
|
-
@jobs.each{|e| e.stubs(:classification_id)}
|
15
|
-
@jobs.each{|e| e.stubs(:label)}
|
16
14
|
end
|
17
15
|
it "should work with jobs with quality check" do
|
18
16
|
-> {simple.process(@jobs) }.should_not raise_error
|
19
17
|
end
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
18
|
+
it "should set labels to true if quality check exists and label was true" do
|
19
|
+
@jobs.map!{|e| e[:label] = true;e }
|
20
|
+
simple.process(@jobs).each{|e| e.label.should be_true}
|
21
|
+
end
|
22
|
+
it "should set labels to false if quality check exists and label false" do
|
23
|
+
@jobs.map!{|e| e[:label] = false;e }
|
24
|
+
simple.process(@jobs).each{|e| e.label.should be_false}
|
25
25
|
end
|
26
|
-
end
|
27
|
-
|
28
|
-
it "should work with jobs without quality check" do
|
29
|
-
jobs = FactoryGirl.build_list :job_without_job_check, 3
|
30
|
-
jobs.each{|e| e.stubs(:classification_id)}
|
31
|
-
jobs.each{|e| e.stubs(:label)}
|
32
|
-
-> {simple.process(jobs) }.should_not raise_error
|
33
|
-
end
|
34
|
-
it "should set labels to false if no quality check" do
|
35
|
-
jobs = FactoryGirl.build_list :job_without_job_check, 3
|
36
|
-
jobs.each{|e| e.stubs(:classification_id)}
|
37
|
-
jobs.each{|e| e.stubs(:label)}
|
38
|
-
simple.process(jobs).each{|e| e.career_level!; e.label.should be_false}
|
39
26
|
end
|
40
27
|
|
41
28
|
context "processing" do
|
@@ -43,8 +30,6 @@ describe Preprocessor::Simple do
|
|
43
30
|
before(:each) do
|
44
31
|
simple.stubs(:clean_title)
|
45
32
|
simple.stubs(:clean_description)
|
46
|
-
jobs.each{|e| e.stubs(:classification_id)}
|
47
|
-
jobs.each{|e| e.stubs(:label)}
|
48
33
|
end
|
49
34
|
it "should call clean_title on each job" do
|
50
35
|
simple.expects(:clean_title).times(3)
|
@@ -59,7 +44,7 @@ describe Preprocessor::Simple do
|
|
59
44
|
context "#clean_title" do
|
60
45
|
it "should be downcased" do
|
61
46
|
job = FactoryGirl.build(:job_title_downcasing)
|
62
|
-
simple.clean_title(job
|
47
|
+
simple.clean_title(job[:title]).should eq(job[:clean_title])
|
63
48
|
end
|
64
49
|
[ FactoryGirl.build(:job_title_w_gender),
|
65
50
|
FactoryGirl.build(:job_title_w_gender_brackets),
|
@@ -77,8 +62,8 @@ describe Preprocessor::Simple do
|
|
77
62
|
FactoryGirl.build(:job_title_var_0),
|
78
63
|
FactoryGirl.build(:job_title_w_special),
|
79
64
|
FactoryGirl.build(:job_title_w_percent)].each do |job|
|
80
|
-
it "should cleanup '#{job
|
81
|
-
simple.clean_title(job
|
65
|
+
it "should cleanup '#{job[:title]}'" do
|
66
|
+
simple.clean_title(job[:title]).should eq(job[:clean_title])
|
82
67
|
end
|
83
68
|
end
|
84
69
|
end
|
@@ -91,27 +76,27 @@ describe Preprocessor::Simple do
|
|
91
76
|
FactoryGirl.build(:job_description_w_gender) ]
|
92
77
|
}
|
93
78
|
it "should remove html/xml tags" do
|
94
|
-
desc = simple.clean_description(jobs[0]
|
79
|
+
desc = simple.clean_description(jobs[0][:description])
|
95
80
|
desc.should_not match(/<(.*?)>/)
|
96
81
|
end
|
97
82
|
it "should remove new lines" do
|
98
|
-
desc = simple.clean_description(jobs[0]
|
83
|
+
desc = simple.clean_description(jobs[0][:description])
|
99
84
|
desc.should_not match(/\r\n|\n|\r/)
|
100
85
|
end
|
101
86
|
it "should remove all special characters" do
|
102
|
-
desc = simple.clean_description(jobs[2]
|
87
|
+
desc = simple.clean_description(jobs[2][:description])
|
103
88
|
desc.should_not match(/[^a-z öäü]/i)
|
104
89
|
end
|
105
90
|
it "should remove gender tokens" do
|
106
|
-
desc = simple.clean_description(jobs[3]
|
91
|
+
desc = simple.clean_description(jobs[3][:description])
|
107
92
|
desc.should_not match(%r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)})
|
108
93
|
end
|
109
94
|
it "should remove job code token" do
|
110
|
-
desc = simple.clean_description(jobs[4]
|
95
|
+
desc = simple.clean_description(jobs[4][:description])
|
111
96
|
desc.should_not match(/\[.*\]|\(.*\)|\{.*\}|\d+\w+/)
|
112
97
|
end
|
113
98
|
it "should be downcased" do
|
114
|
-
desc = simple.clean_description(jobs[2]
|
99
|
+
desc = simple.clean_description(jobs[2][:description])
|
115
100
|
desc.should_not match(/[^a-z öäü]/)
|
116
101
|
end
|
117
102
|
end
|
@@ -125,13 +110,9 @@ describe Preprocessor::Simple do
|
|
125
110
|
FactoryGirl.build(:job_description_w_code_token),
|
126
111
|
FactoryGirl.build(:job_description_w_gender) ]
|
127
112
|
}
|
128
|
-
before(:each) do
|
129
|
-
jobs.each{|e| e.stubs(:classification_id)}
|
130
|
-
jobs.each{|e| e.stubs(:label)}
|
131
|
-
end
|
132
113
|
it "should be the same parallelized" do
|
133
|
-
single = simple.process(jobs
|
134
|
-
p_data = parallel.process(jobs
|
114
|
+
single = simple.process(jobs)
|
115
|
+
p_data = parallel.process(jobs)
|
135
116
|
single.each.with_index { |e,i| e.data.should == p_data[i].data }
|
136
117
|
end
|
137
118
|
end
|
@@ -3,7 +3,7 @@ require "spec_helper"
|
|
3
3
|
describe Selector::NGram do
|
4
4
|
it_behaves_like 'a selector'
|
5
5
|
|
6
|
-
let(:ngram) { Selector::NGram.new(gram_size: 3) }
|
6
|
+
let(:ngram) { Selector::NGram.new(:function, gram_size: 3) }
|
7
7
|
context "#extract_words_from_data" do
|
8
8
|
it "should generate a list of words from the data" do
|
9
9
|
words = ngram.extract_words_from_data(FactoryGirl.build(:data))
|
@@ -2,8 +2,8 @@ require "spec_helper"
|
|
2
2
|
|
3
3
|
describe Selector::Simple do
|
4
4
|
it_behaves_like 'a selector'
|
5
|
-
|
6
|
-
let(:simple) { Selector::Simple.new }
|
5
|
+
|
6
|
+
let(:simple) { Selector::Simple.new(:function) }
|
7
7
|
it "should have select_feature_vector implemented" do
|
8
8
|
expect { simple.generate_vectors([]) }.to_not raise_error
|
9
9
|
end
|
@@ -53,7 +53,8 @@ describe Selector::Simple do
|
|
53
53
|
context "#generate_vector" do
|
54
54
|
let(:dictionary) { %w(auto pferd haus hase garten) }
|
55
55
|
let(:data) { FactoryGirl.build(:data) }
|
56
|
-
let(:
|
56
|
+
let(:simple) { Selector::Simple.new(:career_level) }
|
57
|
+
let(:vector) { simple.generate_vector(data) }
|
57
58
|
|
58
59
|
before(:each) do
|
59
60
|
simple.stubs(:global_dictionary).returns(dictionary)
|
@@ -79,7 +80,7 @@ describe Selector::Simple do
|
|
79
80
|
end
|
80
81
|
context "custom dictionary" do
|
81
82
|
it "should accept a custom dictionary" do
|
82
|
-
vector = simple.generate_vector(data,
|
83
|
+
vector = simple.generate_vector(data, %w(pferd flasche glas))
|
83
84
|
vector.data.should eq([[1,0,0],[0,0,0,0,0,0,1,0]].flatten)
|
84
85
|
end
|
85
86
|
end
|
@@ -106,7 +107,7 @@ describe Selector::Simple do
|
|
106
107
|
simple.generate_vectors(data)
|
107
108
|
end
|
108
109
|
context "parallel" do
|
109
|
-
let(:parallel) { Selector::Simple.new(parallel: true) }
|
110
|
+
let(:parallel) { Selector::Simple.new(:function, parallel: true) }
|
110
111
|
before(:each) do
|
111
112
|
simple.stubs(:global_dictionary).returns(dictionary)
|
112
113
|
parallel.stubs(:global_dictionary).returns(dictionary)
|
@@ -2,11 +2,11 @@ require "spec_helper"
|
|
2
2
|
|
3
3
|
describe Selector::WithBinaryEncoding do
|
4
4
|
it_behaves_like 'a selector'
|
5
|
-
let(:simple) { Selector::WithBinaryEncoding.new }
|
5
|
+
let(:simple) { Selector::WithBinaryEncoding.new(:career_level) }
|
6
6
|
|
7
7
|
let(:dictionary) { %w(auto pferd haus hase garten) }
|
8
8
|
let(:data) { FactoryGirl.build(:data) }
|
9
|
-
let(:vector) { simple.generate_vector(data)
|
9
|
+
let(:vector) { simple.generate_vector(data) }
|
10
10
|
|
11
11
|
before(:each) do
|
12
12
|
simple.stubs(:global_dictionary).returns(dictionary)
|
@@ -32,7 +32,7 @@ describe Selector::WithBinaryEncoding do
|
|
32
32
|
end
|
33
33
|
context "custom dictionary" do
|
34
34
|
it "should accept a custom dictionary" do
|
35
|
-
vector = simple.generate_vector(data,
|
35
|
+
vector = simple.generate_vector(data, %w(pferd flasche glas))
|
36
36
|
vector.data.should eq([[1,0,0],[0,1,1,1]].flatten)
|
37
37
|
end
|
38
38
|
end
|
@@ -1,14 +1,10 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
shared_examples_for 'a preprocessor' do
|
4
|
-
let(:preprocessor) { described_class.new(
|
4
|
+
let(:preprocessor) { described_class.new(1423=>3, 523=>54) }
|
5
5
|
let(:job) { FactoryGirl.build(:job) }
|
6
6
|
let(:jobs) { [job] }
|
7
7
|
|
8
|
-
before(:each) do
|
9
|
-
job.stubs(:classification_id)
|
10
|
-
job.stubs(:label)
|
11
|
-
end
|
12
8
|
it { preprocessor.should respond_to :process }
|
13
9
|
it "should return a PreprocessedData object" do
|
14
10
|
preprocessor.process(job).should be_a(PreprocessedData)
|
metadata
CHANGED
@@ -1,18 +1,20 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: svm_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Andreas Eger
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-15 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: parallel
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
16
18
|
requirements:
|
17
19
|
- - ~>
|
18
20
|
- !ruby/object:Gem::Version
|
@@ -20,6 +22,7 @@ dependencies:
|
|
20
22
|
type: :runtime
|
21
23
|
prerelease: false
|
22
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
23
26
|
requirements:
|
24
27
|
- - ~>
|
25
28
|
- !ruby/object:Gem::Version
|
@@ -47,8 +50,8 @@ files:
|
|
47
50
|
- lib/svm_helper/interface_helper.rb
|
48
51
|
- lib/svm_helper/preprocessed_data.rb
|
49
52
|
- lib/svm_helper/preprocessors.rb
|
53
|
+
- lib/svm_helper/preprocessors/id_mapping.rb
|
50
54
|
- lib/svm_helper/preprocessors/simple.rb
|
51
|
-
- lib/svm_helper/preprocessors/with_industry_map.rb
|
52
55
|
- lib/svm_helper/selectors.rb
|
53
56
|
- lib/svm_helper/selectors/n_gram.rb
|
54
57
|
- lib/svm_helper/selectors/simple.rb
|
@@ -62,8 +65,8 @@ files:
|
|
62
65
|
- spec/factories/jobs/tmp3.html
|
63
66
|
- spec/factories/jobs_with_description.rb
|
64
67
|
- spec/factories/jobs_with_title.rb
|
68
|
+
- spec/preprocessors/id_mapping_spec.rb
|
65
69
|
- spec/preprocessors/simple_spec.rb
|
66
|
-
- spec/preprocessors/with_industry_map_spec.rb
|
67
70
|
- spec/selectors/n_gram_spec.rb
|
68
71
|
- spec/selectors/simple_spec.rb
|
69
72
|
- spec/selectors/with_binary_encoding_spec.rb
|
@@ -73,26 +76,33 @@ files:
|
|
73
76
|
- svm_helper.gemspec
|
74
77
|
homepage: https://github.com/sch1zo/svm_helper
|
75
78
|
licenses: []
|
76
|
-
metadata: {}
|
77
79
|
post_install_message:
|
78
80
|
rdoc_options: []
|
79
81
|
require_paths:
|
80
82
|
- lib
|
81
83
|
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
82
85
|
requirements:
|
83
86
|
- - '>='
|
84
87
|
- !ruby/object:Gem::Version
|
85
88
|
version: '0'
|
89
|
+
segments:
|
90
|
+
- 0
|
91
|
+
hash: 2037039748537332986
|
86
92
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
87
94
|
requirements:
|
88
95
|
- - '>='
|
89
96
|
- !ruby/object:Gem::Version
|
90
97
|
version: '0'
|
98
|
+
segments:
|
99
|
+
- 0
|
100
|
+
hash: 2037039748537332986
|
91
101
|
requirements: []
|
92
102
|
rubyforge_project:
|
93
|
-
rubygems_version:
|
103
|
+
rubygems_version: 1.8.25
|
94
104
|
signing_key:
|
95
|
-
specification_version:
|
105
|
+
specification_version: 3
|
96
106
|
summary: Preprocessor and Selector classes to generate FeatureVectors from Job data
|
97
107
|
test_files:
|
98
108
|
- spec/factories.rb
|
@@ -101,8 +111,8 @@ test_files:
|
|
101
111
|
- spec/factories/jobs/tmp3.html
|
102
112
|
- spec/factories/jobs_with_description.rb
|
103
113
|
- spec/factories/jobs_with_title.rb
|
114
|
+
- spec/preprocessors/id_mapping_spec.rb
|
104
115
|
- spec/preprocessors/simple_spec.rb
|
105
|
-
- spec/preprocessors/with_industry_map_spec.rb
|
106
116
|
- spec/selectors/n_gram_spec.rb
|
107
117
|
- spec/selectors/simple_spec.rb
|
108
118
|
- spec/selectors/with_binary_encoding_spec.rb
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: bbfc13983f715f2f0ab66d2a9dbfed543896329e
|
4
|
-
data.tar.gz: 2e6f5c32898cc01a2468ebaa2e82470e01012586
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: 8c616e62ff4717808e0b9f29e3d6773c03096934194a1fb706e3eb0ded7527c455ea00039e0a4d39fc08e13f735d4f49cd984d26128e624fbe0b956851893c21
|
7
|
-
data.tar.gz: f28756b0f3f9539f69bcda6ce3f79bdf3ef226ee813f0e5ed7b6f38595f1760e05bbd1abe7ec8ccc5c3a8db8b666871a0d33a755c19c4842a85a600e4749498d
|
@@ -1,40 +0,0 @@
|
|
1
|
-
require_relative 'simple'
|
2
|
-
module Preprocessor
|
3
|
-
#
|
4
|
-
# Preprocessor Base Class
|
5
|
-
#
|
6
|
-
# @author Andreas Eger
|
7
|
-
#
|
8
|
-
class WithIndustryMap < Simple
|
9
|
-
attr_reader :industry_map
|
10
|
-
|
11
|
-
#
|
12
|
-
# @param args [Hash] options hash
|
13
|
-
# @option args [Hash] :industry_map mapping for the tree like industry ids to continuous ones
|
14
|
-
def initialize args={}
|
15
|
-
@industry_map = args.fetch(:industry_map){ Hash[Pjpp::Industry.select(:id).all.map(&:id).sort.map.with_index{|e,i| [e,i]}] }
|
16
|
-
end
|
17
|
-
|
18
|
-
def map_industry_id(id)
|
19
|
-
@industry_map[id]
|
20
|
-
end
|
21
|
-
def label
|
22
|
-
"with_industry_map"
|
23
|
-
end
|
24
|
-
|
25
|
-
private
|
26
|
-
def process_job job, classification
|
27
|
-
PreprocessedData.new(
|
28
|
-
data: [ clean_title(job.title), clean_description(job.description) ],
|
29
|
-
ids: {
|
30
|
-
industry: map_industry_id(job.classification_id(:industry)),
|
31
|
-
function: job.classification_id(:function),
|
32
|
-
career_level: job.classification_id(:career_level) },
|
33
|
-
labels: {
|
34
|
-
industry: job.label(:industry),
|
35
|
-
function: job.label(:function),
|
36
|
-
career_level: job.label(:career_level) }
|
37
|
-
).tap{|e| e.send("#{classification}!")}
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Preprocessor::WithIndustryMap do
|
4
|
-
it_behaves_like 'a preprocessor'
|
5
|
-
let(:preprocessor) { Preprocessor::WithIndustryMap.new(industry_map: {1423=>3, 523=>54}) }
|
6
|
-
let(:job) { FactoryGirl.build(:job) }
|
7
|
-
let(:jobs) { [job] }
|
8
|
-
before(:each) do
|
9
|
-
job.stubs(:classification_id)
|
10
|
-
job.stubs(:label)
|
11
|
-
end
|
12
|
-
it "should make use of a industry_map" do
|
13
|
-
preprocessor.expects(:map_industry_id)
|
14
|
-
preprocessor.process(jobs)
|
15
|
-
end
|
16
|
-
end
|