svm_helper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.rspec +3 -0
- data/.ruby-version +1 -0
- data/.travis.yml +9 -0
- data/.versions.conf +4 -0
- data/.yardopts +3 -0
- data/Gemfile +24 -0
- data/Guardfile +17 -0
- data/LICENSE.txt +22 -0
- data/README.md +41 -0
- data/Rakefile +7 -0
- data/lib/svm_helper.rb +8 -0
- data/lib/svm_helper/feature_vector.rb +17 -0
- data/lib/svm_helper/interface_helper.rb +57 -0
- data/lib/svm_helper/preprocessed_data.rb +17 -0
- data/lib/svm_helper/preprocessors.rb +2 -0
- data/lib/svm_helper/preprocessors/simple.rb +111 -0
- data/lib/svm_helper/preprocessors/with_industry_map.rb +40 -0
- data/lib/svm_helper/selectors.rb +3 -0
- data/lib/svm_helper/selectors/n_gram.rb +31 -0
- data/lib/svm_helper/selectors/simple.rb +163 -0
- data/lib/svm_helper/selectors/with_binary_encoding.rb +42 -0
- data/lib/svm_helper/stopwords/de +127 -0
- data/lib/svm_helper/stopwords/en +119 -0
- data/lib/svm_helper/version.rb +3 -0
- data/spec/factories.rb +35 -0
- data/spec/factories/jobs/tmp.html +42 -0
- data/spec/factories/jobs/tmp2.html +20 -0
- data/spec/factories/jobs/tmp3.html +34 -0
- data/spec/factories/jobs_with_description.rb +20 -0
- data/spec/factories/jobs_with_title.rb +72 -0
- data/spec/preprocessors/simple_spec.rb +138 -0
- data/spec/preprocessors/with_industry_map_spec.rb +16 -0
- data/spec/selectors/n_gram_spec.rb +21 -0
- data/spec/selectors/simple_spec.rb +121 -0
- data/spec/selectors/with_binary_encoding_spec.rb +39 -0
- data/spec/spec_helper.rb +14 -0
- data/spec/support/preprocessor_spec.rb +21 -0
- data/spec/support/selector_spec.rb +21 -0
- data/svm_helper.gemspec +21 -0
- metadata +112 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: bbfc13983f715f2f0ab66d2a9dbfed543896329e
|
4
|
+
data.tar.gz: 2e6f5c32898cc01a2468ebaa2e82470e01012586
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8c616e62ff4717808e0b9f29e3d6773c03096934194a1fb706e3eb0ded7527c455ea00039e0a4d39fc08e13f735d4f49cd984d26128e624fbe0b956851893c21
|
7
|
+
data.tar.gz: f28756b0f3f9539f69bcda6ce3f79bdf3ef226ee813f0e5ed7b6f38595f1760e05bbd1abe7ec8ccc5c3a8db8b666871a0d33a755c19c4842a85a600e4749498d
|
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
|
19
|
+
.rbx
|
20
|
+
|
21
|
+
.tags
|
22
|
+
.tags_sorted_by_file
|
data/.rspec
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.0.0
|
data/.travis.yml
ADDED
data/.versions.conf
ADDED
data/.yardopts
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in svm_helper.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
group :development do
|
7
|
+
gem 'yard'
|
8
|
+
gem 'kramdown'
|
9
|
+
gem 'github-markup'
|
10
|
+
|
11
|
+
gem 'pry'
|
12
|
+
gem 'guard-rspec'
|
13
|
+
gem 'guard-yard'
|
14
|
+
|
15
|
+
gem 'rb-inotify', '~> 0.9', :require => false
|
16
|
+
gem 'rb-fsevent', :require => false
|
17
|
+
gem 'rb-fchange', :require => false
|
18
|
+
end
|
19
|
+
|
20
|
+
group :test do
|
21
|
+
gem 'rake'
|
22
|
+
gem 'mocha', require: 'mocha/api'
|
23
|
+
gem 'factory_girl', '~> 4.0'
|
24
|
+
end
|
data/Guardfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# guard 'rspec', cli: "--color --format p", all_after_pass: false do
|
2
|
+
guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
|
3
|
+
watch(%r{^spec/.+_spec\.rb$})
|
4
|
+
watch(%r{^lib/svm_helper/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
|
5
|
+
watch('spec/spec_helper.rb') { 'spec' }
|
6
|
+
watch('spec/factories.rb') { 'spec' }
|
7
|
+
watch(%r{^spec/factories/(.+)\.rb}) { 'spec' }
|
8
|
+
watch(%r{^spec/support/(.+)_spec\.rb}) { |m| "spec/#{m[1]}s/*" }
|
9
|
+
end
|
10
|
+
|
11
|
+
notification :tmux,
|
12
|
+
:display_message => true,
|
13
|
+
:timeout => 3 # in seconds
|
14
|
+
|
15
|
+
# guard 'yard' do
|
16
|
+
# watch(%r{lib/.+\.rb})
|
17
|
+
# end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Andreas Eger
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# SvmHelper
|
2
|
+
|
3
|
+
Shared helper classes for working with SVMs at Experteer
|
4
|
+
|
5
|
+
[![Build Status](https://travis-ci.org/sch1zo/svm_helper.png?branch=master)](https://travis-ci.org/sch1zo/svm_helper)
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'svm_helper'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install svm_helper
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
Dataflow is normally something like this:
|
24
|
+
|
25
|
+
Job --Preprocessor--> PreprocessedData --Selector--> FeatureVector
|
26
|
+
|
27
|
+
The FeatureVector can now be used for training or prediction in a (libsvm) SVM.
|
28
|
+
|
29
|
+
Be aware that a FeatureVector has two Attributes:
|
30
|
+
|
31
|
+
data: the feature array itself
|
32
|
+
label: 1 for true, 0 for false
|
33
|
+
|
34
|
+
|
35
|
+
## Contributing
|
36
|
+
|
37
|
+
1. Fork it
|
38
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
39
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
40
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
41
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/lib/svm_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require_relative 'interface_helper'
|
2
|
+
#
|
3
|
+
# FeatureVector interface
|
4
|
+
#
|
5
|
+
# @author Andreas Eger
|
6
|
+
class FeatureVector < InterfaceHelper
  attribute :word_data
  attribute :classification_arrays
  attribute :labels

  # @return [Object] the entry of +labels+ stored under the currently
  #   selected classification
  def label
    selected = classification
    labels[selected]
  end

  # @return [Object] +word_data+ followed by the entry of
  #   +classification_arrays+ for the currently selected classification
  def data
    words = word_data
    words + classification_arrays[classification]
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
#
|
2
|
+
# @abstract Subclass and define attributes
|
3
|
+
#
|
4
|
+
# @author Andreas Eger
|
5
|
+
class InterfaceHelper
  # Declared attribute names, keyed by the class that declared them.
  @@_attributes = Hash.new { |hash, key| hash[key] = [] }

  #
  # creates setter/getter similar to attr_accessor
  # @param name [Symbol]
  # @macro [attach] attribute
  #   @method $1
  #   reads $1
  #   @method $1=
  #   saves $1
  def self.attribute(name)
    define_method(name) do
      @_attributes[name]
    end
    define_method(:"#{name}=") do |v|
      @_attributes[name] = v
    end
    attributes << name unless attributes.include?(name)
  end

  # @return [Array<Symbol>] attribute names declared on this class
  def self.attributes
    @@_attributes[self]
  end

  #
  # @param params [Hash] attribute name => value; each pair is assigned
  #   through the matching setter, so a name without a setter raises
  #   NoMethodError
  def initialize(params = {})
    @_attributes = {}
    params.each do |key, value|
      send("#{key}=", value)
    end
    # the classification defaults to :function
    @_attributes[:classification] ||= :function
  end

  #
  # custom comparator
  # @param anOther [Object] object to compare with
  #
  # @return [Boolean] true when anOther answers every attribute stored on
  #   the receiver (including the classification) with an equal value;
  #   false when it differs or does not respond to one of them
  def ==(anOther)
    @_attributes.keys.all? do |sym|
      # respond_to? guard: nil or unrelated objects compare as unequal
      # instead of raising NoMethodError
      anOther.respond_to?(sym) && send(sym) == anOther.send(sym)
    end
  end

  # selects :industry as the current classification
  def industry!
    @_attributes[:classification] = :industry
  end

  # selects :function as the current classification
  def function!
    @_attributes[:classification] = :function
  end

  # selects :career_level as the current classification
  def career_level!
    @_attributes[:classification] = :career_level
  end

  # @return [Symbol] the currently selected classification
  def classification
    @_attributes[:classification]
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require_relative 'interface_helper'
|
2
|
+
#
|
3
|
+
# PreprocessedData interface
|
4
|
+
#
|
5
|
+
# @author Andreas Eger
|
6
|
+
class PreprocessedData < InterfaceHelper
  attribute :data
  attribute :ids
  attribute :labels

  # @return [Object] the entry of +ids+ stored under the currently
  #   selected classification
  def id
    selected = classification
    ids[selected]
  end

  # @return [Object] the entry of +labels+ stored under the currently
  #   selected classification
  def label
    selected = classification
    labels[selected]
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Preprocessor
|
3
|
+
#
|
4
|
+
# Preprocessor which just cleans the text
|
5
|
+
#
|
6
|
+
# @author Andreas Eger
|
7
|
+
#
|
8
|
+
class Simple
|
9
|
+
THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
|
10
|
+
# filters most gender stuff
|
11
|
+
GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
|
12
|
+
# filters most weird symbols
|
13
|
+
SYMBOL_FILTER = %r{/|-|–|:|\+|!|,|\.|\*|\?|/|·|\"|„|•||\||(\S*(&|;)\S*)}
|
14
|
+
# urls and email filter
|
15
|
+
URL_FILTER = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/
|
16
|
+
EMAIL_FILTER = /([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})/
|
17
|
+
# filter for new lines
|
18
|
+
NEW_LINES = /(\r\n)|\r|\n/
|
19
|
+
# extract words from brackets
|
20
|
+
WORDS_IN_BRACKETS = /\(([a-zA-Z]+)\)/
|
21
|
+
# filters multiple whitespace
|
22
|
+
WHITESPACE = /(\s| )+/
|
23
|
+
# filters all kinds of XML/HTML tags
|
24
|
+
XML_TAG_FILTER = /<(.*?)>/
|
25
|
+
# filter for used job tokens
|
26
|
+
CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
|
27
|
+
|
28
|
+
# @param args [Hash] options hash
# @option args [Boolean] :parallel (false) stored in @parallel, which
#   process_jobs checks before using Parallel.map
def initialize(args = {})
  @parallel = args.fetch(:parallel, false)
end
|
31
|
+
|
32
|
+
# @return [String] fixed name of this preprocessor
def label
  'simple'
end
|
35
|
+
#
|
36
|
+
# cleans provided jobs
|
37
|
+
# @overload process(jobs, classification)
|
38
|
+
# @param jobs [Job] single Job
|
39
|
+
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
40
|
+
# @overload process(jobs, classification)
|
41
|
+
# @param jobs [Array<Job>] list of Jobs
|
42
|
+
# @param classification [Symbol] in `:industry`, `:function`, `:career_level`
|
43
|
+
#
|
44
|
+
# @return [Array<PreprocessedData>] list of processed job data - or single job data
|
45
|
+
def process(jobs, classification = :function)
  # anything that responds to #map is handled as a list of jobs
  return process_jobs(jobs, classification) if jobs.respond_to?(:map)

  process_job(jobs, classification)
end
|
52
|
+
|
53
|
+
#
|
54
|
+
# converts string into a cleaner version
|
55
|
+
# @param title [String] job title
|
56
|
+
#
|
57
|
+
# @return [String] clean and lowercase version of input
|
58
|
+
def clean_title(title)
  # gender markers and symbols are dropped without a replacement
  without_noise = title.gsub(GENDER_FILTER, '').gsub(SYMBOL_FILTER, '')
  # bracketed words are kept, remaining bracket/code tokens are removed
  without_tokens = without_noise.gsub(WORDS_IN_BRACKETS, '\1').gsub(CODE_TOKEN_FILTER, '')
  without_tokens.gsub(WHITESPACE, ' ').downcase.strip
end
|
67
|
+
#
|
68
|
+
# converts string into a cleaner version
|
69
|
+
# @param desc [String] job description
|
70
|
+
#
|
71
|
+
# @return [String] clean and lowercase version of input
|
72
|
+
def clean_description(desc)
  cleaned = desc.gsub(XML_TAG_FILTER, ' ')
                .gsub(EMAIL_FILTER, '')
                .gsub(URL_FILTER, '')
                .gsub(GENDER_FILTER, '')
                .gsub(NEW_LINES, '')
                .gsub(SYMBOL_FILTER, ' ')
                .gsub(WHITESPACE, ' ')
                .gsub(WORDS_IN_BRACKETS, '\1')
                .gsub(CODE_TOKEN_FILTER, '')
  # Removing bracket/code tokens after the first whitespace pass can leave
  # runs of spaces behind; collapse once more so the result is single
  # spaced, as in clean_title.
  cleaned.gsub(WHITESPACE, ' ').downcase.strip
end
|
85
|
+
|
86
|
+
private
|
87
|
+
# Runs process_job for every job. When @parallel is set the work goes
# through Parallel.map with THREAD_COUNT workers: threads when
# RUBY_PLATFORM is 'java', processes otherwise.
def process_jobs(jobs, classification)
  return jobs.map { |job| process_job(job, classification) } unless @parallel

  worker_kind = RUBY_PLATFORM == 'java' ? :in_threads : :in_processes
  Parallel.map(jobs, worker_kind => THREAD_COUNT) { |job| process_job(job, classification) }
end
|
96
|
+
|
97
|
+
# Builds the PreprocessedData for one job: cleaned title and description,
# the job's ids and labels for all three classifications, with the given
# classification selected on the result.
def process_job(job, classification)
  texts = [clean_title(job.title), clean_description(job.description)]
  id_by_kind = {
    industry: job.classification_id(:industry),
    function: job.classification_id(:function),
    career_level: job.classification_id(:career_level)
  }
  label_by_kind = {
    industry: job.label(:industry),
    function: job.label(:function),
    career_level: job.label(:career_level)
  }

  record = PreprocessedData.new(data: texts, ids: id_by_kind, labels: label_by_kind)
  # calls industry!, function! or career_level! on the new record
  record.send("#{classification}!")
  record
end
|
110
|
+
end
|
111
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require_relative 'simple'
|
2
|
+
module Preprocessor
  #
  # Preprocessor which, in addition to the cleaning done by Simple,
  # translates the industry id of every job through a lookup table
  #
  # @author Andreas Eger
  #
  class WithIndustryMap < Simple
    attr_reader :industry_map

    #
    # @param args [Hash] options hash
    # @option args [Hash] :industry_map mapping for the tree like industry ids to continuous ones;
    #   when omitted it is built from the sorted ids of Pjpp::Industry, each mapped to its index
    # @option args [Boolean] :parallel handled by Simple#initialize
    def initialize(args = {})
      # Simple#initialize sets @parallel; without this call the :parallel
      # option was silently ignored by this subclass
      super
      @industry_map = args.fetch(:industry_map) do
        Hash[Pjpp::Industry.select(:id).all.map(&:id).sort.map.with_index { |e, i| [e, i] }]
      end
    end

    # @param id [Object] industry id as stored on the job
    # @return [Object, nil] the entry of the industry map for that id
    def map_industry_id(id)
      @industry_map[id]
    end

    # @return [String] fixed name of this preprocessor
    def label
      "with_industry_map"
    end

    private

    # Same as Simple#process_job, except that the industry id is passed
    # through map_industry_id before it is stored.
    def process_job(job, classification)
      PreprocessedData.new(
        data: [clean_title(job.title), clean_description(job.description)],
        ids: {
          industry: map_industry_id(job.classification_id(:industry)),
          function: job.classification_id(:function),
          career_level: job.classification_id(:career_level)
        },
        labels: {
          industry: job.label(:industry),
          function: job.label(:function),
          career_level: job.label(:career_level)
        }
      ).tap { |e| e.send("#{classification}!") }
    end
  end
end
|