svm_helper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.rspec +3 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +9 -0
  6. data/.versions.conf +4 -0
  7. data/.yardopts +3 -0
  8. data/Gemfile +24 -0
  9. data/Guardfile +17 -0
  10. data/LICENSE.txt +22 -0
  11. data/README.md +41 -0
  12. data/Rakefile +7 -0
  13. data/lib/svm_helper.rb +8 -0
  14. data/lib/svm_helper/feature_vector.rb +17 -0
  15. data/lib/svm_helper/interface_helper.rb +57 -0
  16. data/lib/svm_helper/preprocessed_data.rb +17 -0
  17. data/lib/svm_helper/preprocessors.rb +2 -0
  18. data/lib/svm_helper/preprocessors/simple.rb +111 -0
  19. data/lib/svm_helper/preprocessors/with_industry_map.rb +40 -0
  20. data/lib/svm_helper/selectors.rb +3 -0
  21. data/lib/svm_helper/selectors/n_gram.rb +31 -0
  22. data/lib/svm_helper/selectors/simple.rb +163 -0
  23. data/lib/svm_helper/selectors/with_binary_encoding.rb +42 -0
  24. data/lib/svm_helper/stopwords/de +127 -0
  25. data/lib/svm_helper/stopwords/en +119 -0
  26. data/lib/svm_helper/version.rb +3 -0
  27. data/spec/factories.rb +35 -0
  28. data/spec/factories/jobs/tmp.html +42 -0
  29. data/spec/factories/jobs/tmp2.html +20 -0
  30. data/spec/factories/jobs/tmp3.html +34 -0
  31. data/spec/factories/jobs_with_description.rb +20 -0
  32. data/spec/factories/jobs_with_title.rb +72 -0
  33. data/spec/preprocessors/simple_spec.rb +138 -0
  34. data/spec/preprocessors/with_industry_map_spec.rb +16 -0
  35. data/spec/selectors/n_gram_spec.rb +21 -0
  36. data/spec/selectors/simple_spec.rb +121 -0
  37. data/spec/selectors/with_binary_encoding_spec.rb +39 -0
  38. data/spec/spec_helper.rb +14 -0
  39. data/spec/support/preprocessor_spec.rb +21 -0
  40. data/spec/support/selector_spec.rb +21 -0
  41. data/svm_helper.gemspec +21 -0
  42. metadata +112 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bbfc13983f715f2f0ab66d2a9dbfed543896329e
4
+ data.tar.gz: 2e6f5c32898cc01a2468ebaa2e82470e01012586
5
+ SHA512:
6
+ metadata.gz: 8c616e62ff4717808e0b9f29e3d6773c03096934194a1fb706e3eb0ded7527c455ea00039e0a4d39fc08e13f735d4f49cd984d26128e624fbe0b956851893c21
7
+ data.tar.gz: f28756b0f3f9539f69bcda6ce3f79bdf3ef226ee813f0e5ed7b6f38595f1760e05bbd1abe7ec8ccc5c3a8db8b666871a0d33a755c19c4842a85a600e4749498d
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+
19
+ .rbx
20
+
21
+ .tags
22
+ .tags_sorted_by_file
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --tty
2
+ --color
3
+ --format documentation
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.0.0
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.0.0
4
+ - 1.9.3
5
+ - 1.9.2
6
+ - jruby-19mode
7
+ - rbx-19mode
8
+ - ruby-head
9
+ - jruby-head
data/.versions.conf ADDED
@@ -0,0 +1,4 @@
1
+ ruby=ruby-2.0.0
2
+ ruby-gemset=svm_helper
3
+ #ruby-gem-install=bundler rake
4
+ #ruby-bundle-install=true
data/.yardopts ADDED
@@ -0,0 +1,3 @@
1
+ --no-private
2
+ --markup-provider=kramdown
3
+ --markup=markdown
data/Gemfile ADDED
@@ -0,0 +1,24 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in svm_helper.gemspec
4
+ gemspec
5
+
6
+ group :development do
7
+ gem 'yard'
8
+ gem 'kramdown'
9
+ gem 'github-markup'
10
+
11
+ gem 'pry'
12
+ gem 'guard-rspec'
13
+ gem 'guard-yard'
14
+
15
+ gem 'rb-inotify', '~> 0.9', :require => false
16
+ gem 'rb-fsevent', :require => false
17
+ gem 'rb-fchange', :require => false
18
+ end
19
+
20
+ group :test do
21
+ gem 'rake'
22
+ gem 'mocha', require: 'mocha/api'
23
+ gem 'factory_girl', '~> 4.0'
24
+ end
data/Guardfile ADDED
@@ -0,0 +1,17 @@
1
+ # guard 'rspec', cli: "--color --format p", all_after_pass: false do
2
+ guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
3
+ watch(%r{^spec/.+_spec\.rb$})
4
+ watch(%r{^lib/svm_helper/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
5
+ watch('spec/spec_helper.rb') { 'spec' }
6
+ watch('spec/factories.rb') { 'spec' }
7
+ watch(%r{^spec/factories/(.+)\.rb}) { 'spec' }
8
+ watch(%r{^spec/support/(.+)_spec\.rb}) { |m| "spec/#{m[1]}s/*" }
9
+ end
10
+
11
+ notification :tmux,
12
+ :display_message => true,
13
+ :timeout => 3 # in seconds
14
+
15
+ # guard 'yard' do
16
+ # watch(%r{lib/.+\.rb})
17
+ # end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Andreas Eger
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # SvmHelper
2
+
3
+ Shared helper classes for use with SVMs at Experteer
4
+
5
+ [![Build Status](https://travis-ci.org/sch1zo/svm_helper.png?branch=master)](https://travis-ci.org/sch1zo/svm_helper)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'svm_helper'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install svm_helper
20
+
21
+ ## Usage
22
+
23
+ Dataflow is normally something like this:
24
+
25
+ Job --Preprocessor--> PreprocessedData --Selector--> FeatureVector
26
+
27
+ The FeatureVector can now be used for training or prediction in a (libsvm) SVM.
28
+
29
+ Be aware that a FeatureVector has two attributes:
30
+
31
+ data: the feature array itself
32
+ label: 1 for true, 0 for false
33
+
34
+
35
+ ## Contributing
36
+
37
+ 1. Fork it
38
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
39
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
40
+ 4. Push to the branch (`git push origin my-new-feature`)
41
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core/rake_task'
4
+ RSpec::Core::RakeTask.new('spec')
5
+
6
+ # If you want to make this the default task
7
+ task :default => :spec
data/lib/svm_helper.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "svm_helper/version"
2
+ require 'parallel'
3
+
4
+ require "svm_helper/preprocessed_data"
5
+ require "svm_helper/feature_vector"
6
+ require "svm_helper/preprocessors"
7
+ require "svm_helper/selectors"
8
+
@@ -0,0 +1,17 @@
1
+ require_relative 'interface_helper'
2
#
# FeatureVector interface
#
# Stores +word_data+, +classification_arrays+ and +labels+, and exposes the
# entries that belong to the currently active classification.
#
# @author Andreas Eger
class FeatureVector < InterfaceHelper
  attribute(:word_data)
  attribute(:classification_arrays)
  attribute(:labels)

  # @return [Object] entry of +labels+ stored under the active classification
  def label
    active = classification
    labels[active]
  end

  # @return [Object] +word_data+ with the entry of +classification_arrays+
  #   for the active classification appended via +
  def data
    words  = word_data
    extras = classification_arrays[classification]
    words + extras
  end
end
@@ -0,0 +1,57 @@
1
#
# Base class for small attribute containers. Subclasses declare their
# fields with {attribute}; every instance additionally tracks an active
# classification (+:function+ by default, switchable to +:industry+ or
# +:career_level+).
#
# @abstract Subclass and define attributes
#
# @author Andreas Eger
class InterfaceHelper
  #
  # creates setter/getter similar to attr_accessor
  # @param name [Symbol]
  # @macro [attach] attribute
  #   @method $1
  #   reads $1
  #   @method $1=
  #   saves $1
  def self.attribute(name)
    define_method(name) { @_attributes[name] }
    define_method(:"#{name}=") { |value| @_attributes[name] = value }
    attributes << name unless attributes.include?(name)
  end

  #
  # Names declared via {attribute} on exactly this class (declarations of
  # parent classes are not included).
  #
  # Kept in a class-level instance variable so each class owns its own list.
  #
  # @return [Array<Symbol>]
  def self.attributes
    @_attribute_names ||= []
  end

  #
  # @param params [Hash] attribute name => value; every key must have a
  #   matching setter, otherwise NoMethodError is raised
  def initialize(params = {})
    @_attributes = {}
    params.each do |key, value|
      send("#{key}=", value)
    end
    @_attributes[:classification] ||= :function
  end

  #
  # custom comparator
  #
  # Compares every value assigned on either side (including the
  # classification), so the result does not depend on which object is the
  # receiver. Objects that do not expose one of the compared readers are
  # simply unequal instead of raising.
  #
  # @param anOther [InterfaceHelper]
  #
  # @return [Boolean] result after comparing each attribute
  def ==(anOther)
    names = assigned_attribute_names
    names |= anOther.assigned_attribute_names if anOther.is_a?(InterfaceHelper)
    names.all? do |name|
      respond_to?(name) && anOther.respond_to?(name) &&
        public_send(name) == anOther.public_send(name)
    end
  end

  # switches the active classification to +:industry+
  def industry!
    @_attributes[:classification] = :industry
  end

  # switches the active classification to +:function+
  def function!
    @_attributes[:classification] = :function
  end

  # switches the active classification to +:career_level+
  def career_level!
    @_attributes[:classification] = :career_level
  end

  # @return [Symbol] the active classification
  def classification
    @_attributes[:classification]
  end

  protected

  # @return [Array<Symbol>] keys that currently hold a value on this instance
  def assigned_attribute_names
    @_attributes.keys
  end
end
@@ -0,0 +1,17 @@
1
+ require_relative 'interface_helper'
2
#
# PreprocessedData interface
#
# Stores +data+, +ids+ and +labels+; +ids+ and +labels+ are looked up by the
# currently active classification.
#
# @author Andreas Eger
class PreprocessedData < InterfaceHelper
  attribute(:data)
  attribute(:ids)
  attribute(:labels)

  # @return [Object] entry of +ids+ stored under the active classification
  def id
    active = classification
    ids[active]
  end

  # @return [Object] entry of +labels+ stored under the active classification
  def label
    active = classification
    labels[active]
  end
end
@@ -0,0 +1,2 @@
1
+ require_relative 'preprocessors/simple'
2
+ require_relative 'preprocessors/with_industry_map'
@@ -0,0 +1,111 @@
1
+ # encoding: UTF-8
2
module Preprocessor
  #
  # Preprocessor which just cleans the text
  #
  # @author Andreas Eger
  #
  class Simple
    # worker count for parallel processing (OMP_NUM_THREADS, default 2)
    THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
    # filters most gender stuff
    GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
    # filters most weird symbols
    # NOTE(review): the previous pattern contained an empty alternative
    # (between the bullet and the pipe) which matched at every position and
    # shadowed the pipe/entity alternatives after it. It presumably held a
    # symbol that got lost somewhere - confirm and re-add it if needed.
    SYMBOL_FILTER = %r{/|-|–|:|\+|!|,|\.|\*|\?|·|\"|„|•|\||(\S*(&|;)\S*)}
    # urls and email filter
    URL_FILTER = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/
    EMAIL_FILTER = /([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})/
    # filter for new lines
    NEW_LINES = /(\r\n)|\r|\n/
    # extract words from brackets
    WORDS_IN_BRACKETS = /\(([a-zA-Z]+)\)/
    # filters multiple whitespace
    # NOTE(review): the second alternative may originally have been a
    # non-breaking space - confirm against the real file.
    WHITESPACE = /(\s| )+/
    # filters all kinds of XML/HTML tags
    XML_TAG_FILTER = /<(.*?)>/
    # filter for used job tokens
    CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/

    #
    # @param args [Hash] options hash
    # @option args [Boolean] :parallel (false) process job lists via Parallel
    def initialize(args = {})
      @parallel = args.fetch(:parallel) { false }
    end

    # @return [String] name of this preprocessor
    def label
      "simple"
    end

    #
    # cleans provided jobs
    # @overload process(jobs, classification)
    #   @param jobs [Job] single Job
    #   @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    # @overload process(jobs, classification)
    #   @param jobs [Array<Job>] list of Jobs
    #   @param classification [Symbol] in `:industry`, `:function`, `:career_level`
    #
    # @return [Array<PreprocessedData>] list of processed job data - or single job data
    def process(jobs, classification = :function)
      # anything that can be mapped is treated as a collection of jobs
      if jobs.respond_to?(:map)
        process_jobs(jobs, classification)
      else
        process_job(jobs, classification)
      end
    end

    #
    # converts string into a cleaner version
    # @param title [String, nil] job title (nil is treated as empty)
    #
    # @return [String] clean and lowercase version of input
    def clean_title(title)
      title.to_s
           .gsub(GENDER_FILTER, '')
           .gsub(SYMBOL_FILTER, '')
           .gsub(WORDS_IN_BRACKETS, '\1')
           .gsub(CODE_TOKEN_FILTER, '')
           .gsub(WHITESPACE, ' ')
           .downcase
           .strip
    end

    #
    # converts string into a cleaner version
    # @param desc [String, nil] job description (nil is treated as empty)
    #
    # @return [String] clean and lowercase version of input
    def clean_description(desc)
      desc.to_s
          .gsub(XML_TAG_FILTER, ' ')
          .gsub(EMAIL_FILTER, '')
          .gsub(URL_FILTER, '')
          .gsub(GENDER_FILTER, '')
          .gsub(NEW_LINES, '')
          .gsub(SYMBOL_FILTER, ' ')
          .gsub(WHITESPACE, ' ')
          .gsub(WORDS_IN_BRACKETS, '\1')
          .gsub(CODE_TOKEN_FILTER, '')
          .downcase
          .strip
    end

    private

    # Maps every job through #process_job; with :parallel enabled the work
    # is handed to Parallel (threads on the 'java' platform, processes
    # otherwise).
    def process_jobs(jobs, classification)
      if @parallel && RUBY_PLATFORM == 'java'
        Parallel.map(jobs, in_threads: THREAD_COUNT) { |job| process_job(job, classification) }
      elsif @parallel
        Parallel.map(jobs, in_processes: THREAD_COUNT) { |job| process_job(job, classification) }
      else
        jobs.map { |job| process_job(job, classification) }
      end
    end

    # Builds the PreprocessedData for one job (cleaned title and description
    # plus ids and labels of all three classifications) and activates the
    # requested classification on it.
    def process_job(job, classification)
      PreprocessedData.new(
        data: [clean_title(job.title), clean_description(job.description)],
        ids: {
          industry: job.classification_id(:industry),
          function: job.classification_id(:function),
          career_level: job.classification_id(:career_level)
        },
        labels: {
          industry: job.label(:industry),
          function: job.label(:function),
          career_level: job.label(:career_level)
        }
      ).tap { |e| e.send("#{classification}!") }
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require_relative 'simple'
2
module Preprocessor
  #
  # Preprocessor which cleans the text like {Simple} and additionally
  # translates the industry classification id through {#industry_map}.
  #
  # @author Andreas Eger
  #
  class WithIndustryMap < Simple
    attr_reader :industry_map

    #
    # @param args [Hash] options hash
    # @option args [Hash] :industry_map mapping for the tree like industry ids to continuous ones;
    #   when omitted it is built from the sorted ids of Pjpp::Industry, each
    #   mapped to its position in that sorted list
    # @option args [Boolean] :parallel handled by {Simple#initialize}
    def initialize(args = {})
      super # forwards args so Simple can pick up :parallel
      @industry_map = args.fetch(:industry_map) do
        sorted_ids = Pjpp::Industry.select(:id).all.map(&:id).sort
        Hash[sorted_ids.each_with_index.to_a]
      end
    end

    # @param id [Object] original industry id
    # @return [Object, nil] mapped id, nil when the id is not in the map
    def map_industry_id(id)
      @industry_map[id]
    end

    # @return [String] name of this preprocessor
    def label
      "with_industry_map"
    end

    private

    # Same as Simple#process_job, except that the industry id is passed
    # through #map_industry_id.
    def process_job(job, classification)
      PreprocessedData.new(
        data: [clean_title(job.title), clean_description(job.description)],
        ids: {
          industry: map_industry_id(job.classification_id(:industry)),
          function: job.classification_id(:function),
          career_level: job.classification_id(:career_level)
        },
        labels: {
          industry: job.label(:industry),
          function: job.label(:function),
          career_level: job.label(:career_level)
        }
      ).tap { |e| e.send("#{classification}!") }
    end
  end
end