svm_helper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.rspec +3 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +9 -0
  6. data/.versions.conf +4 -0
  7. data/.yardopts +3 -0
  8. data/Gemfile +24 -0
  9. data/Guardfile +17 -0
  10. data/LICENSE.txt +22 -0
  11. data/README.md +41 -0
  12. data/Rakefile +7 -0
  13. data/lib/svm_helper.rb +8 -0
  14. data/lib/svm_helper/feature_vector.rb +17 -0
  15. data/lib/svm_helper/interface_helper.rb +57 -0
  16. data/lib/svm_helper/preprocessed_data.rb +17 -0
  17. data/lib/svm_helper/preprocessors.rb +2 -0
  18. data/lib/svm_helper/preprocessors/simple.rb +111 -0
  19. data/lib/svm_helper/preprocessors/with_industry_map.rb +40 -0
  20. data/lib/svm_helper/selectors.rb +3 -0
  21. data/lib/svm_helper/selectors/n_gram.rb +31 -0
  22. data/lib/svm_helper/selectors/simple.rb +163 -0
  23. data/lib/svm_helper/selectors/with_binary_encoding.rb +42 -0
  24. data/lib/svm_helper/stopwords/de +127 -0
  25. data/lib/svm_helper/stopwords/en +119 -0
  26. data/lib/svm_helper/version.rb +3 -0
  27. data/spec/factories.rb +35 -0
  28. data/spec/factories/jobs/tmp.html +42 -0
  29. data/spec/factories/jobs/tmp2.html +20 -0
  30. data/spec/factories/jobs/tmp3.html +34 -0
  31. data/spec/factories/jobs_with_description.rb +20 -0
  32. data/spec/factories/jobs_with_title.rb +72 -0
  33. data/spec/preprocessors/simple_spec.rb +138 -0
  34. data/spec/preprocessors/with_industry_map_spec.rb +16 -0
  35. data/spec/selectors/n_gram_spec.rb +21 -0
  36. data/spec/selectors/simple_spec.rb +121 -0
  37. data/spec/selectors/with_binary_encoding_spec.rb +39 -0
  38. data/spec/spec_helper.rb +14 -0
  39. data/spec/support/preprocessor_spec.rb +21 -0
  40. data/spec/support/selector_spec.rb +21 -0
  41. data/svm_helper.gemspec +21 -0
  42. metadata +112 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bbfc13983f715f2f0ab66d2a9dbfed543896329e
4
+ data.tar.gz: 2e6f5c32898cc01a2468ebaa2e82470e01012586
5
+ SHA512:
6
+ metadata.gz: 8c616e62ff4717808e0b9f29e3d6773c03096934194a1fb706e3eb0ded7527c455ea00039e0a4d39fc08e13f735d4f49cd984d26128e624fbe0b956851893c21
7
+ data.tar.gz: f28756b0f3f9539f69bcda6ce3f79bdf3ef226ee813f0e5ed7b6f38595f1760e05bbd1abe7ec8ccc5c3a8db8b666871a0d33a755c19c4842a85a600e4749498d
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+
19
+ .rbx
20
+
21
+ .tags
22
+ .tags_sorted_by_file
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --tty
2
+ --color
3
+ --format documentation
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.0.0
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.0.0
4
+ - 1.9.3
5
+ - 1.9.2
6
+ - jruby-19mode
7
+ - rbx-19mode
8
+ - ruby-head
9
+ - jruby-head
data/.versions.conf ADDED
@@ -0,0 +1,4 @@
1
+ ruby=ruby-2.0.0
2
+ ruby-gemset=svm_helper
3
+ #ruby-gem-install=bundler rake
4
+ #ruby-bundle-install=true
data/.yardopts ADDED
@@ -0,0 +1,3 @@
1
+ --no-private
2
+ --markup-provider=kramdown
3
+ --markup=markdown
data/Gemfile ADDED
@@ -0,0 +1,24 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in svm_helper.gemspec
4
+ gemspec
5
+
6
+ group :development do
7
+ gem 'yard'
8
+ gem 'kramdown'
9
+ gem 'github-markup'
10
+
11
+ gem 'pry'
12
+ gem 'guard-rspec'
13
+ gem 'guard-yard'
14
+
15
+ gem 'rb-inotify', '~> 0.9', :require => false
16
+ gem 'rb-fsevent', :require => false
17
+ gem 'rb-fchange', :require => false
18
+ end
19
+
20
+ group :test do
21
+ gem 'rake'
22
+ gem 'mocha', require: 'mocha/api'
23
+ gem 'factory_girl', '~> 4.0'
24
+ end
data/Guardfile ADDED
@@ -0,0 +1,17 @@
1
+ # guard 'rspec', cli: "--color --format p", all_after_pass: false do
2
+ guard 'rspec', cli: "--color --format p", all_after_pass: false, rvm:['2.0.0@svm_helper', 'jruby@svm_helper'] do
3
+ watch(%r{^spec/.+_spec\.rb$})
4
+ watch(%r{^lib/svm_helper/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
5
+ watch('spec/spec_helper.rb') { 'spec' }
6
+ watch('spec/factories.rb') { 'spec' }
7
+ watch(%r{^spec/factories/(.+)\.rb}) { 'spec' }
8
+ watch(%r{^spec/support/(.+)_spec\.rb}) { |m| "spec/#{m[1]}s/*" }
9
+ end
10
+
11
+ notification :tmux,
12
+ :display_message => true,
13
+ :timeout => 3 # in seconds
14
+
15
+ # guard 'yard' do
16
+ # watch(%r{lib/.+\.rb})
17
+ # end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Andreas Eger
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # SvmHelper
2
+
3
+ Shared helper classes for use with SVMs at Experteer
4
+
5
+ [![Build Status](https://travis-ci.org/sch1zo/svm_helper.png?branch=master)](https://travis-ci.org/sch1zo/svm_helper)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'svm_helper'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install svm_helper
20
+
21
+ ## Usage
22
+
23
+ Dataflow is normally something like this:
24
+
25
+ Job --Preprocessor--> PreprocessedData --Selector--> FeatureVector
26
+
27
+ The FeatureVector can now be used for training or prediction in a (libsvm) SVM.
28
+
29
+ Be aware that a FeatureVector has two attributes:
30
+
31
+ data: the feature array itself
32
+ label: 1 for true, 0 for false
33
+
34
+
35
+ ## Contributing
36
+
37
+ 1. Fork it
38
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
39
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
40
+ 4. Push to the branch (`git push origin my-new-feature`)
41
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core/rake_task'
4
+ RSpec::Core::RakeTask.new('spec')
5
+
6
+ # If you want to make this the default task
7
+ task :default => :spec
data/lib/svm_helper.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "svm_helper/version"
2
+ require 'parallel'
3
+
4
+ require "svm_helper/preprocessed_data"
5
+ require "svm_helper/feature_vector"
6
+ require "svm_helper/preprocessors"
7
+ require "svm_helper/selectors"
8
+
@@ -0,0 +1,17 @@
1
+ require_relative 'interface_helper'
2
+ #
3
+ # FeatureVector interface
4
+ #
5
+ # @author Andreas Eger
6
class FeatureVector < InterfaceHelper
  # first part of the vector returned by #data
  attribute :word_data
  # collection indexed by the active classification; the selected entry is appended in #data
  attribute :classification_arrays
  # collection indexed by the active classification; the selected entry is returned by #label
  attribute :labels

  # @return the entry of `labels` stored under the currently selected classification
  def label
    labels[classification]
  end

  # @return `word_data` followed by the entry of `classification_arrays`
  #   stored under the currently selected classification
  def data
    selected = classification_arrays[classification]
    word_data + selected
  end
end
@@ -0,0 +1,57 @@
1
+ #
2
+ # @abstract Subclass and define attributes
3
+ #
4
+ # @author Andreas Eger
5
+ class InterfaceHelper
6
+ @@_attributes = Hash.new { |hash, key| hash[key] = [] }
7
+
8
+ #
9
+ # creates setter/getter similar to attr_accesor
10
+ # @param name [Symbol]
11
+ # @macro [attach] attribute
12
+ # @method $1
13
+ # reads $1
14
+ # @method $1=
15
+ # saves $1
16
+ def self.attribute name
17
+ define_method(name) do
18
+ @_attributes[name]
19
+ end
20
+ define_method(:"#{name}=") do |v|
21
+ @_attributes[name] = v
22
+ end
23
+ attributes << name unless attributes.include? name
24
+ end
25
+ def self.attributes
26
+ @@_attributes[self]
27
+ end
28
+ def initialize(params={})
29
+ @_attributes = {}
30
+ params.each do |key, value|
31
+ send("#{key}=", value)
32
+ end
33
+ @_attributes[:classification] ||= :function
34
+ end
35
+
36
+ #
37
+ # custom comperator
38
+ # @param anOther [InterfaceHelper]
39
+ #
40
+ # @return [Boolean] result after comparing each attribute
41
+ def == anOther
42
+ @_attributes.keys.map{ |sym| self.send(sym) == anOther.send(sym)}.reduce(true){|a,e| a && e }
43
+ end
44
+
45
+ def industry!
46
+ @_attributes[:classification] = :industry
47
+ end
48
+ def function!
49
+ @_attributes[:classification] = :function
50
+ end
51
+ def career_level!
52
+ @_attributes[:classification] = :career_level
53
+ end
54
+ def classification
55
+ @_attributes[:classification]
56
+ end
57
+ end
@@ -0,0 +1,17 @@
1
+ require_relative 'interface_helper'
2
+ #
3
+ # PreprocessedData interface
4
+ #
5
+ # @author Andreas Eger
6
class PreprocessedData < InterfaceHelper
  # payload produced by a preprocessor
  attribute :data
  # collection indexed by the active classification; the selected entry is returned by #id
  attribute :ids
  # collection indexed by the active classification; the selected entry is returned by #label
  attribute :labels

  # @return the entry of `ids` stored under the currently selected classification
  def id
    active = classification
    ids[active]
  end

  # @return the entry of `labels` stored under the currently selected classification
  def label
    active = classification
    labels[active]
  end
end
@@ -0,0 +1,2 @@
1
+ require_relative 'preprocessors/simple'
2
+ require_relative 'preprocessors/with_industry_map'
@@ -0,0 +1,111 @@
1
+ # encoding: UTF-8
2
+ module Preprocessor
3
+ #
4
+ # Preprocessor which just cleans the text
5
+ #
6
+ # @author Andreas Eger
7
+ #
8
+ class Simple
9
+ THREAD_COUNT = (ENV['OMP_NUM_THREADS'] || 2).to_i
10
+ # filters most gender stuff
11
+ GENDER_FILTER = %r{(\(*(m|w)(\/|\|)(w|m)\)*)|(/-*in)|\(in\)}
12
+ # filters most weird symbols
13
+ SYMBOL_FILTER = %r{/|-|–|:|\+|!|,|\.|\*|\?|/|·|\"|„|•||\||(\S*(&|;)\S*)}
14
+ # urls and email filter
15
+ URL_FILTER = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/
16
+ EMAIL_FILTER = /([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})/
17
+ # filter for new lines
18
+ NEW_LINES = /(\r\n)|\r|\n/
19
+ # extract words from brackets
20
+ WORDS_IN_BRACKETS = /\(([a-zA-Z]+)\)/
21
+ # filters multiple whitespace characters
22
+ WHITESPACE = /(\s| )+/
23
+ # filters all kinds of XML/HTML tags
24
+ XML_TAG_FILTER = /<(.*?)>/
25
+ # filter for used job tokens
26
+ CODE_TOKEN_FILTER = /\[[^\]]*\]|\([^\)]*\)|\{[^\}]*\}|\S*\d+\w+/
27
+
28
def initialize args={}
  # falls back to false only when the :parallel key is absent
  @parallel = args.fetch(:parallel, false)
end
31
+
32
# @return [String] fixed name of this preprocessor
def label
  'simple'
end
35
+ #
36
+ # cleans provided jobs
37
+ # @overload process(jobs, classification)
38
+ # @param jobs [Job] single Job
39
+ # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
40
+ # @overload process(jobs, classification)
41
+ # @param jobs [Array<Job>] list of Jobs
42
+ # @param classification [Symbol] in `:industry`, `:function`, `:career_level`
43
+ #
44
+ # @return [Array<PreprocessedData>] list of processed job data - or a single PreprocessedData if a single job was given
45
def process jobs, classification=:function
  # anything that responds to #map is handled as a list of jobs
  return process_jobs(jobs, classification) if jobs.respond_to?(:map)

  # everything else is handled as one single job
  process_job(jobs, classification)
end
52
+
53
+ #
54
+ # converts string into a cleaner version
55
+ # @param title [String] job title
56
+ #
57
+ # @return [String] clean and lowercase version of input
58
def clean_title title
  # substitutions are applied strictly in this order
  substitutions = [
    [GENDER_FILTER, ''],
    [SYMBOL_FILTER, ''],
    [WORDS_IN_BRACKETS, '\1'],
    [CODE_TOKEN_FILTER, ''],
    [WHITESPACE, ' ']
  ]
  cleaned = substitutions.reduce(title) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
  cleaned.downcase.strip
end
67
+ #
68
+ # converts string into a cleaner version
69
+ # @param desc [String] job description
70
+ #
71
+ # @return [String] clean and lowercase version of input
72
def clean_description desc
  # markup, e-mail addresses, urls and gender markers are removed first
  text = desc.gsub(XML_TAG_FILTER, ' ').
              gsub(EMAIL_FILTER, '').
              gsub(URL_FILTER, '').
              gsub(GENDER_FILTER, '')
  # line breaks become a blank (not an empty string) so that the last word of a
  # line is not glued to the first word of the next one
  text = text.gsub(NEW_LINES, ' ').
              gsub(SYMBOL_FILTER, ' ').
              gsub(WHITESPACE, ' ')
  text = text.gsub(WORDS_IN_BRACKETS, '\1').
              gsub(CODE_TOKEN_FILTER, '')
  # removing tokens can leave two blanks next to each other, so whitespace is
  # collapsed once more - the result then has the same shape as clean_title's
  text.gsub(WHITESPACE, ' ').downcase.strip
end
85
+
86
+ private
87
def process_jobs jobs, classification
  worker = proc { |job| process_job job, classification }
  # sequential unless parallel processing was requested
  return jobs.map(&worker) unless @parallel

  # threads on the java platform, processes everywhere else
  mode = RUBY_PLATFORM == 'java' ? :in_threads : :in_processes
  Parallel.map(jobs, { mode => THREAD_COUNT }, &worker)
end
96
+
97
def process_job job, classification
  kinds = [:industry, :function, :career_level]
  cleaned = [clean_title(job.title), clean_description(job.description)]
  # ids and labels are looked up for every kind, not only for the requested one
  ids = Hash[kinds.map { |kind| [kind, job.classification_id(kind)] }]
  labels = Hash[kinds.map { |kind| [kind, job.label(kind)] }]

  record = PreprocessedData.new(data: cleaned, ids: ids, labels: labels)
  # activates the requested classification via industry!/function!/career_level!
  record.send("#{classification}!")
  record
end
110
+ end
111
+ end
@@ -0,0 +1,40 @@
1
+ require_relative 'simple'
2
module Preprocessor
  #
  # Preprocessor which cleans the text like Simple and additionally
  # translates the industry id of a job through a lookup table
  #
  # @author Andreas Eger
  #
  class WithIndustryMap < Simple
    attr_reader :industry_map

    #
    # @param args [Hash] options hash
    # @option args [Hash] :industry_map mapping for the tree like industry ids to continuous ones
    # @option args [Boolean] :parallel passed on to Simple#initialize
    def initialize args={}
      # hand the options to Simple as well; without this call :parallel was silently ignored
      super
      @industry_map = args.fetch(:industry_map) { default_industry_map }
    end

    #
    # @param id industry id as returned by the job
    #
    # @return the mapped id, nil if the id is not part of the map
    def map_industry_id(id)
      @industry_map[id]
    end

    # @return [String] fixed name of this preprocessor
    def label
      "with_industry_map"
    end

    private

    # builds the map used when no :industry_map option was given:
    # every id returned by Pjpp::Industry, sorted, paired with its position
    # NOTE(review): Pjpp::Industry is not defined in this gem - it has to be provided by the host application
    def default_industry_map
      Hash[Pjpp::Industry.select(:id).all.map(&:id).sort.each_with_index.to_a]
    end

    # same as Simple#process_job, except that the industry id is run through #map_industry_id
    def process_job job, classification
      PreprocessedData.new(
        data: [ clean_title(job.title), clean_description(job.description) ],
        ids: {
          industry: map_industry_id(job.classification_id(:industry)),
          function: job.classification_id(:function),
          career_level: job.classification_id(:career_level) },
        labels: {
          industry: job.label(:industry),
          function: job.label(:function),
          career_level: job.label(:career_level) }
      ).tap{|e| e.send("#{classification}!")}
    end
  end
end