jobparser 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/jobparser.rb ADDED
@@ -0,0 +1,9 @@
1
+ require "jobparser/version"
2
+ require "jobparser/parsehtml"
3
+ require "jobparser/parseurl"
4
+ require "jobparser/cleaner"
5
+ require "jobparser/scorer"
6
+
7
+ module JobParser
8
+ # Namespace for the gem; its classes and constants are defined in the files required above.
9
+ end
@@ -0,0 +1,63 @@
1
+ # encoding: UTF-8
2
+ require "nokogiri"
3
+ require "jobparser/regex"
4
+
5
require "uri"

module JobParser
  # Normalises strings scraped from job pages: strips whitespace noise,
  # extracts numeric salary ranges and turns relative links into absolute ones.
  class Cleaner
    # Captures the two amounts of a "£low ... £high" range (after commas and
    # whitespace have been removed from the input).
    SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
    # Characters removed from a salary string before the amounts are captured.
    CLEAN_SALARY_REGEX = /,|\s/
    # Non-breaking space (U+00A0). Written as a literal so that loading this
    # file does not have to parse an HTML document to obtain one character.
    NBSP = "\u00A0".freeze
    # An href that starts with a URI scheme ("http:", "https:", "mailto:", ...).
    ABSOLUTE_HREF_REGEX = /\A[a-z][a-z0-9+.\-]*:/i

    # ary  - the collection of scraped values to clean.
    # opts - optional hash; :type is stored but not used by this class.
    def initialize(ary, opts = {})
      @subject = ary
      @type = opts[:type]
    end

    # Returns the unique, cleaned, non-empty strings of the subject.
    def clean
      clean_array
    end

    # Extracts a salary range from a string such as "£20,000 - £30,000".
    #
    # Returns [low, high] as Integers, or nil when the string does not contain
    # two "£" amounts.
    def self.clean_salary(salary_str)
      match = SALARY_GROUP_REGEX.match(salary_str.gsub(CLEAN_SALARY_REGEX, ""))
      match && [match[1].to_i, match[2].to_i]
    end

    # Removes newlines and non-breaking spaces, then trims surrounding
    # whitespace.
    def self.strip_string(str)
      # The newline must be the escape "\n"; the literal '/n' removed the
      # two-character sequence slash + n from ordinary text instead.
      str.gsub("\n", "").gsub(NBSP, "").strip
    end

    # Resolves +href+ against the scheme, host and port of +url+.
    #
    # An href that already starts with a URI scheme is returned untouched.
    # Anything else is appended to the root of the site, as before; it is not
    # resolved relative to the path of +url+.
    def self.make_link_absolute(url, href)
      # Anchor the check to the start: a relative path that merely contains
      # "http" (e.g. "/jobs/http-developer") is not an absolute link.
      return href if href =~ ABSOLUTE_HREF_REGEX

      uri = URI.parse(url)
      base = "#{uri.scheme}://#{uri.host}"
      # Keep an explicit, non-default port so links on e.g. :3000 stay valid.
      base << ":#{uri.port}" if uri.port && uri.port != uri.default_port

      href.start_with?("/") ? base + href : "#{base}/#{href}"
    end

    private

    # Keeps only String items, cleans each one, drops those that end up empty
    # and removes duplicates. Filtering happens after cleaning so that values
    # consisting solely of whitespace or non-breaking spaces are discarded too.
    def clean_array
      strings = @subject.select { |item| item.is_a?(String) }
      cleaned = strings.map { |item| clean_string(item) }
      cleaned.select { |item| not_whitespace_or_empty(item) }.uniq
    end

    def clean_string(str)
      self.class.strip_string(str)
    end

    # True when +item+ contains at least one non-whitespace character.
    # Uses \A and \z so the whole string is tested; ^ and $ match per line and
    # would reject a multi-line value that merely contains one blank line.
    def not_whitespace_or_empty(item)
      item !~ /\A\s*\z/
    end
  end
end
@@ -0,0 +1,145 @@
1
+ # encoding: utf-8
2
+ require 'jobparser/regex.rb'
3
+ require "nokogiri"
4
module JobParser
  # Extracts job details (title, salary, location, apply link) from the HTML
  # of a single job page.
  class ParseHtml
    # Tag names whose elements are inspected when looking for job details.
    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}

    attr_reader :doc

    # html     - the page markup as a String.
    # from_url - the URL the markup was fetched from; returned as :url and
    #            used as the base for the apply link.
    def initialize(html, from_url)
      @url = from_url
      @doc = Nokogiri::HTML(html)
      # Turn <br> into real newlines so line-anchored regexes see line breaks.
      @doc.css("br").each { |br| br.replace "\n" }
      @plain_text = strip_html
    end

    # Returns a Hash with the keys :url, :salary, :title, :apply,
    # :salary_string and :location.
    def job
      { :url => @url,
        :salary => job_salary,
        :title => job_title,
        :apply => apply_link,
        :salary_string => job_salary_string,
        :location => job_location
      }
    end

    private

    # Text following "location: " on a line of the page text, or "" when the
    # page text has no such line.
    def job_location
      LOCATION_REGEX.match(@plain_text.gsub(/\r|\t/, "")) { |m|
        Cleaner.strip_string(m[1].to_s)
      } || ""
    end

    # Concatenated text of the document, excluding text that sits directly
    # inside <title>, <script>, <style> and <button> elements.
    def strip_html
      blacklist = ['title', 'script', 'style', 'button']
      text_nodes = @doc.search('//text()')
      blacklist.each do |tag|
        text_nodes -= @doc.search('//' + tag + '/text()')
      end
      text_nodes.text
    end

    # Yields (tag_name, element) for every accepted element in the document.
    def loop_over_elements(&block)
      elements.each do |name, elems|
        elems.each do |elem|
          yield name, elem
        end
      end
    end

    def clean_array(ary, type = nil)
      Cleaner.new(ary, :type => type).clean
    end

    # The salary text as it appears on the page, or "" when none is found.
    # The regex runs once over the page text: the match does not depend on any
    # individual element, so repeating it per element only repeated the work
    # (and found nothing on pages without accepted elements).
    def job_salary_string
      Cleaner.strip_string(SALARY_STRING_REGEX.match(@plain_text).to_s)
    end

    # The salary range as returned by Cleaner.clean_salary, or nil when the
    # page text contains no salary range.
    def job_salary
      found = SALARY_REGEX.match(@plain_text)
      found && Cleaner.clean_salary(found.to_s)
    end

    # Picks the most likely job title by scoring candidate strings:
    #   20 - the page <title> (always scored)
    #   60 - element whose id matches JOB_TITLE_ID_REGEX
    #   40 - heading whose text is contained in the page title
    #   20 - text containing one of the JOB_TITLE_WORDS
    #   30 - accepted element that follows a "vacancy"/"job title" label
    def job_title
      title_scorer = Scorer.new
      # A page without <title> contributes an empty candidate instead of
      # raising on nil.
      title_node = @doc.at_css("title")
      page_title = title_node ? title_node.content : ""
      title_scorer.store(page_title, 20).and_score_now

      loop_over_elements do |name, elem|
        raw = elem.content
        # Skip blank elements and anything too long to be a title.
        next if raw.strip.empty? || raw.split(" ").length > 10
        content = Cleaner.strip_string(raw)

        title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)

        if elem_is_heading?(name)
          title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
        end

        title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)

        score_label_sibling(title_scorer, elem) if VACANCY_TITLE_REGEX =~ content
      end

      title_scorer.top_match.strip.gsub(NBSP, "")
    end

    # +label+ reads like "Vacancy" / "Job title": the element right after it
    # is a title candidate when it has text and is an accepted element.
    def score_label_sibling(title_scorer, label)
      sibling = label.next_element
      return unless sibling

      next_content = Cleaner.strip_string(sibling.content)
      return if next_content.empty?

      title_scorer.store(next_content, 30).if_block_true {
        ACCEPTED_ELEMENTS.include?(sibling.name)
      }
    end

    # Absolute URL of the last anchor whose text matches APPLY_LINK_REGEX,
    # or the page URL itself when no anchor matches.
    def apply_link
      link = anchor_elements.select { |anchor| APPLY_LINK_REGEX =~ anchor.content }.last
      return @url unless link

      Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
    end

    def elem_is_heading?(name)
      %w{h1 h2 h3 h4 h5}.include?(name)
    end

    # The entries of #elements whose tag name is a heading.
    def heading_elements
      elements.select { |name, _elems| elem_is_heading?(name) }
    end

    def anchor_elements
      elements["a"]
    end

    # Hash of tag name => Array of matching elements, for every accepted tag.
    # Built once and reused; assumes the document is not modified after
    # construction (nothing in this class does so) - confirm for external
    # users of #doc.
    def elements
      @elements ||= ACCEPTED_ELEMENTS.each_with_object({}) do |tag, found|
        found[tag] = doc.css(tag).to_a
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require 'open_uri_redirections'
2
+
3
module JobParser
  # Fetches a job page over the network and parses it with ParseHtml.
  class ParseUrl
    # The ParseHtml instance built from the fetched page.
    attr_reader :parser

    # url - address of the job page; redirects are followed with
    #       :allow_redirections => :safe.
    #
    # NOTE(review): Kernel#open also interprets strings starting with "|" as
    # commands to run; make sure +url+ is a validated http(s) URL before it
    # reaches this class.
    def initialize(url)
      # Block form closes the stream once the body has been read.
      html = open(url, :allow_redirections => :safe) { |io| io.read }
      # Keep the parser: previously it was built and immediately discarded,
      # leaving callers with no way to reach the parsed job.
      @parser = ParseHtml.new(html, url)
    end

    # Returns the job Hash produced by ParseHtml#job for the fetched page.
    def job
      @parser.job
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+ require 'nokogiri'
3
module JobParser
  # A span that starts at a "£" amount and runs to the last "£" amount on the
  # same line, e.g. "£20,000 - £30,000".
  SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
  # Same as SALARY_REGEX, optionally followed by whitespace and the rest of
  # the line.
  SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
  # Text that contains "salary" or " rate".
  SALARY_TITLE_REGEX = /salary|\srate/i
  # Text that contains "vacancy" or "job title".
  VACANCY_TITLE_REGEX = /vacancy|job title/i
  # Text that contains "title", or "job" and "title" separated by at most one
  # character.
  JOB_TITLE_ID_REGEX = /job(.?)title|title/i
  # Text on which a line starts with "apply", or that contains "submit an
  # application" or "application form".
  APPLY_LINK_REGEX = /^apply|submit an application|application form/i
  # Non-breaking space (U+00A0). Written as a literal so that requiring this
  # file does not parse an HTML document just to obtain one character.
  NBSP = "\u00A0".freeze
  # Captures the words, whitespace and "&" that follow "location: " up to the
  # end of a line.
  LOCATION_REGEX = /(?:location: )([\w\s&]*)$/i

  # words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
  # could scope this regex just to headers
  JOB_TITLE_WORDS = /representative|sales|nurse|manager/i
end
@@ -0,0 +1,57 @@
1
module JobParser
  # Collects candidate strings, lets rules add points to them and reports the
  # candidate with the highest total.
  class Scorer
    # Hash of candidate string => Match.
    attr_reader :matches

    def initialize
      @matches = {}
    end

    # Registers +str+ as a candidate for a rule worth +worth+ points and
    # returns its Match, on which the rule is then applied (if_regex_match,
    # if_block_true or and_score_now). Points the candidate earned from
    # earlier rules are carried over.
    def store(str, worth)
      previous = @matches[str]
      carried = previous ? previous.score : 0
      @matches[str] = Match.new(str, worth, carried)
    end

    # Current score of +str+, or 0 when it was never stored.
    def score_for(str)
      match = @matches[str]
      match ? match.score : 0
    end

    # The candidate string with the highest positive score, or nil when no
    # candidate has scored yet (this used to raise NoMethodError).
    def top_match
      best = @matches.select { |_str, match| match.score > 0 }.
                      max_by { |_str, match| match.score }
      best && best.first
    end
  end

  # One candidate string together with the points of the rule currently being
  # applied (+worth+) and the total earned so far (+score+).
  class Match
    attr_accessor :str, :worth
    attr_reader :score

    def initialize(str, worth, score = 0)
      @str = str
      @score = score
      @worth = worth
    end

    # Adds +worth+ to the score when +reg+ matches +str+.
    # Returns true when points were added, false otherwise (the previous
    # implementation returned false in both cases).
    def if_regex_match(reg, str)
      return false unless reg.match(str)

      @score += @worth
      true
    end

    # Adds +worth+ to the score when the given block returns a truthy value.
    # Returns the block's result.
    def if_block_true(&block)
      res = yield
      @score += @worth if res
      res
    end

    # Adds +worth+ to the score unconditionally and returns self.
    def and_score_now
      @score += @worth
      self
    end
  end
end
@@ -0,0 +1,3 @@
1
module JobParser
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = "0.0.2".freeze
end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jobparser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jack Franklin
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-07-22 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.3'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.3'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: open_uri_redirections
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: nokogiri
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: A parser for Job sites
95
+ email:
96
+ - jack@jackfranklin.net
97
+ executables: []
98
+ extensions: []
99
+ extra_rdoc_files: []
100
+ files:
101
+ - lib/jobparser/cleaner.rb
102
+ - lib/jobparser/parsehtml.rb
103
+ - lib/jobparser/parseurl.rb
104
+ - lib/jobparser/regex.rb
105
+ - lib/jobparser/scorer.rb
106
+ - lib/jobparser/version.rb
107
+ - lib/jobparser.rb
108
+ homepage: ''
109
+ licenses:
110
+ - MIT
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ required_rubygems_version: !ruby/object:Gem::Requirement
122
+ none: false
123
+ requirements:
124
+ - - ! '>='
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ requirements: []
128
+ rubyforge_project:
129
+ rubygems_version: 1.8.23
130
+ signing_key:
131
+ specification_version: 3
132
+ summary: Parsing job sites
133
+ test_files: []