jobparser 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/jobparser.rb ADDED
@@ -0,0 +1,9 @@
1
+ require "jobparser/version"
2
+ require "jobparser/parsehtml"
3
+ require "jobparser/parseurl"
4
+ require "jobparser/cleaner"
5
+ require "jobparser/scorer"
6
+
7
+ module JobParser
8
+ # Top-level namespace of the gem; its classes and constants are defined in the files required above.
9
+ end
@@ -0,0 +1,63 @@
1
+ # encoding: UTF-8
2
+ require "nokogiri"
3
+ require "jobparser/regex"
4
+
5
+ module JobParser
6
# Normalises strings scraped from a job page: trims text, parses salary
# ranges and turns relative links into absolute ones.
class Cleaner

  # Captures the two pound amounts of a range such as "£20000-£30000".
  SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
  # Characters removed from a salary string before the amounts are captured.
  CLEAN_SALARY_REGEX = /,|\s/
  # U+00A0 NO-BREAK SPACE, written as an explicit escape so the value does
  # not depend on how an HTML parser (or an editor) treats the literal.
  NBSP = "\u00A0".freeze
  # An href is absolute only when it *starts* with an http(s) scheme.
  ABSOLUTE_URL_REGEX = /\Ahttps?:\/\//i

  # ary  - Array of scraped values to clean.
  # opts - Hash of options; only :type is read and stored.
  def initialize(ary, opts = {})
    @subject = ary
    @type = opts[:type]
  end

  # Returns the cleaned, de-duplicated Array of Strings.
  def clean
    clean_array
  end

  # Parses a salary range such as "£20,000 - £30,000".
  #
  # Returns [low, high] as Integers, or nil when the string does not
  # contain two pound amounts (nil input is treated as an empty string).
  def self.clean_salary(salary_str)
    compact = salary_str.to_s.gsub(CLEAN_SALARY_REGEX, "")
    match = SALARY_GROUP_REGEX.match(compact)
    match && [match[1].to_i, match[2].to_i]
  end

  # Removes newlines and non-breaking spaces, then trims surrounding
  # whitespace. (The newline used to be spelled '/n', which deleted the
  # literal characters "/n" from the text and left real newlines alone.)
  def self.strip_string(str)
    str.gsub("\n", "").gsub(NBSP, "").strip
  end

  # Resolves href against the scheme and host of url.
  #
  # An href that already starts with http:// or https:// is returned
  # untouched; anything else is appended to "scheme://host". A relative
  # path that merely contains "http" (e.g. "/jobs/http-developer") is no
  # longer mistaken for an absolute URL.
  def self.make_link_absolute(url, href)
    return href if href =~ ABSOLUTE_URL_REGEX

    uri = URI.parse(url)
    base = "#{uri.scheme}://#{uri.host}"
    if base.end_with?("/") || href.start_with?("/")
      base + href
    else
      "#{base}/#{href}"
    end
  end

  private

  # Keeps only Strings, cleans each one and drops whatever ends up empty,
  # so whitespace-only and nbsp-only entries disappear while multi-line
  # text containing a blank line is kept.
  def clean_array
    @subject.grep(String).map { |item| clean_string(item) }.reject(&:empty?).uniq
  end

  def clean_string(str)
    self.class.strip_string(str)
  end
end
63
+ end
@@ -0,0 +1,145 @@
1
+ # encoding: utf-8
2
+ require 'jobparser/regex.rb'
3
+ require "nokogiri"
4
+ module JobParser
5
+ class ParseHtml
6
+ ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
7
+
8
+ attr_reader :doc
9
+
10
+ def initialize(html, from_url)
11
+ @url = from_url
12
+ @doc = Nokogiri::HTML(html)
13
+ @doc.css("br").each { |br| br.replace "\n" }
14
+ @plain_text = strip_html
15
+ end
16
+
17
+ def job
18
+ { :url => @url,
19
+ :salary => job_salary,
20
+ :title => job_title,
21
+ :apply => apply_link,
22
+ :salary_string => job_salary_string,
23
+ :location => job_location
24
+ }
25
+ end
26
+
27
+ private
28
+
29
+ def job_location
30
+ LOCATION_REGEX.match(@plain_text.gsub(/\r|\t/, "")) { |m|
31
+ Cleaner.strip_string(m[1].to_s)
32
+ } || ""
33
+ end
34
+
35
+ def strip_html
36
+ doc = @doc.dup
37
+ blacklist = ['title', 'script', 'style', 'button']
38
+ nodelist = doc.search('//text()')
39
+ blacklist.each do |tag|
40
+ nodelist -= doc.search('//' + tag + '/text()')
41
+ end
42
+ nodelist.text
43
+ end
44
+
45
+ def loop_over_elements(&block)
46
+ elements.each do |name, elems|
47
+ elems.each do |elem|
48
+ yield name, elem
49
+ end
50
+ end
51
+ end
52
+
53
+ def clean_array(ary, type = nil)
54
+ Cleaner.new(ary, :type => type).clean
55
+ end
56
+
57
+ def job_salary_string
58
+ salary = ""
59
+ loop_over_elements do |name, elem|
60
+ SALARY_STRING_REGEX.match(@plain_text) { |m|
61
+ salary = m.to_s
62
+ }
63
+ end
64
+ Cleaner.strip_string(salary)
65
+ end
66
+
67
+ def job_salary
68
+ salary = ""
69
+ loop_over_elements do |name, elem|
70
+ SALARY_REGEX.match(@plain_text) { |m|
71
+ salary = m.to_s
72
+ }
73
+ end
74
+ salary.empty? ? nil : Cleaner.clean_salary(salary)
75
+ end
76
+
77
+ def job_title
78
+ title_scorer = Scorer.new
79
+ page_title = @doc.at_css("title").content
80
+ title_scorer.store(page_title, 20).and_score_now
81
+
82
+ # http://stackoverflow.com/questions/4476047/how-to-make-nokogiri-not-to-convert-nbsp-to-space
83
+ nbsp = Nokogiri::HTML(" ").text
84
+
85
+ # first see if we find something with a matching id
86
+ loop_over_elements do |name, elem|
87
+ # check the ID of the elements for matches
88
+ next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
89
+ content = Cleaner.strip_string(elem.content)
90
+ title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
91
+
92
+ # or if a heading element matches the page title
93
+ if elem_is_heading?(name)
94
+ title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
95
+ end
96
+
97
+ title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
98
+
99
+ VACANCY_TITLE_REGEX.match(content) {
100
+ if elem.next_element && !Cleaner.strip_string(elem.next_element.content).empty?
101
+ next_content = Cleaner.strip_string(elem.next_element.content)
102
+ title_scorer.store(next_content, 30).if_block_true {
103
+ ACCEPTED_ELEMENTS.include?(elem.next_element.name)
104
+ }
105
+ end
106
+ }
107
+ end
108
+
109
+ title_scorer.top_match.strip.gsub(nbsp, "")
110
+ end
111
+
112
+ def apply_link
113
+ link = nil
114
+ anchor_elements.each do |anchor|
115
+ APPLY_LINK_REGEX.match(anchor.content) { link = anchor }
116
+ end
117
+ if link
118
+ Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
119
+ else
120
+ @url
121
+ end
122
+ end
123
+
124
+
125
+ def elem_is_heading?(name)
126
+ %w{h1 h2 h3 h4 h5}.include?(name)
127
+ end
128
+
129
+ def heading_elements
130
+ elements.select { |elem| elem_is_heading?(elem) }
131
+ end
132
+
133
+ def anchor_elements
134
+ elements["a"]
135
+ end
136
+
137
+ def elements
138
+ {}.tap do |response|
139
+ ACCEPTED_ELEMENTS.each do |elem|
140
+ response[elem] = doc.css(elem).to_a
141
+ end
142
+ end
143
+ end
144
+ end
145
+ end
@@ -0,0 +1,9 @@
1
+ require 'open_uri_redirections'
2
+
3
+ module JobParser
4
# Downloads a job page and parses it with ParseHtml.
class ParseUrl
  # url - String URL of the job page; redirects are followed with the
  #       :allow_redirections => :safe option.
  #
  # The parser is now kept: previously it was built and thrown away, so
  # the instance gave callers no way to reach the parsed page.
  #
  # NOTE(review): Kernel#open also accepts file paths and "| command"
  # strings, so url must come from a trusted source — confirm at call
  # sites. Also confirm this call form works on the Ruby versions the
  # gem targets.
  def initialize(url)
    # Block form closes the downloaded stream once it has been read.
    html = open(url, :allow_redirections => :safe) { |io| io.read }
    @parser = ParseHtml.new(html, url)
  end

  # Returns the job Hash produced by ParseHtml#job for the fetched page.
  def job
    @parser.job
  end
end
9
+ end
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+ require 'nokogiri'
3
# Patterns shared by the parser classes.
module JobParser
  # Two pound amounts on one line, e.g. "£20,000 - £30,000".
  SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
  # Same as SALARY_REGEX, optionally followed by whitespace and the text
  # up to an end of line (e.g. " per annum").
  SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
  SALARY_TITLE_REGEX = /salary|\srate/i
  # Label text announcing that a job title follows.
  VACANCY_TITLE_REGEX = /vacancy|job title/i
  # Matched against element ids. The bare "title" alternative already
  # covers every id the first alternative matches.
  JOB_TITLE_ID_REGEX = /job(.?)title|title/i
  # Link text of an "apply" link. The ^ anchor belongs to the first
  # alternative only; the other two match anywhere in the text.
  APPLY_LINK_REGEX = /^apply|submit an application|application form/i
  # U+00A0 NO-BREAK SPACE, written as an explicit escape so the value does
  # not depend on how an HTML parser (or an editor) treats the literal.
  NBSP = "\u00A0".freeze
  # Captures the words after "location: " up to an end of line.
  LOCATION_REGEX = /(?:location: )([\w\s&]*)$/i

  # words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
  # could scope this regex just to headers
  JOB_TITLE_WORDS = /representative|sales|nurse|manager/i
end
@@ -0,0 +1,57 @@
1
+ module JobParser
2
# Keeps a running score for each candidate string.
class Scorer
  # Hash of candidate String => Match.
  attr_reader :matches

  def initialize
    @matches = {}
  end

  # Registers str with the given worth and returns its Match.
  #
  # A string that was stored before gets a fresh Match carrying the new
  # worth and the score accumulated so far. Storing alone does not change
  # the score; call one of the Match scoring methods on the result.
  def store(str, worth)
    previous = @matches[str]
    @matches[str] = previous ? Match.new(str, worth, previous.score) : Match.new(str, worth)
  end

  # Returns the current score of str, or 0 when it was never stored.
  def score_for(str)
    match = @matches[str]
    match ? match.score : 0
  end

  # Returns the stored string with the highest positive score; on a tie
  # the one stored first wins. Returns nil when nothing has scored above
  # zero (this used to raise NoMethodError on nil).
  def top_match
    best = @matches.select { |_str, match| match.score > 0 }.max_by { |_str, match| match.score }
    best && best.first
  end
end
28
+
29
# A candidate string with the points it is worth and the score it has
# accumulated so far.
class Match
  attr_accessor :str, :worth
  attr_reader :score

  # str   - the candidate String.
  # worth - points added each time a scoring method succeeds.
  # score - starting score (defaults to 0).
  def initialize(str, worth, score = 0)
    @str = str
    @score = score
    @worth = worth
  end

  # Adds worth to the score when reg matches str.
  #
  # Returns true when it matched, false otherwise. (It used to return
  # false in both cases because the block's value was discarded.)
  def if_regex_match(reg, str)
    return false unless reg.match(str)

    @score += @worth
    true
  end

  # Adds worth to the score when the block returns a truthy value.
  #
  # Returns the block's value.
  def if_block_true
    res = yield
    @score += @worth if res
    res
  end

  # Adds worth to the score unconditionally and returns self.
  def and_score_now
    @score += @worth
    self
  end
end
57
+ end
@@ -0,0 +1,3 @@
1
module JobParser
  # Version of the gem; frozen so the shared constant cannot be mutated.
  VERSION = "0.0.2".freeze
end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jobparser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jack Franklin
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-07-22 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.3'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.3'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: open_uri_redirections
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: nokogiri
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: A parser for Job sites
95
+ email:
96
+ - jack@jackfranklin.net
97
+ executables: []
98
+ extensions: []
99
+ extra_rdoc_files: []
100
+ files:
101
+ - lib/jobparser/cleaner.rb
102
+ - lib/jobparser/parsehtml.rb
103
+ - lib/jobparser/parseurl.rb
104
+ - lib/jobparser/regex.rb
105
+ - lib/jobparser/scorer.rb
106
+ - lib/jobparser/version.rb
107
+ - lib/jobparser.rb
108
+ homepage: ''
109
+ licenses:
110
+ - MIT
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ required_rubygems_version: !ruby/object:Gem::Requirement
122
+ none: false
123
+ requirements:
124
+ - - ! '>='
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ requirements: []
128
+ rubyforge_project:
129
+ rubygems_version: 1.8.23
130
+ signing_key:
131
+ specification_version: 3
132
+ summary: Parsing job sites
133
+ test_files: []