jobparser 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/jobparser.rb +9 -0
- data/lib/jobparser/cleaner.rb +63 -0
- data/lib/jobparser/parsehtml.rb +145 -0
- data/lib/jobparser/parseurl.rb +9 -0
- data/lib/jobparser/regex.rb +16 -0
- data/lib/jobparser/scorer.rb +57 -0
- data/lib/jobparser/version.rb +3 -0
- metadata +133 -0
data/lib/jobparser/cleaner.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require "nokogiri"
|
3
|
+
require "jobparser/regex"
|
4
|
+
|
5
|
+
module JobParser
  # Normalises text scraped from job pages: filters and tidies arrays of
  # strings, extracts the numeric bounds of a salary range and turns
  # relative links into absolute ones.
  class Cleaner

    # Captures the lower and upper figure of a "£x ... £y" salary range.
    SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
    # Thousands separators and whitespace, removed before parsing a salary.
    CLEAN_SALARY_REGEX = /,|\s/
    # Non-breaking space (U+00A0). Written as an escape so the value does not
    # depend on an HTML entity surviving editors, renderers or copy/paste.
    NBSP = "\u00A0"
    # An href that already carries a URI scheme (http:, https:, mailto:, ...).
    SCHEME_REGEX = /\A[a-z][a-z0-9+.\-]*:/i
    # A string that is empty or whitespace only. Anchored with \A/\z so a
    # single blank line inside a multi-line string does not count.
    BLANK_REGEX = /\A\s*\z/

    # @param ary  [Array] items to clean; only String items are kept
    # @param opts [Hash]  :type is stored but not used by this class
    def initialize(ary, opts = {})
      @subject = ary
      @type = opts[:type]
    end

    # @return [Array<String>] the cleaned, non-empty, de-duplicated strings
    def clean
      clean_array
    end

    # Extracts the two figures of a salary range.
    #
    # @param salary_str [String] e.g. "£20,000 - £30,000"
    # @return [Array<Integer>, nil] [low, high], or nil when the string does
    #   not contain two "£" amounts
    def self.clean_salary(salary_str)
      compact = salary_str.gsub(CLEAN_SALARY_REGEX, "")
      SALARY_GROUP_REGEX.match(compact) { |match|
        [match[1].to_i, match[2].to_i]
      }
    end

    # Removes newlines and non-breaking spaces, then trims surrounding
    # whitespace.
    #
    # The newline is written as "\n"; the previous '/n' literal removed the
    # two characters "/" and "n" instead (turning "Days/nights" into
    # "Daysights") and left real newlines in place.
    #
    # @param str [String]
    # @return [String]
    def self.strip_string(str)
      str.gsub("\n", "").gsub(NBSP, "").strip
    end

    # Resolves +href+ against the scheme, host and port of +url+.
    #
    # An href that starts with a URI scheme is returned untouched. Checking
    # only for a leading scheme (rather than "contains http") keeps relative
    # paths such as "/careers/http-developer" from being mistaken for
    # absolute URLs. Relative hrefs are resolved against the site root, not
    # against the path of +url+.
    #
    # @param url  [String] absolute URL of the page the link was found on
    # @param href [String] the link target, absolute or relative
    # @return [String] an absolute URL
    def self.make_link_absolute(url, href)
      return href if href =~ SCHEME_REGEX

      uri = URI.parse(url)
      base = "#{uri.scheme}://#{uri.host}"
      # Keep a non-default port (e.g. localhost:3000) so the link still works.
      base += ":#{uri.port}" if uri.port && uri.port != uri.default_port
      separator = href.start_with?("/") ? "" : "/"
      "#{base}#{separator}#{href}"
    end

    private

    # Keeps the non-blank strings, cleans each one, then drops anything that
    # became empty through cleaning (e.g. a string made only of non-breaking
    # spaces) and removes duplicates. Non-String items are skipped.
    def clean_array
      candidates = @subject.select { |item|
        item.is_a?(String) && not_whitespace_or_empty(item)
      }
      candidates.map { |item| clean_string(item) }.reject(&:empty?).uniq
    end

    def clean_string(str)
      self.class.strip_string(str)
    end

    # @return [Boolean] true unless +item+ is empty or whitespace only
    def not_whitespace_or_empty(item)
      item !~ BLANK_REGEX
    end
  end
end
|
data/lib/jobparser/parsehtml.rb
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'jobparser/regex.rb'
|
3
|
+
require "nokogiri"
|
4
|
+
module JobParser
  # Extracts job-listing fields (title, salary, location, apply link) from an
  # HTML page by combining Nokogiri queries with regex and scoring heuristics.
  class ParseHtml
    # Tag names whose elements are inspected for job information.
    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
    # Subset of ACCEPTED_ELEMENTS that are headings.
    HEADING_ELEMENTS = %w{h1 h2 h3 h4 h5}

    attr_reader :doc

    # @param html     [String] raw HTML of the job page
    # @param from_url [String] URL the HTML was fetched from
    def initialize(html, from_url)
      @url = from_url
      @doc = Nokogiri::HTML(html)
      # Turn <br> into newlines so line-anchored regexes see the line breaks.
      @doc.css("br").each { |br| br.replace "\n" }
      @plain_text = strip_html
    end

    # @return [Hash] :url, :salary, :title, :apply, :salary_string, :location
    def job
      { :url => @url,
        :salary => job_salary,
        :title => job_title,
        :apply => apply_link,
        :salary_string => job_salary_string,
        :location => job_location
      }
    end

    private

    # @return [String] text following "location: " in the page, or ""
    def job_location
      LOCATION_REGEX.match(@plain_text.gsub(/\r|\t/, "")) { |m|
        Cleaner.strip_string(m[1].to_s)
      } || ""
    end

    # Concatenates every text node of the document except those directly
    # inside title, script, style and button elements.
    def strip_html
      blacklist = ['title', 'script', 'style', 'button']
      text_nodes = @doc.search('//text()')
      blacklist.each do |tag|
        text_nodes -= @doc.search('//' + tag + '/text()')
      end
      text_nodes.text
    end

    # Yields (tag_name, element) for every accepted element in the document.
    def loop_over_elements
      elements.each do |name, elems|
        elems.each do |elem|
          yield name, elem
        end
      end
    end

    def clean_array(ary, type = nil)
      Cleaner.new(ary, :type => type).clean
    end

    # The regex runs against the whole page text, so it is evaluated once
    # rather than once per element.
    #
    # @return [String] the salary text as written on the page, or ""
    def job_salary_string
      Cleaner.strip_string(@plain_text[SALARY_STRING_REGEX].to_s)
    end

    # @return [Array<Integer>, nil] [low, high] salary bounds, or nil when the
    #   page text contains no "£x ... £y" range
    def job_salary
      salary = @plain_text[SALARY_REGEX]
      salary && Cleaner.clean_salary(salary)
    end

    # Scores candidate strings and returns the most likely job title.
    #
    # @return [String]
    def job_title
      title_scorer = Scorer.new
      # A page without a <title> must not abort parsing; fall back to "".
      title_node = @doc.at_css("title")
      page_title = title_node ? title_node.content : ""
      title_scorer.store(page_title, 20).and_score_now

      loop_over_elements do |name, elem|
        raw = elem.content
        # Skip empty elements and anything too long to be a title.
        next if raw.strip.empty? || raw.split(" ").length > 10
        content = Cleaner.strip_string(raw)

        # an element whose id looks like a job-title id
        title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)

        # or a heading element whose text appears in the page title
        if elem_is_heading?(name)
          title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
        end

        title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)

        # A label such as "Job title" or "Vacancy": the element that follows
        # it is a candidate for the actual title.
        next unless content =~ VACANCY_TITLE_REGEX
        following = elem.next_element
        next unless following
        next_content = Cleaner.strip_string(following.content)
        next if next_content.empty?
        title_scorer.store(next_content, 30).if_block_true {
          ACCEPTED_ELEMENTS.include?(following.name)
        }
      end

      # The page title is stored unprocessed, so tidy the winner here.
      title_scorer.top_match.to_s.strip.gsub(NBSP, "")
    end

    # @return [String] absolute URL of the last anchor whose text matches
    #   APPLY_LINK_REGEX, or the page URL when there is none
    def apply_link
      link = anchor_elements.select { |anchor| anchor.content =~ APPLY_LINK_REGEX }.last
      return @url unless link
      Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
    end

    def elem_is_heading?(name)
      HEADING_ELEMENTS.include?(name)
    end

    # @return [Hash] the heading entries of #elements, keyed by tag name
    def heading_elements
      elements.select { |name, _elems| elem_is_heading?(name) }
    end

    def anchor_elements
      elements["a"]
    end

    # Maps each accepted tag name to the array of matching elements. The
    # document is not modified by this class after initialize, so the lookup
    # is done once and cached.
    def elements
      @elements ||= {}.tap do |response|
        ACCEPTED_ELEMENTS.each do |elem|
          response[elem] = doc.css(elem).to_a
        end
      end
    end
  end
end
|
data/lib/jobparser/regex.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
module JobParser
  # A salary range: two "£" amounts with anything in between.
  SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
  # The same range plus any whitespace-led text up to the end of the line.
  SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
  SALARY_TITLE_REGEX = /salary|\srate/i
  # Label text that introduces a job title.
  VACANCY_TITLE_REGEX = /vacancy|job title/i
  # Element ids that suggest the element holds the job title.
  JOB_TITLE_ID_REGEX = /job(.?)title|title/i
  # Link text that suggests an "apply" link.
  APPLY_LINK_REGEX = /^apply|submit an application|application form/i
  # Non-breaking space (U+00A0). Written as an escape instead of being parsed
  # out of an HTML snippet, so the value cannot be lost when the source is
  # re-encoded, rendered or copied.
  NBSP = "\u00A0"
  # Captures the text that follows "location: " up to the end of a line.
  LOCATION_REGEX = /(?:location: )([\w\s&]*)$/i

  # words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
  # could scope this regex just to headers
  JOB_TITLE_WORDS = /representative|sales|nurse|manager/i
end
|
data/lib/jobparser/scorer.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
module JobParser
  # Keeps a running score per candidate string and reports the best one.
  class Scorer
    # @return [Hash{String => Match}] candidates keyed by their string
    attr_reader :matches

    def initialize
      @matches = {}
    end

    # Registers +str+ as a candidate worth +worth+ points per successful
    # check. Storing a string again keeps the score it has accumulated so far
    # and only changes how much the next check is worth.
    #
    # @param str   [String]
    # @param worth [Integer]
    # @return [Match] the match to run a check on
    def store(str, worth)
      existing = @matches[str]
      @matches[str] = Match.new(str, worth, existing ? existing.score : 0)
    end

    # @return [Integer] the accumulated score of +str+, 0 if never stored
    def score_for(str)
      match = @matches[str]
      match ? match.score : 0
    end

    # @return [String, nil] the candidate with the highest positive score, or
    #   nil when no candidate has scored yet
    def top_match
      key, _match = @matches.select { |_, m| m.score > 0 }.max_by { |_, m| m.score }
      key
    end
  end

  # A candidate string together with its accumulated score and the number of
  # points the next successful check adds.
  class Match
    attr_accessor :str, :worth
    attr_reader :score

    def initialize(str, worth, score = 0)
      @str = str
      @score = score
      @worth = worth
    end

    # Adds +worth+ to the score when +reg+ matches +str+.
    #
    # @return [Boolean] true when the regex matched, false otherwise
    def if_regex_match(reg, str)
      return false unless reg.match(str)
      @score += @worth
      true
    end

    # Adds +worth+ to the score when the block returns a truthy value.
    #
    # @return [Object] whatever the block returned
    def if_block_true
      res = yield
      @score += @worth if res
      res
    end

    # Adds +worth+ to the score unconditionally.
    #
    # @return [Match] self, to allow chaining
    def and_score_now
      @score += @worth
      self
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jobparser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jack Franklin
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-07-22 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bundler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.3'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: open_uri_redirections
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: nokogiri
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
description: A parser for Job sites
|
95
|
+
email:
|
96
|
+
- jack@jackfranklin.net
|
97
|
+
executables: []
|
98
|
+
extensions: []
|
99
|
+
extra_rdoc_files: []
|
100
|
+
files:
|
101
|
+
- lib/jobparser/cleaner.rb
|
102
|
+
- lib/jobparser/parsehtml.rb
|
103
|
+
- lib/jobparser/parseurl.rb
|
104
|
+
- lib/jobparser/regex.rb
|
105
|
+
- lib/jobparser/scorer.rb
|
106
|
+
- lib/jobparser/version.rb
|
107
|
+
- lib/jobparser.rb
|
108
|
+
homepage: ''
|
109
|
+
licenses:
|
110
|
+
- MIT
|
111
|
+
post_install_message:
|
112
|
+
rdoc_options: []
|
113
|
+
require_paths:
|
114
|
+
- lib
|
115
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
|
+
none: false
|
123
|
+
requirements:
|
124
|
+
- - ! '>='
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: '0'
|
127
|
+
requirements: []
|
128
|
+
rubyforge_project:
|
129
|
+
rubygems_version: 1.8.23
|
130
|
+
signing_key:
|
131
|
+
specification_version: 3
|
132
|
+
summary: Parsing job sites
|
133
|
+
test_files: []
|