jobparser 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jobparser.rb +9 -0
- data/lib/jobparser/cleaner.rb +63 -0
- data/lib/jobparser/parsehtml.rb +145 -0
- data/lib/jobparser/parseurl.rb +9 -0
- data/lib/jobparser/regex.rb +16 -0
- data/lib/jobparser/scorer.rb +57 -0
- data/lib/jobparser/version.rb +3 -0
- metadata +133 -0
data/lib/jobparser.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require "nokogiri"
|
3
|
+
require "jobparser/regex"
|
4
|
+
|
5
|
+
module JobParser
  # URI is used by Cleaner.make_link_absolute; require it here so this file
  # does not rely on another library having loaded it first.
  require "uri"

  # Normalises raw strings scraped from a job page: trims and de-duplicates
  # text fragments, parses "£x - £y" salary ranges into integers and turns
  # relative hrefs into absolute URLs.
  class Cleaner

    # Captures the first and the last "£" amount of a salary range.
    # NOTE(review): because the separator is `.+`, two amounts with nothing
    # between them (after whitespace removal) lose the last digit of the first
    # amount, e.g. "£20000£30000" yields 2000 / 30000 - confirm if that input
    # can occur.
    SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
    # Characters removed from a salary string before it is parsed.
    CLEAN_SALARY_REGEX = /,|\s/
    # Non-breaking space (U+00A0), spelled out explicitly instead of being
    # produced by parsing an HTML snippet at load time.
    NBSP = "\u00A0"

    # @param ary [Array<String>] raw text fragments
    # @param opts [Hash] :type is stored but not used by this class
    def initialize(ary, opts = {})
      @subject = ary
      @type = opts[:type]
    end

    # @return [Array<String>] the cleaned, de-duplicated fragments
    def clean
      clean_array
    end

    # Parses a salary range such as "£20,000 - £30,000".
    #
    # @param salary_str [String, nil]
    # @return [Array(Integer, Integer), nil] [lower, upper], or nil when the
    #   string does not contain two "£" amounts
    def self.clean_salary(salary_str)
      compact = salary_str.to_s.gsub(CLEAN_SALARY_REGEX, "")
      bounds = SALARY_GROUP_REGEX.match(compact)
      bounds && [bounds[1].to_i, bounds[2].to_i]
    end

    # Removes newlines and non-breaking spaces, then trims surrounding
    # whitespace.
    #
    # @param str [String]
    # @return [String]
    def self.strip_string(str)
      # The newline must be written as "\n"; the former '/n' literal removed
      # the two characters "/" + "n" and left real newlines in place.
      str.gsub("\n", '').gsub(NBSP, '').strip
    end

    # Resolves +href+ against the scheme, host and port of +url+.
    #
    # @param url [String] absolute URL of the page the link was found on
    # @param href [String] link target as written in the page
    # @return [String] +href+ unchanged when it already starts with
    #   http:// or https://, otherwise an absolute URL rooted at the host
    def self.make_link_absolute(url, href)
      # Only a leading scheme makes a link absolute; "http" elsewhere in the
      # href (e.g. inside a query string) does not.
      return href if href =~ /\Ahttps?:\/\//i

      uri = URI.parse(url)
      base = "#{uri.scheme}://#{uri.host}"
      # Keep an explicit, non-default port (e.g. localhost:3000).
      base += ":#{uri.port}" if uri.port && uri.port != uri.default_port
      href.start_with?("/") ? base + href : "#{base}/#{href}"
    end

    private

    # Drops non-strings and blank strings, cleans the rest and removes
    # duplicates.
    def clean_array
      kept = @subject.select { |item| item.is_a?(String) && not_whitespace_or_empty(item) }
      kept.map { |item| clean_string(item) }.uniq
    end

    def clean_string(str)
      self.class.strip_string(str)
    end

    # True unless the whole string is empty or whitespace only. \A and \z
    # anchor the entire string, so a multi-line string that merely contains a
    # blank line is still kept.
    def not_whitespace_or_empty(item)
      !item.empty? && item !~ /\A\s+\z/
    end
  end
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'jobparser/regex.rb'
|
3
|
+
require "nokogiri"
|
4
|
+
module JobParser
  # Extracts job details (title, salary, location and apply link) from the
  # HTML of a single job-listing page.
  class ParseHtml
    # Tag names whose elements are inspected when looking for job details.
    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}

    attr_reader :doc

    # @param html [String] raw HTML of the page
    # @param from_url [String] URL the page came from; returned as :url and
    #   used to resolve a relative apply link
    def initialize(html, from_url)
      @url = from_url
      @doc = Nokogiri::HTML(html)
      # Replace <br> with a newline so it survives in the extracted text
      # (presumably for the line-anchored regexes applied to @plain_text).
      @doc.css("br").each { |br| br.replace "\n" }
      @plain_text = strip_html
    end

    # @return [Hash] :url, :salary, :title, :apply, :salary_string, :location
    def job
      { :url => @url,
        :salary => job_salary,
        :title => job_title,
        :apply => apply_link,
        :salary_string => job_salary_string,
        :location => job_location
      }
    end

    private

    # First capture of LOCATION_REGEX in the page text, cleaned, or "" when
    # the regex does not match.
    def job_location
      found = LOCATION_REGEX.match(@plain_text.gsub(/\r|\t/, ""))
      found ? Cleaner.strip_string(found[1].to_s) : ""
    end

    # Concatenated text of every text node except those directly inside
    # <title>, <script>, <style> and <button>.
    def strip_html
      excluded_tags = ['title', 'script', 'style', 'button']
      text_nodes = excluded_tags.inject(@doc.search('//text()')) do |nodes, tag|
        nodes - @doc.search('//' + tag + '/text()')
      end
      text_nodes.text
    end

    # Yields (tag_name, element) for every element of every accepted tag.
    def loop_over_elements
      elements.each do |name, elems|
        elems.each { |elem| yield name, elem }
      end
    end

    def clean_array(ary, type = nil)
      Cleaner.new(ary, :type => type).clean
    end

    # The salary text as written on the page, or "" when none is found.
    # The regex runs against the whole page text, so it is matched once
    # rather than once per element.
    def job_salary_string
      found = SALARY_STRING_REGEX.match(@plain_text)
      Cleaner.strip_string(found.to_s)
    end

    # The salary range parsed by Cleaner.clean_salary, or nil when the page
    # text has no salary match. Matched once against the whole page text.
    def job_salary
      found = SALARY_REGEX.match(@plain_text)
      found ? Cleaner.clean_salary(found.to_s) : nil
    end

    # Scores candidate strings and returns the best one as the job title.
    # The page <title> always starts with 20 points, so there is always at
    # least one candidate.
    def job_title
      title_scorer = Scorer.new
      # A page without a <title> element falls back to an empty page title
      # instead of failing on nil.
      title_node = @doc.at_css("title")
      page_title = title_node ? title_node.content : ""
      title_scorer.store(page_title, 20).and_score_now

      loop_over_elements do |name, elem|
        raw = elem.content
        # Skip empty elements and anything longer than ten words.
        next if raw.strip.empty? || raw.split(" ").length > 10
        content = Cleaner.strip_string(raw)

        # +60 when the element's id looks like a job-title id
        title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)

        # +40 when a heading's text is contained in the page title
        if elem_is_heading?(name)
          title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
        end

        # +20 when the text contains a common job-title word
        title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)

        # A "vacancy" / "job title" label gives +30 to the text of the
        # element that follows it, provided that element is an accepted tag.
        next unless VACANCY_TITLE_REGEX =~ content
        following = elem.next_element
        next unless following
        next_content = Cleaner.strip_string(following.content)
        next if next_content.empty?
        title_scorer.store(next_content, 30).if_block_true {
          ACCEPTED_ELEMENTS.include?(following.name)
        }
      end

      # http://stackoverflow.com/questions/4476047/how-to-make-nokogiri-not-to-convert-nbsp-to-space
      title_scorer.top_match.to_s.strip.gsub(NBSP, "")
    end

    # Absolute URL of the last anchor whose text matches APPLY_LINK_REGEX,
    # or the page URL when no anchor matches.
    def apply_link
      link = anchor_elements.select { |anchor| APPLY_LINK_REGEX =~ anchor.content }.last
      return @url unless link

      Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
    end

    def elem_is_heading?(name)
      %w{h1 h2 h3 h4 h5}.include?(name)
    end

    # The subset of #elements whose tag name is a heading.
    def heading_elements
      elements.select { |name, _elems| elem_is_heading?(name) }
    end

    # All <a> elements, queried directly instead of via the full #elements
    # hash.
    def anchor_elements
      doc.css("a").to_a
    end

    # @return [Hash{String => Array}] accepted tag name => matching elements
    def elements
      ACCEPTED_ELEMENTS.each_with_object({}) do |tag, by_tag|
        by_tag[tag] = doc.css(tag).to_a
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
module JobParser
  # A salary range: two "£" amounts with at least one character between them.
  SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
  # Same range, optionally followed by whitespace and the rest of the line.
  SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
  SALARY_TITLE_REGEX = /salary|\srate/i
  # Label text that introduces a job title.
  VACANCY_TITLE_REGEX = /vacancy|job title/i
  # Element ids that suggest the element holds the job title.
  JOB_TITLE_ID_REGEX = /job(.?)title|title/i
  # Anchor text that suggests an "apply" link.
  APPLY_LINK_REGEX = /^apply|submit an application|application form/i
  # Non-breaking space (U+00A0), spelled out explicitly instead of being
  # produced by parsing an HTML snippet with Nokogiri at load time.
  NBSP = "\u00A0"
  # Captures the words after "location: " up to the end of a line.
  LOCATION_REGEX = /(?:location: )([\w\s&]*)$/i

  # words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
  # could scope this regex just to headers
  JOB_TITLE_WORDS = /representative|sales|nurse|manager/i
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module JobParser
  # Keeps a running score per candidate string and reports the best one.
  class Scorer
    # @return [Hash{String => Match}] candidate string => its current Match
    attr_reader :matches

    def initialize
      @matches = {}
    end

    # Registers +str+ as a candidate worth +worth+ points per successful
    # check. A score already accumulated for the same string is carried over
    # into the new Match.
    #
    # @param str [String] candidate text
    # @param worth [Integer] points added by each check that succeeds
    # @return [Match] the Match now stored for +str+
    def store(str, worth)
      previous = @matches[str]
      @matches[str] = Match.new(str, worth, previous ? previous.score : 0)
    end

    # @return [Integer] accumulated score for +str+, 0 when it is unknown
    def score_for(str)
      match = @matches[str]
      match ? match.score : 0
    end

    # @return [String, nil] the candidate with the highest positive score,
    #   or nil when no candidate has scored yet (previously this raised
    #   NoMethodError on nil)
    def top_match
      best = @matches.select { |_str, match| match.score > 0 }
                     .max_by { |_str, match| match.score }
      best && best.first
    end
  end

  # One candidate string with the points each check is worth and the score
  # accumulated so far.
  class Match
    attr_accessor :str, :worth
    attr_reader :score

    # @param str [String] candidate text
    # @param worth [Integer] points added per successful check
    # @param score [Integer] score to start from
    def initialize(str, worth, score = 0)
      @str = str
      @score = score
      @worth = worth
    end

    # Adds +worth+ to the score when +reg+ matches +str+.
    #
    # @return [Boolean] true when the regex matched. The previous version
    #   always returned false because the block's value was discarded.
    def if_regex_match(reg, str)
      return false unless reg.match(str)

      @score += @worth
      true
    end

    # Adds +worth+ to the score when the given block returns a truthy value.
    #
    # @return [Object] the block's result
    def if_block_true(&block)
      res = yield
      @score += @worth if res
      res
    end

    # Adds +worth+ to the score unconditionally.
    #
    # @return [Match] self, so the call can be chained
    def and_score_now
      @score += @worth
      self
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jobparser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jack Franklin
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-07-22 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bundler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.3'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: open_uri_redirections
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: nokogiri
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
description: A parser for Job sites
|
95
|
+
email:
|
96
|
+
- jack@jackfranklin.net
|
97
|
+
executables: []
|
98
|
+
extensions: []
|
99
|
+
extra_rdoc_files: []
|
100
|
+
files:
|
101
|
+
- lib/jobparser/cleaner.rb
|
102
|
+
- lib/jobparser/parsehtml.rb
|
103
|
+
- lib/jobparser/parseurl.rb
|
104
|
+
- lib/jobparser/regex.rb
|
105
|
+
- lib/jobparser/scorer.rb
|
106
|
+
- lib/jobparser/version.rb
|
107
|
+
- lib/jobparser.rb
|
108
|
+
homepage: ''
|
109
|
+
licenses:
|
110
|
+
- MIT
|
111
|
+
post_install_message:
|
112
|
+
rdoc_options: []
|
113
|
+
require_paths:
|
114
|
+
- lib
|
115
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
|
+
none: false
|
123
|
+
requirements:
|
124
|
+
- - ! '>='
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: '0'
|
127
|
+
requirements: []
|
128
|
+
rubyforge_project:
|
129
|
+
rubygems_version: 1.8.23
|
130
|
+
signing_key:
|
131
|
+
specification_version: 3
|
132
|
+
summary: Parsing job sites
|
133
|
+
test_files: []
|