jobparser 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/jobparser/cleaner.rb +6 -37
- data/lib/jobparser/facets/apply.rb +18 -0
- data/lib/jobparser/facets/facet.rb +41 -0
- data/lib/jobparser/facets/location.rb +15 -0
- data/lib/jobparser/facets/salary.rb +20 -0
- data/lib/jobparser/facets/salarystring.rb +18 -0
- data/lib/jobparser/facets/title.rb +46 -0
- data/lib/jobparser/parsehtml.rb +6 -111
- data/lib/jobparser/regex.rb +2 -0
- data/lib/jobparser/specialcases.rb +1 -1
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +6 -0
- metadata +8 -2
data/lib/jobparser/cleaner.rb
CHANGED
@@ -5,30 +5,16 @@ require "jobparser/regex"
|
|
5
5
|
module JobParser
|
6
6
|
class Cleaner
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
def initialize(ary, opts = {})
|
12
|
-
@subject = ary
|
13
|
-
@type = opts[:type]
|
14
|
-
end
|
15
|
-
|
16
|
-
def clean
|
17
|
-
clean_array
|
8
|
+
# Removes newlines and NBSP characters from +str+, passes the result through
# Cleaner.clean_text, and strips leading/trailing whitespace.
#
# str - the String to clean.
#
# Returns a new String.
def self.strip_all_white_space(str)
  # "\n" must be double-quoted: the single-quoted '\n' is a backslash followed
  # by the letter n, so real line breaks were never removed.
  Cleaner.clean_text(Cleaner.remove_nbsp(str.delete("\n"))).strip
end
|
19
11
|
|
20
|
-
def self.
|
21
|
-
|
22
|
-
[match[1].to_i, match[2].to_i]
|
23
|
-
}
|
24
|
-
end
|
25
|
-
|
26
|
-
def self.strip_string(str)
|
27
|
-
str.gsub('/n', '').gsub(NBSP, '').strip
|
12
|
+
# Drops carriage returns and tabs from +str+ and turns every NBSP into a
# plain space. Newlines are left in place.
#
# Returns a new String.
def self.clean_text(str)
  without_control_chars = str.delete("\r\t")
  without_control_chars.gsub(NBSP, " ")
end
|
29
15
|
|
30
|
-
def self.
|
31
|
-
str.gsub(
|
16
|
+
# Returns a copy of +str+ with every occurrence of NBSP deleted.
def self.remove_nbsp(str)
  str.gsub(NBSP) { "" }
end
|
33
19
|
|
34
20
|
def self.make_link_absolute(url, href)
|
@@ -45,22 +31,5 @@ module JobParser
|
|
45
31
|
end
|
46
32
|
end
|
47
33
|
|
48
|
-
private
|
49
|
-
|
50
|
-
def clean_array
|
51
|
-
@subject.select { |item|
|
52
|
-
not_whitespace_or_empty(item)
|
53
|
-
}.map { |item|
|
54
|
-
clean_string(item) if item.is_a?(String)
|
55
|
-
}.uniq
|
56
|
-
end
|
57
|
-
|
58
|
-
def clean_string(str)
|
59
|
-
self.class.strip_string(str)
|
60
|
-
end
|
61
|
-
|
62
|
-
def not_whitespace_or_empty(item)
|
63
|
-
/^\s+$/.match(item) == nil && !item.empty?
|
64
|
-
end
|
65
34
|
end
|
66
35
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module JobParser
  module Facets
    # Facet that locates the "apply" link on a job page.
    class Apply < Facet
      # Returns the absolute URL of the last anchor whose text matches
      # APPLY_LINK_REGEX, or the page URL itself when no anchor matches.
      def parse
        apply_anchor = elements["a"].reverse_each.find do |anchor|
          APPLY_LINK_REGEX.match(anchor.content)
        end
        return @url unless apply_anchor

        # Spaces in the href are percent-encoded before it is made absolute.
        href = apply_anchor.attributes["href"].to_s.gsub(" ", "%20")
        Cleaner.make_link_absolute(@url, href)
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module JobParser
  module Facets
    # Base class for the individual facet parsers. Holds the parsed document,
    # the page URL and the page's plain text, and provides the element
    # iteration and special-case helpers the subclasses share.
    class Facet

      # Tag names whose elements are inspected by the facets.
      ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}.freeze

      attr_reader :doc, :url, :plain_text

      # doc        - the parsed document; must respond to #css
      #              (presumably a Nokogiri document - confirm against caller).
      # url        - the URL the document was fetched from.
      # plain_text - the document's text content.
      def initialize(doc, url, plain_text)
        @doc = doc
        @url = url
        @plain_text = plain_text
      end

      private

      # Runs the special-case handler registered under +name+ for this URL.
      #
      # Returns the handler's result, or nil when no special case applies to
      # the URL or the special case has no handler for +name+. Previously a
      # special case without that handler raised NoMethodError on nil.
      def use_special_case(name)
        special_case = SpecialCases.case_for_url(@url)
        return nil unless special_case

        handler = special_case[name]
        handler.call(@doc) if handler
      end

      # Yields (tag_name, element) for every accepted element in the document.
      def loop_over_elements(&block)
        elements.each do |name, elems|
          elems.each { |elem| yield name, elem }
        end
      end

      # Returns a Hash mapping each accepted tag name to the Array of matching
      # elements. The document is queried again on every call.
      def elements
        ACCEPTED_ELEMENTS.each_with_object({}) do |tag, found|
          found[tag] = @doc.css(tag).to_a
        end
      end

    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module JobParser
  module Facets
    # Facet that extracts the job location from the page's plain text.
    class Location < Facet
      # Returns the special-case result when one exists; otherwise the first
      # capture of LOCATION_REGEX in the cleaned plain text with whitespace
      # stripped, or "" when the pattern does not match.
      def parse
        override = use_special_case(:location)
        return override unless override.nil?

        found = LOCATION_REGEX.match(Cleaner.clean_text(@plain_text))
        return "" unless found

        Cleaner.strip_all_white_space(found[1].to_s)
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module JobParser
  module Facets
    # Facet that extracts a numeric salary range from the page's plain text.
    class Salary < Facet
      # Returns the special-case result when one exists; otherwise a
      # two-element Array [first, second] of Integers taken from the two
      # captures of SALARY_GROUP_REGEX, or nil when no salary range is found.
      def parse
        special_case_result = use_special_case(:salary)
        return special_case_result unless special_case_result.nil?

        # The match only depends on @plain_text, so it is done once rather than
        # once per DOM element. That also means a page with no accepted
        # elements is no longer skipped.
        found = SALARY_REGEX.match(@plain_text)
        return nil unless found

        # Commas and whitespace are removed so the captures convert with to_i.
        SALARY_GROUP_REGEX.match(found.to_s.gsub(CLEAN_SALARY_REGEX, "")) do |range|
          [range[1].to_i, range[2].to_i]
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module JobParser
  module Facets
    # Facet that extracts the salary as display text from the page's plain text.
    class SalaryString < Facet
      # Returns the special-case result when one exists; otherwise the text
      # matched by SALARY_STRING_REGEX with whitespace stripped, or "" when
      # the pattern does not match.
      def parse
        special_case_result = use_special_case(:salary_string)
        return special_case_result unless special_case_result.nil?

        # The match only depends on @plain_text, so it is done once rather than
        # once per DOM element. That also means a page with no accepted
        # elements is no longer skipped.
        found = SALARY_STRING_REGEX.match(@plain_text)
        Cleaner.strip_all_white_space(found ? found.to_s : "")
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module JobParser
  module Facets
    # Facet that picks the most likely job title by feeding candidate strings
    # and weights into a Scorer.
    class Title < Facet
      # Returns the special-case result when one exists; otherwise the
      # Scorer's top match with surrounding whitespace and NBSP removed.
      def parse
        special_case_result = use_special_case(:title)
        return special_case_result unless special_case_result.nil?

        title_scorer = Scorer.new
        # at_css returns nil when the page has no <title>; fall back to an
        # empty page title instead of raising NoMethodError on nil.
        title_node = @doc.at_css("title")
        page_title = title_node ? title_node.content : ""
        title_scorer.store(page_title, 20).and_score_now

        loop_over_elements do |name, elem|
          raw = elem.content
          # skip blank elements and anything longer than ten words
          next if raw.strip.empty? || raw.split(" ").length > 10
          content = Cleaner.strip_all_white_space(raw)

          # check the ID of the element for matches
          title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)

          # or if a heading element matches the page title
          if elem_is_heading?(name)
            title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
          end

          title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)

          # when the element matches VACANCY_TITLE_REGEX, its next sibling
          # element is also offered as a candidate
          score_following_element(title_scorer, elem) if VACANCY_TITLE_REGEX.match(content)
        end

        title_scorer.top_match.strip.gsub(NBSP, "")
      end

      private

      # True when +name+ is one of the heading tags h1-h5.
      def elem_is_heading?(name)
        %w{h1 h2 h3 h4 h5}.include?(name)
      end

      # Stores the cleaned content of the element following +elem+ with weight
      # 30, conditional on that element's tag being in ACCEPTED_ELEMENTS.
      # Does nothing when there is no next element or its content is empty.
      def score_following_element(scorer, elem)
        following = elem.next_element
        return unless following

        next_content = Cleaner.strip_all_white_space(following.content)
        return if next_content.empty?

        scorer.store(next_content, 30).if_block_true {
          ACCEPTED_ELEMENTS.include?(following.name)
        }
      end
    end
  end
end
|
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require 'jobparser/regex.rb'
|
3
2
|
require "nokogiri"
|
4
3
|
module JobParser
|
5
4
|
class ParseHtml
|
@@ -26,15 +25,6 @@ module JobParser
|
|
26
25
|
|
27
26
|
private
|
28
27
|
|
29
|
-
def job_location
|
30
|
-
special_case_result = use_special_case(:location)
|
31
|
-
return special_case_result unless special_case_result.nil?
|
32
|
-
|
33
|
-
LOCATION_REGEX.match(Cleaner.clean_plain_text(@plain_text)) { |m|
|
34
|
-
Cleaner.strip_string(m[1].to_s)
|
35
|
-
} || ""
|
36
|
-
end
|
37
|
-
|
38
28
|
def strip_html
|
39
29
|
doc = @doc.dup
|
40
30
|
blacklist = ['title', 'script', 'style', 'button']
|
@@ -45,119 +35,24 @@ module JobParser
|
|
45
35
|
nodelist.text
|
46
36
|
end
|
47
37
|
|
48
|
-
def
|
49
|
-
|
50
|
-
elems.each do |elem|
|
51
|
-
yield name, elem
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
def clean_array(ary, type = nil)
|
57
|
-
Cleaner.new(ary, :type => type).clean
|
58
|
-
end
|
59
|
-
|
60
|
-
def use_special_case(name)
|
61
|
-
if special_case = SpecialCases.case_for_url(@url)
|
62
|
-
special_case[name].call(@doc)
|
63
|
-
end
|
38
|
+
def job_location
|
39
|
+
Facets::Location.new(@doc, @url, @plain_text).parse
|
64
40
|
end
|
65
41
|
|
66
42
|
def job_salary_string
|
67
|
-
|
68
|
-
return special_case_result unless special_case_result.nil?
|
69
|
-
|
70
|
-
salary = ""
|
71
|
-
loop_over_elements do |name, elem|
|
72
|
-
SALARY_STRING_REGEX.match(@plain_text) { |m|
|
73
|
-
salary = m.to_s
|
74
|
-
}
|
75
|
-
end
|
76
|
-
Cleaner.strip_string(salary)
|
43
|
+
Facets::SalaryString.new(@doc, @url, @plain_text).parse
|
77
44
|
end
|
78
45
|
|
79
46
|
def job_salary
|
80
|
-
|
81
|
-
return special_case_result unless special_case_result.nil?
|
82
|
-
|
83
|
-
salary = ""
|
84
|
-
loop_over_elements do |name, elem|
|
85
|
-
SALARY_REGEX.match(@plain_text) { |m|
|
86
|
-
salary = m.to_s
|
87
|
-
}
|
88
|
-
end
|
89
|
-
salary.empty? ? nil : Cleaner.clean_salary(salary)
|
47
|
+
Facets::Salary.new(@doc, @url, @plain_text).parse
|
90
48
|
end
|
91
49
|
|
92
50
|
def job_title
|
93
|
-
|
94
|
-
return special_case_result unless special_case_result.nil?
|
95
|
-
|
96
|
-
title_scorer = Scorer.new
|
97
|
-
page_title = @doc.at_css("title").content
|
98
|
-
title_scorer.store(page_title, 20).and_score_now
|
99
|
-
|
100
|
-
# http://stackoverflow.com/questions/4476047/how-to-make-nokogiri-not-to-convert-nbsp-to-space
|
101
|
-
nbsp = Nokogiri::HTML(" ").text
|
102
|
-
|
103
|
-
# first see if we find something with a matching id
|
104
|
-
loop_over_elements do |name, elem|
|
105
|
-
# check the ID of the elements for matches
|
106
|
-
next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
|
107
|
-
content = Cleaner.strip_string(elem.content)
|
108
|
-
title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
|
109
|
-
|
110
|
-
# or if a heading element matches the page title
|
111
|
-
if elem_is_heading?(name)
|
112
|
-
title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
|
113
|
-
end
|
114
|
-
|
115
|
-
title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
|
116
|
-
|
117
|
-
VACANCY_TITLE_REGEX.match(content) {
|
118
|
-
if elem.next_element && !Cleaner.strip_string(elem.next_element.content).empty?
|
119
|
-
next_content = Cleaner.strip_string(elem.next_element.content)
|
120
|
-
title_scorer.store(next_content, 30).if_block_true {
|
121
|
-
ACCEPTED_ELEMENTS.include?(elem.next_element.name)
|
122
|
-
}
|
123
|
-
end
|
124
|
-
}
|
125
|
-
end
|
126
|
-
|
127
|
-
title_scorer.top_match.strip.gsub(nbsp, "")
|
51
|
+
# Pass @plain_text like the other facet delegators; @pplain_text was a typo.
Facets::Title.new(@doc, @url, @plain_text).parse
|
128
52
|
end
|
129
53
|
|
130
54
|
def apply_link
|
131
|
-
|
132
|
-
anchor_elements.each do |anchor|
|
133
|
-
APPLY_LINK_REGEX.match(anchor.content) { link = anchor }
|
134
|
-
end
|
135
|
-
if link
|
136
|
-
Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
|
137
|
-
else
|
138
|
-
@url
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
|
143
|
-
def elem_is_heading?(name)
|
144
|
-
%w{h1 h2 h3 h4 h5}.include?(name)
|
145
|
-
end
|
146
|
-
|
147
|
-
def heading_elements
|
148
|
-
elements.select { |elem| elem_is_heading?(elem) }
|
149
|
-
end
|
150
|
-
|
151
|
-
def anchor_elements
|
152
|
-
elements["a"]
|
153
|
-
end
|
154
|
-
|
155
|
-
def elements
|
156
|
-
{}.tap do |response|
|
157
|
-
ACCEPTED_ELEMENTS.each do |elem|
|
158
|
-
response[elem] = doc.css(elem).to_a
|
159
|
-
end
|
160
|
-
end
|
55
|
+
Facets::Apply.new(@doc, @url, @plain_text).parse
|
161
56
|
end
|
162
57
|
end
|
163
58
|
end
|
data/lib/jobparser/regex.rb
CHANGED
@@ -9,6 +9,8 @@ module JobParser
|
|
9
9
|
APPLY_LINK_REGEX = /^apply|submit an application|application form/i
|
10
10
|
NBSP = Nokogiri::HTML(" ").text
|
11
11
|
LOCATION_REGEX = /(?:location: )([\D]*)$/i
|
12
|
+
SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
|
13
|
+
CLEAN_SALARY_REGEX = /,|\s/
|
12
14
|
|
13
15
|
# words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
|
14
16
|
# could scope this regex just to headers
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
@@ -4,6 +4,12 @@ require "jobparser/parseurl"
|
|
4
4
|
require "jobparser/cleaner"
|
5
5
|
require "jobparser/scorer"
|
6
6
|
require "jobparser/specialcases"
|
7
|
+
require "jobparser/facets/facet"
|
8
|
+
require "jobparser/facets/salary"
|
9
|
+
require "jobparser/facets/salarystring"
|
10
|
+
require "jobparser/facets/location"
|
11
|
+
require "jobparser/facets/apply"
|
12
|
+
require "jobparser/facets/title"
|
7
13
|
|
8
14
|
module JobParser
|
9
15
|
# Your code goes here...
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -99,6 +99,12 @@ extensions: []
|
|
99
99
|
extra_rdoc_files: []
|
100
100
|
files:
|
101
101
|
- lib/jobparser/cleaner.rb
|
102
|
+
- lib/jobparser/facets/apply.rb
|
103
|
+
- lib/jobparser/facets/facet.rb
|
104
|
+
- lib/jobparser/facets/location.rb
|
105
|
+
- lib/jobparser/facets/salary.rb
|
106
|
+
- lib/jobparser/facets/salarystring.rb
|
107
|
+
- lib/jobparser/facets/title.rb
|
102
108
|
- lib/jobparser/parsehtml.rb
|
103
109
|
- lib/jobparser/parseurl.rb
|
104
110
|
- lib/jobparser/regex.rb
|