jobparser 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jobparser/cleaner.rb +6 -37
- data/lib/jobparser/facets/apply.rb +18 -0
- data/lib/jobparser/facets/facet.rb +41 -0
- data/lib/jobparser/facets/location.rb +15 -0
- data/lib/jobparser/facets/salary.rb +20 -0
- data/lib/jobparser/facets/salarystring.rb +18 -0
- data/lib/jobparser/facets/title.rb +46 -0
- data/lib/jobparser/parsehtml.rb +6 -111
- data/lib/jobparser/regex.rb +2 -0
- data/lib/jobparser/specialcases.rb +1 -1
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +6 -0
- metadata +8 -2
data/lib/jobparser/cleaner.rb
CHANGED
@@ -5,30 +5,16 @@ require "jobparser/regex"
|
|
5
5
|
module JobParser
|
6
6
|
class Cleaner
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
def initialize(ary, opts = {})
|
12
|
-
@subject = ary
|
13
|
-
@type = opts[:type]
|
14
|
-
end
|
15
|
-
|
16
|
-
def clean
|
17
|
-
clean_array
|
8
|
+
# Removes newlines and non-breaking spaces from +str+, applies
# Cleaner.clean_text, and trims leading/trailing whitespace.
#
# The previous implementation called gsub('\n', ''). In a single-quoted
# Ruby string '\n' is the two characters backslash + "n", so real newline
# characters were never removed. A double-quoted "\n" is used here.
#
# @param str [String] raw text, e.g. the content of an HTML node
# @return [String] the cleaned, stripped text
def self.strip_all_white_space(str)
  without_newlines = str.delete("\n")
  Cleaner.clean_text(Cleaner.remove_nbsp(without_newlines)).strip
end
|
19
11
|
|
20
|
-
def self.
|
21
|
-
|
22
|
-
[match[1].to_i, match[2].to_i]
|
23
|
-
}
|
24
|
-
end
|
25
|
-
|
26
|
-
def self.strip_string(str)
|
27
|
-
str.gsub('/n', '').gsub(NBSP, '').strip
|
12
|
+
# Drops carriage returns and tabs from +str+ and replaces every NBSP
# occurrence with a plain space.
#
# @param str [String] text to normalise
# @return [String] a new string; +str+ itself is not modified
def self.clean_text(str)
  without_control_chars = str.gsub(/\r|\t/) { "" }
  without_control_chars.gsub(NBSP) { " " }
end
|
29
15
|
|
30
|
-
def self.
|
31
|
-
str.gsub(
|
16
|
+
# Returns a copy of +str+ with every occurrence of NBSP removed.
#
# @param str [String] text that may contain NBSP
# @return [String] a new string without NBSP
def self.remove_nbsp(str)
  str.gsub(NBSP) { "" }
end
|
33
19
|
|
34
20
|
def self.make_link_absolute(url, href)
|
@@ -45,22 +31,5 @@ module JobParser
|
|
45
31
|
end
|
46
32
|
end
|
47
33
|
|
48
|
-
private
|
49
|
-
|
50
|
-
def clean_array
|
51
|
-
@subject.select { |item|
|
52
|
-
not_whitespace_or_empty(item)
|
53
|
-
}.map { |item|
|
54
|
-
clean_string(item) if item.is_a?(String)
|
55
|
-
}.uniq
|
56
|
-
end
|
57
|
-
|
58
|
-
def clean_string(str)
|
59
|
-
self.class.strip_string(str)
|
60
|
-
end
|
61
|
-
|
62
|
-
def not_whitespace_or_empty(item)
|
63
|
-
/^\s+$/.match(item) == nil && !item.empty?
|
64
|
-
end
|
65
34
|
end
|
66
35
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module JobParser
  module Facets
    # Finds the link a candidate should follow to apply for the job.
    class Apply < Facet
      # Scans every <a> element and keeps the last one whose text matches
      # APPLY_LINK_REGEX.
      #
      # @return the matching anchor's href (spaces encoded as %20) passed
      #   through Cleaner.make_link_absolute, or the page URL itself when
      #   no anchor matches
      def parse
        candidates = elements["a"].select do |anchor|
          APPLY_LINK_REGEX.match(anchor.content)
        end
        chosen = candidates.last
        return @url unless chosen

        href = chosen.attributes["href"].to_s.gsub(" ", "%20")
        Cleaner.make_link_absolute(@url, href)
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module JobParser
  module Facets
    # Base class for the individual facets (title, salary, location, ...)
    # extracted from a job listing. Subclasses implement #parse and use the
    # private helpers below.
    class Facet
      # Tag names whose nodes are collected by #elements.
      ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}.freeze

      attr_reader :doc, :url, :plain_text

      # @param doc the parsed document; it must respond to #css
      # @param url the URL the document was fetched from
      # @param plain_text the text of the document
      def initialize(doc, url, plain_text)
        @doc = doc
        @url = url
        @plain_text = plain_text
      end

      private

      # Runs the site-specific handler for +name+, if there is one.
      #
      # @param name [Symbol] facet key, e.g. :title or :salary
      # @return the handler's result, or nil when no special case is
      #   registered for the URL or the special case has no handler for
      #   +name+ (callers treat nil as "use the generic parser")
      def use_special_case(name)
        special_case = SpecialCases.case_for_url(@url)
        return nil unless special_case

        # A special case may cover only some facets; a missing handler
        # must fall through instead of raising NoMethodError on nil.
        handler = special_case[name]
        handler.call(@doc) if handler
      end

      # Yields every collected node together with its tag name.
      #
      # @yieldparam name [String] the tag name from ACCEPTED_ELEMENTS
      # @yieldparam elem a node returned by @doc.css(name)
      def loop_over_elements
        elements.each do |name, elems|
          elems.each { |elem| yield name, elem }
        end
      end

      # Builds a hash mapping each accepted tag name to the array of
      # matching nodes. The document is queried again on every call.
      #
      # @return [Hash{String => Array}]
      def elements
        ACCEPTED_ELEMENTS.each_with_object({}) do |tag, found|
          found[tag] = @doc.css(tag).to_a
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module JobParser
  module Facets
    # Extracts the job location from the page text.
    class Location < Facet
      # @return the special-case result when one is available; otherwise
      #   the first capture of LOCATION_REGEX (whitespace-stripped), or ""
      #   when the pattern does not match
      def parse
        override = use_special_case(:location)
        return override unless override.nil?

        found = LOCATION_REGEX.match(Cleaner.clean_text(@plain_text))
        return "" unless found

        Cleaner.strip_all_white_space(found[1].to_s)
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module JobParser
  module Facets
    # Extracts a numeric salary range from the page text.
    class Salary < Facet
      # The previous version re-ran the same SALARY_REGEX match against
      # @plain_text once per document element without using the element,
      # and never ran it at all when the document had no accepted
      # elements. The match depends only on @plain_text, so it is done
      # once here.
      #
      # @return the special-case result when one is available; otherwise
      #   a two-element Array of Integers taken from the two captures of
      #   SALARY_GROUP_REGEX, or nil when no salary range is found
      def parse
        special_case_result = use_special_case(:salary)
        return special_case_result unless special_case_result.nil?

        found = SALARY_REGEX.match(@plain_text)
        return nil unless found

        # Commas and whitespace are removed before the two amounts are
        # captured so that to_i sees plain digit runs.
        cleaned = found.to_s.gsub(CLEAN_SALARY_REGEX, "")
        SALARY_GROUP_REGEX.match(cleaned) do |range|
          [range[1].to_i, range[2].to_i]
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module JobParser
  module Facets
    # Extracts the salary as display text from the page text.
    class SalaryString < Facet
      # The previous version re-ran the same SALARY_STRING_REGEX match
      # against @plain_text once per document element without using the
      # element, and never ran it when the document had no accepted
      # elements. The match depends only on @plain_text, so it is done
      # once here.
      #
      # @return the special-case result when one is available; otherwise
      #   the matched salary text (whitespace-stripped), or "" when the
      #   pattern does not match
      def parse
        special_case_result = use_special_case(:salary_string)
        return special_case_result unless special_case_result.nil?

        found = SALARY_STRING_REGEX.match(@plain_text)
        # nil.to_s is "", which keeps the "no match" result an empty string.
        Cleaner.strip_all_white_space(found.to_s)
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module JobParser
  module Facets
    # Chooses the most likely job title by feeding candidate strings and
    # weights into a Scorer.
    class Title < Facet
      # Tag names treated as headings when comparing against the page title.
      HEADING_ELEMENTS = %w{h1 h2 h3 h4 h5}.freeze

      # @return the special-case result when one is available; otherwise
      #   the scorer's top match with surrounding whitespace and NBSP
      #   removed
      def parse
        special_case_result = use_special_case(:title)
        return special_case_result unless special_case_result.nil?

        title_scorer = Scorer.new

        # A page without a <title> element used to raise NoMethodError on
        # nil here; now it simply contributes no page-title candidate.
        title_node = @doc.at_css("title")
        page_title = title_node ? title_node.content : ""
        title_scorer.store(page_title, 20).and_score_now if title_node

        loop_over_elements do |name, elem|
          raw = elem.content
          # Skip blank nodes and anything longer than ten words.
          next if raw.strip.empty? || raw.split(" ").length > 10

          content = Cleaner.strip_all_white_space(raw)

          # The element's id attribute looks like a job-title id.
          title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)

          # A heading whose text appears inside the page title.
          if elem_is_heading?(name)
            title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
          end

          # The text itself matches JOB_TITLE_WORDS.
          title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)

          # The text matches VACANCY_TITLE_REGEX, so the following sibling
          # is offered as a candidate.
          score_following_element(title_scorer, elem) if VACANCY_TITLE_REGEX.match(content)
        end

        # NOTE(review): to_s guards against top_match returning nil when
        # nothing was scored - confirm against Scorer.
        title_scorer.top_match.to_s.strip.gsub(NBSP, "")
      end

      private

      # Offers the cleaned text of the element following +elem+ as a
      # candidate, provided that text is not empty and the sibling's tag is
      # one of ACCEPTED_ELEMENTS.
      def score_following_element(title_scorer, elem)
        sibling = elem.next_element
        return unless sibling

        next_content = Cleaner.strip_all_white_space(sibling.content)
        return if next_content.empty?

        title_scorer.store(next_content, 30).if_block_true {
          ACCEPTED_ELEMENTS.include?(sibling.name)
        }
      end

      # @param name [String] a tag name
      # @return [Boolean] true for h1 through h5
      def elem_is_heading?(name)
        HEADING_ELEMENTS.include?(name)
      end
    end
  end
end
|
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require 'jobparser/regex.rb'
|
3
2
|
require "nokogiri"
|
4
3
|
module JobParser
|
5
4
|
class ParseHtml
|
@@ -26,15 +25,6 @@ module JobParser
|
|
26
25
|
|
27
26
|
private
|
28
27
|
|
29
|
-
def job_location
|
30
|
-
special_case_result = use_special_case(:location)
|
31
|
-
return special_case_result unless special_case_result.nil?
|
32
|
-
|
33
|
-
LOCATION_REGEX.match(Cleaner.clean_plain_text(@plain_text)) { |m|
|
34
|
-
Cleaner.strip_string(m[1].to_s)
|
35
|
-
} || ""
|
36
|
-
end
|
37
|
-
|
38
28
|
def strip_html
|
39
29
|
doc = @doc.dup
|
40
30
|
blacklist = ['title', 'script', 'style', 'button']
|
@@ -45,119 +35,24 @@ module JobParser
|
|
45
35
|
nodelist.text
|
46
36
|
end
|
47
37
|
|
48
|
-
def
|
49
|
-
|
50
|
-
elems.each do |elem|
|
51
|
-
yield name, elem
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
def clean_array(ary, type = nil)
|
57
|
-
Cleaner.new(ary, :type => type).clean
|
58
|
-
end
|
59
|
-
|
60
|
-
def use_special_case(name)
|
61
|
-
if special_case = SpecialCases.case_for_url(@url)
|
62
|
-
special_case[name].call(@doc)
|
63
|
-
end
|
38
|
+
# Delegates location extraction to Facets::Location.
def job_location
  location_facet = Facets::Location.new(@doc, @url, @plain_text)
  location_facet.parse
end
|
65
41
|
|
66
42
|
# Delegates salary-text extraction to Facets::SalaryString.
def job_salary_string
  salary_string_facet = Facets::SalaryString.new(@doc, @url, @plain_text)
  salary_string_facet.parse
end
|
78
45
|
|
79
46
|
# Delegates salary-range extraction to Facets::Salary.
def job_salary
  salary_facet = Facets::Salary.new(@doc, @url, @plain_text)
  salary_facet.parse
end
|
91
49
|
|
92
50
|
# Delegates title extraction to Facets::Title.
#
# The third argument used to be @pplain_text, a misspelled instance
# variable that is never assigned and therefore evaluates to nil. It now
# passes @plain_text, matching the other facet delegations in this class.
def job_title
  Facets::Title.new(@doc, @url, @plain_text).parse
end
|
129
53
|
|
130
54
|
def apply_link
|
131
|
-
|
132
|
-
anchor_elements.each do |anchor|
|
133
|
-
APPLY_LINK_REGEX.match(anchor.content) { link = anchor }
|
134
|
-
end
|
135
|
-
if link
|
136
|
-
Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
|
137
|
-
else
|
138
|
-
@url
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
|
143
|
-
def elem_is_heading?(name)
|
144
|
-
%w{h1 h2 h3 h4 h5}.include?(name)
|
145
|
-
end
|
146
|
-
|
147
|
-
def heading_elements
|
148
|
-
elements.select { |elem| elem_is_heading?(elem) }
|
149
|
-
end
|
150
|
-
|
151
|
-
def anchor_elements
|
152
|
-
elements["a"]
|
153
|
-
end
|
154
|
-
|
155
|
-
def elements
|
156
|
-
{}.tap do |response|
|
157
|
-
ACCEPTED_ELEMENTS.each do |elem|
|
158
|
-
response[elem] = doc.css(elem).to_a
|
159
|
-
end
|
160
|
-
end
|
55
|
+
Facets::Apply.new(@doc, @url, @plain_text).parse
|
161
56
|
end
|
162
57
|
end
|
163
58
|
end
|
data/lib/jobparser/regex.rb
CHANGED
@@ -9,6 +9,8 @@ module JobParser
|
|
9
9
|
APPLY_LINK_REGEX = /^apply|submit an application|application form/i
|
10
10
|
NBSP = Nokogiri::HTML(" ").text
|
11
11
|
LOCATION_REGEX = /(?:location: )([\D]*)$/i
|
12
|
+
SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
|
13
|
+
CLEAN_SALARY_REGEX = /,|\s/
|
12
14
|
|
13
15
|
# words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
|
14
16
|
# could scope this regex just to headers
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
@@ -4,6 +4,12 @@ require "jobparser/parseurl"
|
|
4
4
|
require "jobparser/cleaner"
|
5
5
|
require "jobparser/scorer"
|
6
6
|
require "jobparser/specialcases"
|
7
|
+
require "jobparser/facets/facet"
|
8
|
+
require "jobparser/facets/salary"
|
9
|
+
require "jobparser/facets/salarystring"
|
10
|
+
require "jobparser/facets/location"
|
11
|
+
require "jobparser/facets/apply"
|
12
|
+
require "jobparser/facets/title"
|
7
13
|
|
8
14
|
module JobParser
|
9
15
|
# Your code goes here...
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -99,6 +99,12 @@ extensions: []
|
|
99
99
|
extra_rdoc_files: []
|
100
100
|
files:
|
101
101
|
- lib/jobparser/cleaner.rb
|
102
|
+
- lib/jobparser/facets/apply.rb
|
103
|
+
- lib/jobparser/facets/facet.rb
|
104
|
+
- lib/jobparser/facets/location.rb
|
105
|
+
- lib/jobparser/facets/salary.rb
|
106
|
+
- lib/jobparser/facets/salarystring.rb
|
107
|
+
- lib/jobparser/facets/title.rb
|
102
108
|
- lib/jobparser/parsehtml.rb
|
103
109
|
- lib/jobparser/parseurl.rb
|
104
110
|
- lib/jobparser/regex.rb
|