jobparser 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jobparser/cleaner.rb +4 -1
- data/lib/jobparser/parsehtml.rb +19 -1
- data/lib/jobparser/specialcases.rb +56 -0
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +1 -0
- metadata +3 -2
data/lib/jobparser/cleaner.rb
CHANGED
@@ -7,7 +7,6 @@ module JobParser
|
|
7
7
|
|
8
8
|
SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
|
9
9
|
CLEAN_SALARY_REGEX = /,|\s/
|
10
|
-
NBSP = Nokogiri::HTML(" ").text
|
11
10
|
|
12
11
|
def initialize(ary, opts = {})
|
13
12
|
@subject = ary
|
@@ -28,6 +27,10 @@ module JobParser
|
|
28
27
|
str.gsub('/n', '').gsub(NBSP, '').strip
|
29
28
|
end
|
30
29
|
|
30
|
+
# Prepares plain page text for regex matching.
#
# Removes every carriage return and tab character from +str+ and replaces
# each occurrence of NBSP with a single ordinary space. Newlines are kept.
#
# Returns a new String; +str+ itself is not modified.
def self.clean_plain_text(str)
  without_control_chars = str.delete("\r\t")
  without_control_chars.gsub(NBSP, " ")
end
|
33
|
+
|
31
34
|
def self.make_link_absolute(url, href)
|
32
35
|
if href.include?("http")
|
33
36
|
href
|
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -27,7 +27,10 @@ module JobParser
|
|
27
27
|
private
|
28
28
|
|
29
29
|
# Determines the job's location.
#
# A site-specific special case is consulted first; any non-nil result from
# it is returned as-is. Otherwise LOCATION_REGEX is run against the cleaned
# plain text and the first capture group is stripped and returned.
#
# Returns the location String, or "" when the regex does not match.
def job_location
  override = use_special_case(:location)
  return override unless override.nil?

  cleaned_text = Cleaner.clean_plain_text(@plain_text)
  found = LOCATION_REGEX.match(cleaned_text)
  return "" unless found

  Cleaner.strip_string(found[1].to_s) || ""
end
|
@@ -54,7 +57,16 @@ module JobParser
|
|
54
57
|
Cleaner.new(ary, :type => type).clean
|
55
58
|
end
|
56
59
|
|
60
|
+
# Runs the site-specific extractor registered for the current URL, if any.
#
# name - Symbol naming the field to extract (the callers in this file pass
#        :location, :salary_string, :salary and :title).
#
# Returns the handler's result, or nil when no special case matches @url
# or when the matched special case defines no handler for +name+. Callers
# treat nil as "fall back to the generic parser", so a site that only
# overrides some fields must not raise here.
def use_special_case(name)
  special_case = SpecialCases.case_for_url(@url)
  return nil unless special_case

  handler = special_case[name]
  handler.call(@doc) if handler
end
|
65
|
+
|
57
66
|
def job_salary_string
|
67
|
+
special_case_result = use_special_case(:salary_string)
|
68
|
+
return special_case_result unless special_case_result.nil?
|
69
|
+
|
58
70
|
salary = ""
|
59
71
|
loop_over_elements do |name, elem|
|
60
72
|
SALARY_STRING_REGEX.match(@plain_text) { |m|
|
@@ -65,6 +77,9 @@ module JobParser
|
|
65
77
|
end
|
66
78
|
|
67
79
|
def job_salary
|
80
|
+
special_case_result = use_special_case(:salary)
|
81
|
+
return special_case_result unless special_case_result.nil?
|
82
|
+
|
68
83
|
salary = ""
|
69
84
|
loop_over_elements do |name, elem|
|
70
85
|
SALARY_REGEX.match(@plain_text) { |m|
|
@@ -75,6 +90,9 @@ module JobParser
|
|
75
90
|
end
|
76
91
|
|
77
92
|
def job_title
|
93
|
+
special_case_result = use_special_case(:title)
|
94
|
+
return special_case_result unless special_case_result.nil?
|
95
|
+
|
78
96
|
title_scorer = Scorer.new
|
79
97
|
page_title = @doc.at_css("title").content
|
80
98
|
title_scorer.store(page_title, 20).and_score_now
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module JobParser
  # Registry of site-specific extraction rules.
  #
  # Each entry maps a host fragment to a Hash of field name => Proc. Every
  # Proc takes the parsed document and returns the extracted value for that
  # field. A nil result tells the caller to fall back to generic parsing.
  class SpecialCases
    # Matches a "low <separator> high" pair of amounts. Each amount must
    # start with a digit, and the separator may not contain digits, commas
    # or dots. This stops the regex from backtracking inside a single number
    # (e.g. splitting "20,000" into "20,00" and "0").
    SALARY_RANGE_REGEX = /£?(\d[\d,\.]*)[^\d,\.]+£?(\d[\d,\.]*)/

    # Returns the (memoized, frozen) Hash of host fragment => handlers.
    def self.dictionary
      @dictionary ||= {
        "jobsearch.direct.gov.uk" => {
          # Content of the second <h2> inside .jobViewContent, or nil when
          # the page has fewer than two such headings.
          :title => Proc.new { |doc|
            heading = doc.css(".jobViewContent h2")[1]
            heading && heading.content
          },
          # Value listed next to the "Location" term, or "" when absent.
          :location => Proc.new { |doc|
            summary_value(doc, "Location") || ""
          },
          # Stripped value listed next to the "Salary" term, or "" when absent.
          :salary_string => Proc.new { |doc|
            raw = summary_value(doc, "Salary")
            raw ? Cleaner.strip_string(raw) : ""
          },
          # [low, high] Integers parsed from the salary string, or nil when
          # the string does not contain two separate amounts.
          :salary => Proc.new { |doc|
            # get string by calling salary_string special case
            salary_string = SpecialCases.call_special_case("jobsearch.direct.gov.uk", :salary_string, doc)
            parse_salary_range(salary_string)
          }
        }.freeze
      }.freeze
    end

    # Invokes the handler stored under dictionary[key][method] with +doc+.
    #
    # key    - String key of the dictionary entry.
    # method - Symbol naming the field handler.
    # doc    - The parsed document passed through to the handler.
    #
    # Returns whatever the handler returns.
    def self.call_special_case(key, method, doc)
      dictionary[key][method].call(doc)
    end

    # Finds the handlers for the first dictionary key contained in +url+.
    #
    # Returns the handlers Hash, or false when no key matches (nil urls are
    # treated as an empty string and therefore never match).
    def self.case_for_url(url)
      target = url.to_s
      dictionary.each do |host, handlers|
        return handlers if target.include?(host)
      end
      false
    end

    # Looks up the first <dt> in the job summary whose content equals +label+
    # and returns the content of the element following it, or nil when the
    # term or its following element is missing.
    def self.summary_value(doc, label)
      term = doc.css(".jobViewSummary dl dt").find { |dt| dt.content == label }
      value = term && term.next_element
      value && value.content
    end
    private_class_method :summary_value

    # Parses "low ... high" out of +salary_string+.
    #
    # Returns [low, high] as Integers (commas removed, fractional part
    # truncated by to_i), or nil when two separate amounts are not present.
    def self.parse_salary_range(salary_string)
      SALARY_RANGE_REGEX.match(salary_string.to_s) do |m|
        [m[1], m[2]].map { |amount| amount.delete(",").to_i }
      end
    end
    private_class_method :parse_salary_range
  end
end
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -103,6 +103,7 @@ files:
|
|
103
103
|
- lib/jobparser/parseurl.rb
|
104
104
|
- lib/jobparser/regex.rb
|
105
105
|
- lib/jobparser/scorer.rb
|
106
|
+
- lib/jobparser/specialcases.rb
|
106
107
|
- lib/jobparser/version.rb
|
107
108
|
- lib/jobparser.rb
|
108
109
|
homepage: ''
|