jobparser 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/jobparser/cleaner.rb +4 -1
- data/lib/jobparser/parsehtml.rb +19 -1
- data/lib/jobparser/specialcases.rb +56 -0
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +1 -0
- metadata +3 -2
data/lib/jobparser/cleaner.rb
CHANGED
@@ -7,7 +7,6 @@ module JobParser
|
|
7
7
|
|
8
8
|
SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
|
9
9
|
CLEAN_SALARY_REGEX = /,|\s/
|
10
|
-
NBSP = Nokogiri::HTML(" ").text
|
11
10
|
|
12
11
|
def initialize(ary, opts = {})
|
13
12
|
@subject = ary
|
@@ -28,6 +27,10 @@ module JobParser
|
|
28
27
|
str.gsub('/n', '').gsub(NBSP, '').strip
|
29
28
|
end
|
30
29
|
|
30
|
+
# Prepares page text for regex matching: drops every carriage return and
# tab, then turns each NBSP occurrence into an ordinary space.
#
# str - String of plain text.
#
# Returns a new String; +str+ itself is not modified.
def self.clean_plain_text(str)
  without_control_chars = str.delete("\r\t")
  without_control_chars.gsub(NBSP, " ")
end
|
33
|
+
|
31
34
|
def self.make_link_absolute(url, href)
|
32
35
|
if href.include?("http")
|
33
36
|
href
|
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -27,7 +27,10 @@ module JobParser
|
|
27
27
|
private
|
28
28
|
|
29
29
|
# Extracts the job location.
#
# A site-specific handler wins whenever it yields a non-nil value. Otherwise
# LOCATION_REGEX is run against the cleaned plain text and its first capture
# group is used.
#
# Returns the location String, or "" when the regex does not match.
def job_location
  from_special_case = use_special_case(:location)
  return from_special_case unless from_special_case.nil?

  cleaned_text = Cleaner.clean_plain_text(@plain_text)
  match = LOCATION_REGEX.match(cleaned_text)
  (match && Cleaner.strip_string(match[1].to_s)) || ""
end
|
@@ -54,7 +57,16 @@ module JobParser
|
|
54
57
|
Cleaner.new(ary, :type => type).clean
|
55
58
|
end
|
56
59
|
|
60
|
+
# Runs the site-specific extractor registered for this page's URL, if any.
#
# name - Symbol key of the field to extract (callers in this file pass
#        :title, :location, :salary_string and :salary).
#
# Returns the handler's result, or nil when no special case matches @url or
# the matching special case defines no handler for +name+. Callers treat nil
# as "fall back to generic parsing".
def use_special_case(name)
  special_case = SpecialCases.case_for_url(@url)
  return nil unless special_case

  # A site entry is allowed to cover only some fields; a missing handler
  # must not blow up with NoMethodError on nil.
  handler = special_case[name]
  handler && handler.call(@doc)
end
|
65
|
+
|
57
66
|
def job_salary_string
|
67
|
+
special_case_result = use_special_case(:salary_string)
|
68
|
+
return special_case_result unless special_case_result.nil?
|
69
|
+
|
58
70
|
salary = ""
|
59
71
|
loop_over_elements do |name, elem|
|
60
72
|
SALARY_STRING_REGEX.match(@plain_text) { |m|
|
@@ -65,6 +77,9 @@ module JobParser
|
|
65
77
|
end
|
66
78
|
|
67
79
|
def job_salary
|
80
|
+
special_case_result = use_special_case(:salary)
|
81
|
+
return special_case_result unless special_case_result.nil?
|
82
|
+
|
68
83
|
salary = ""
|
69
84
|
loop_over_elements do |name, elem|
|
70
85
|
SALARY_REGEX.match(@plain_text) { |m|
|
@@ -75,6 +90,9 @@ module JobParser
|
|
75
90
|
end
|
76
91
|
|
77
92
|
def job_title
|
93
|
+
special_case_result = use_special_case(:title)
|
94
|
+
return special_case_result unless special_case_result.nil?
|
95
|
+
|
78
96
|
title_scorer = Scorer.new
|
79
97
|
page_title = @doc.at_css("title").content
|
80
98
|
title_scorer.store(page_title, 20).and_score_now
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module JobParser
  # Site-specific extraction rules for pages where generic parsing is not
  # good enough. Each entry maps a URL fragment to a Hash of field name
  # (Symbol) => Proc. Every Proc takes the parsed document (anything that
  # responds to #css the way a Nokogiri document does) and returns the
  # extracted value.
  class SpecialCases
    # Two figures separated by at least one non-digit character.
    #
    # The first figure sits in an atomic group so the regex engine cannot
    # backtrack into it: without that, a single figure such as "20,000" gets
    # split into "20,00" and "0" and is reported as a bogus range.
    SALARY_RANGE_REGEX = /((?>\d[\d,]*(?:\.\d+)?))\D+(\d[\d,]*(?:\.\d+)?)/

    # Returns the Hash of URL fragment (String) => { field => Proc }.
    #
    # Handler results for "jobsearch.direct.gov.uk":
    #   :title         - content of the second ".jobViewContent h2", or nil
    #                    when the page has no such heading.
    #   :location      - content of the element following the "Location" dt,
    #                    or "" when it is absent.
    #   :salary_string - stripped content of the element following the
    #                    "Salary" dt, or "" when it is absent.
    #   :salary        - [low, high] Integers parsed from :salary_string, or
    #                    nil when the string does not contain two figures.
    def self.dictionary
      {
        "jobsearch.direct.gov.uk" => {
          :title => Proc.new { |doc|
            heading = doc.css(".jobViewContent h2")[1]
            heading && heading.content
          },
          :location => Proc.new { |doc|
            summary_value(doc, "Location") || ""
          },
          :salary_string => Proc.new { |doc|
            raw = summary_value(doc, "Salary")
            raw.nil? ? "" : Cleaner.strip_string(raw)
          },
          :salary => Proc.new { |doc|
            # Reuse the salary_string special case to get the text to parse.
            salary_string = SpecialCases.call_special_case("jobsearch.direct.gov.uk", :salary_string, doc)
            SALARY_RANGE_REGEX.match(salary_string) { |m|
              # to_i keeps only the whole part, e.g. "7.50" becomes 7.
              [m[1].delete(",").to_i, m[2].delete(",").to_i]
            }
          }
        }
      }
    end

    # Invokes one handler directly.
    #
    # key    - String dictionary key (URL fragment).
    # method - Symbol field name.
    # doc    - parsed document handed to the handler.
    #
    # Returns the handler's result.
    def self.call_special_case(key, method, doc)
      self.dictionary[key][method].call(doc)
    end

    # Finds the handlers for the first dictionary key contained in +url+.
    #
    # Returns the handler Hash, or false when no key matches.
    def self.case_for_url(url)
      # Build the dictionary once instead of once per key.
      dictionary.each do |fragment, handlers|
        return handlers if url.include?(fragment)
      end
      false
    end

    # Looks up a value in the ".jobViewSummary" definition list.
    #
    # Returns the content of the element following the first dt whose content
    # equals +label+, or nil when there is no such dt or it has no following
    # element.
    def self.summary_value(doc, label)
      doc.css(".jobViewSummary dl dt").each do |dt|
        next unless dt.content == label
        value_node = dt.next_element
        return value_node && value_node.content
      end
      nil
    end
    private_class_method :summary_value
  end
end
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -103,6 +103,7 @@ files:
|
|
103
103
|
- lib/jobparser/parseurl.rb
|
104
104
|
- lib/jobparser/regex.rb
|
105
105
|
- lib/jobparser/scorer.rb
|
106
|
+
- lib/jobparser/specialcases.rb
|
106
107
|
- lib/jobparser/version.rb
|
107
108
|
- lib/jobparser.rb
|
108
109
|
homepage: ''
|