jobparser 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
@@ -6,14 +6,23 @@ module JobParser
|
|
6
6
|
return special_case_result unless special_case_result.nil?
|
7
7
|
|
8
8
|
salary = ""
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
SALARY_REGEX.match(@plain_text) { |m|
|
10
|
+
salary = m.to_s
|
11
|
+
}
|
12
|
+
final_salary = nil
|
13
|
+
if salary && !salary.empty?
|
14
|
+
p "got slaary"
|
15
|
+
p salary
|
16
|
+
SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
|
17
|
+
final_salary = [match[1].to_i, match[2].to_i]
|
18
|
+
}
|
19
|
+
else
|
20
|
+
SALARY_UP_TO_REGEX.match(@plain_text) { |m|
|
21
|
+
final_salary = [nil, m[3].gsub(",","").to_i]
|
12
22
|
}
|
13
23
|
end
|
14
|
-
|
15
|
-
|
16
|
-
} || nil
|
24
|
+
|
25
|
+
final_salary
|
17
26
|
end
|
18
27
|
end
|
19
28
|
end
|
@@ -6,11 +6,12 @@ module JobParser
|
|
6
6
|
return special_case_result unless special_case_result.nil?
|
7
7
|
|
8
8
|
salary = ""
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
}
|
9
|
+
SALARY_STRING_REGEX.match(@plain_text) { |m| salary = m.to_s }
|
10
|
+
|
11
|
+
if salary.empty?
|
12
|
+
SALARY_UP_TO_REGEX.match(@plain_text) { |m| salary = m.to_s }
|
13
13
|
end
|
14
|
+
|
14
15
|
Cleaner.strip_all_white_space(salary)
|
15
16
|
end
|
16
17
|
end
|
@@ -7,13 +7,15 @@ module JobParser
|
|
7
7
|
|
8
8
|
title_scorer = Scorer.new
|
9
9
|
page_title = @doc.at_css("title").content
|
10
|
-
title_scorer.
|
10
|
+
title_scorer.store_and_score(page_title, 20)
|
11
11
|
|
12
12
|
# first see if we find something with a matching id
|
13
13
|
loop_over_elements do |name, elem|
|
14
|
-
# check the ID of the elements for matches
|
15
14
|
next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
|
15
|
+
|
16
16
|
content = Cleaner.strip_all_white_space(elem.content)
|
17
|
+
|
18
|
+
# does the element have an id that means it might store the title?
|
17
19
|
title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
|
18
20
|
|
19
21
|
# or if a heading element matches the page title
|
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -4,13 +4,13 @@ module JobParser
|
|
4
4
|
class ParseHtml
|
5
5
|
ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
|
6
6
|
|
7
|
-
attr_reader :doc
|
7
|
+
attr_reader :doc, :plain_text
|
8
8
|
|
9
9
|
def initialize(html, from_url)
|
10
10
|
@url = from_url
|
11
|
-
@doc = Nokogiri::HTML(html)
|
11
|
+
@doc = strip_bad_elements(Nokogiri::HTML(html))
|
12
12
|
@doc.css("br").each { |br| br.replace "\n" }
|
13
|
-
@plain_text =
|
13
|
+
@plain_text = get_plain_text
|
14
14
|
end
|
15
15
|
|
16
16
|
def job
|
@@ -25,13 +25,18 @@ module JobParser
|
|
25
25
|
|
26
26
|
private
|
27
27
|
|
28
|
-
def
|
28
|
+
def strip_bad_elements(doc)
|
29
|
+
blacklist = ['script', 'style', 'button']
|
30
|
+
blacklist.each do |tag|
|
31
|
+
doc.xpath("//#{tag}").remove
|
32
|
+
end
|
33
|
+
doc
|
34
|
+
end
|
35
|
+
|
36
|
+
def get_plain_text
|
29
37
|
doc = @doc.dup
|
30
38
|
blacklist = ['title', 'script', 'style', 'button']
|
31
39
|
nodelist = doc.search('//text()')
|
32
|
-
blacklist.each do |tag|
|
33
|
-
nodelist -= doc.search('//' + tag + '/text()')
|
34
|
-
end
|
35
40
|
nodelist.text
|
36
41
|
end
|
37
42
|
|
data/lib/jobparser/regex.rb
CHANGED
@@ -3,6 +3,7 @@ require 'nokogiri'
|
|
3
3
|
module JobParser
|
4
4
|
SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
|
5
5
|
SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
|
6
|
+
SALARY_UP_TO_REGEX = /(up to)(.+)£([\d,]*)/
|
6
7
|
SALARY_TITLE_REGEX = /salary|\srate/i
|
7
8
|
VACANCY_TITLE_REGEX = /vacancy|job title/i
|
8
9
|
JOB_TITLE_ID_REGEX = /job(.?)title|title/i
|
data/lib/jobparser/scorer.rb
CHANGED
data/lib/jobparser/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|