jobparser 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,14 +6,23 @@ module JobParser
|
|
6
6
|
return special_case_result unless special_case_result.nil?
|
7
7
|
|
8
8
|
salary = ""
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
SALARY_REGEX.match(@plain_text) { |m|
|
10
|
+
salary = m.to_s
|
11
|
+
}
|
12
|
+
final_salary = nil
|
13
|
+
if salary && !salary.empty?
|
14
|
+
p "got slaary"
|
15
|
+
p salary
|
16
|
+
SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
|
17
|
+
final_salary = [match[1].to_i, match[2].to_i]
|
18
|
+
}
|
19
|
+
else
|
20
|
+
SALARY_UP_TO_REGEX.match(@plain_text) { |m|
|
21
|
+
final_salary = [nil, m[3].gsub(",","").to_i]
|
12
22
|
}
|
13
23
|
end
|
14
|
-
|
15
|
-
|
16
|
-
} || nil
|
24
|
+
|
25
|
+
final_salary
|
17
26
|
end
|
18
27
|
end
|
19
28
|
end
|
@@ -6,11 +6,12 @@ module JobParser
|
|
6
6
|
return special_case_result unless special_case_result.nil?
|
7
7
|
|
8
8
|
salary = ""
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
}
|
9
|
+
SALARY_STRING_REGEX.match(@plain_text) { |m| salary = m.to_s }
|
10
|
+
|
11
|
+
if salary.empty?
|
12
|
+
SALARY_UP_TO_REGEX.match(@plain_text) { |m| salary = m.to_s }
|
13
13
|
end
|
14
|
+
|
14
15
|
Cleaner.strip_all_white_space(salary)
|
15
16
|
end
|
16
17
|
end
|
@@ -7,13 +7,15 @@ module JobParser
|
|
7
7
|
|
8
8
|
title_scorer = Scorer.new
|
9
9
|
page_title = @doc.at_css("title").content
|
10
|
-
title_scorer.
|
10
|
+
title_scorer.store_and_score(page_title, 20)
|
11
11
|
|
12
12
|
# first see if we find something with a matching id
|
13
13
|
loop_over_elements do |name, elem|
|
14
|
-
# check the ID of the elements for matches
|
15
14
|
next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
|
15
|
+
|
16
16
|
content = Cleaner.strip_all_white_space(elem.content)
|
17
|
+
|
18
|
+
# does the element have an id that means it might store the title?
|
17
19
|
title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
|
18
20
|
|
19
21
|
# or if a heading element matches the page title
|
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -4,13 +4,13 @@ module JobParser
|
|
4
4
|
class ParseHtml
|
5
5
|
ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
|
6
6
|
|
7
|
-
attr_reader :doc
|
7
|
+
attr_reader :doc, :plain_text
|
8
8
|
|
9
9
|
def initialize(html, from_url)
|
10
10
|
@url = from_url
|
11
|
-
@doc = Nokogiri::HTML(html)
|
11
|
+
@doc = strip_bad_elements(Nokogiri::HTML(html))
|
12
12
|
@doc.css("br").each { |br| br.replace "\n" }
|
13
|
-
@plain_text =
|
13
|
+
@plain_text = get_plain_text
|
14
14
|
end
|
15
15
|
|
16
16
|
def job
|
@@ -25,13 +25,18 @@ module JobParser
|
|
25
25
|
|
26
26
|
private
|
27
27
|
|
28
|
-
def
|
28
|
+
def strip_bad_elements(doc)
|
29
|
+
blacklist = ['script', 'style', 'button']
|
30
|
+
blacklist.each do |tag|
|
31
|
+
doc.xpath("//#{tag}").remove
|
32
|
+
end
|
33
|
+
doc
|
34
|
+
end
|
35
|
+
|
36
|
+
def get_plain_text
|
29
37
|
doc = @doc.dup
|
30
38
|
blacklist = ['title', 'script', 'style', 'button']
|
31
39
|
nodelist = doc.search('//text()')
|
32
|
-
blacklist.each do |tag|
|
33
|
-
nodelist -= doc.search('//' + tag + '/text()')
|
34
|
-
end
|
35
40
|
nodelist.text
|
36
41
|
end
|
37
42
|
|
data/lib/jobparser/regex.rb
CHANGED
@@ -3,6 +3,7 @@ require 'nokogiri'
|
|
3
3
|
module JobParser
|
4
4
|
SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
|
5
5
|
SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
|
6
|
+
SALARY_UP_TO_REGEX = /(up to)(.+)£([\d,]*)/
|
6
7
|
SALARY_TITLE_REGEX = /salary|\srate/i
|
7
8
|
VACANCY_TITLE_REGEX = /vacancy|job title/i
|
8
9
|
JOB_TITLE_ID_REGEX = /job(.?)title|title/i
|
data/lib/jobparser/scorer.rb
CHANGED
data/lib/jobparser/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|