jobparser 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jobparser/facets/salary.rb +0 -2
- data/lib/jobparser/parsehtml.rb +1 -37
- data/lib/jobparser/parser.rb +41 -0
- data/lib/jobparser/parseschema.rb +60 -0
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +2 -0
- metadata +4 -2
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -1,45 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require "nokogiri"
|
3
2
|
module JobParser
|
4
|
-
class ParseHtml
|
5
|
-
ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
|
6
|
-
|
7
|
-
attr_reader :doc, :plain_text
|
8
|
-
|
9
|
-
def initialize(html, from_url)
|
10
|
-
@url = from_url
|
11
|
-
@doc = strip_bad_elements(Nokogiri::HTML(html))
|
12
|
-
@doc.css("br").each { |br| br.replace "\n" }
|
13
|
-
@plain_text = get_plain_text
|
14
|
-
end
|
15
|
-
|
16
|
-
def job
|
17
|
-
{ :url => @url,
|
18
|
-
:salary => job_salary,
|
19
|
-
:title => job_title,
|
20
|
-
:apply => apply_link,
|
21
|
-
:salary_string => job_salary_string,
|
22
|
-
:location => job_location
|
23
|
-
}
|
24
|
-
end
|
3
|
+
class ParseHtml < Parser
|
25
4
|
|
26
5
|
private
|
27
6
|
|
28
|
-
def strip_bad_elements(doc)
|
29
|
-
blacklist = ['script', 'style', 'button']
|
30
|
-
blacklist.each do |tag|
|
31
|
-
doc.xpath("//#{tag}").remove
|
32
|
-
end
|
33
|
-
doc
|
34
|
-
end
|
35
|
-
|
36
|
-
def get_plain_text
|
37
|
-
doc = @doc.dup
|
38
|
-
blacklist = ['title', 'script', 'style', 'button']
|
39
|
-
nodelist = doc.search('//text()')
|
40
|
-
nodelist.text
|
41
|
-
end
|
42
|
-
|
43
7
|
def job_location
|
44
8
|
Facets::Location.new(@doc, @url, @plain_text).parse
|
45
9
|
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
module JobParser
  # Common base for the job-page parsers.
  #
  # It turns raw HTML into a cleaned Nokogiri document plus a plain-text
  # rendering, and assembles the final job hash. The individual fields
  # (job_salary, job_title, apply_link, job_salary_string, job_location)
  # are not defined here; #job calls them, so each subclass has to provide
  # them.
  class Parser
    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}

    attr_reader :doc, :plain_text

    # html     - markup of the job page, handed to Nokogiri::HTML.
    # from_url - stored as-is and reported back under :url by #job.
    def initialize(html, from_url)
      @url = from_url
      @doc = strip_bad_elements(Nokogiri::HTML(html))
      @plain_text = get_plain_text
    end

    # Builds the result hash by asking the subclass for every field.
    #
    # Returns a Hash with the keys :url, :salary, :title, :apply,
    # :salary_string and :location.
    def job
      {
        :url => @url,
        :salary => job_salary,
        :title => job_title,
        :apply => apply_link,
        :salary_string => job_salary_string,
        :location => job_location
      }
    end

    private

    # Removes every <script>, <style> and <button> element from +doc+ and
    # swaps each <br> for a newline. The document is modified in place and
    # returned.
    def strip_bad_elements(doc)
      %w[script style button].each do |tag|
        doc.xpath("//#{tag}").remove
      end
      doc.css("br").each { |br| br.replace "\n" }
      doc
    end

    # Concatenates the text of every text node in the cleaned document.
    #
    # The query only reads from @doc, so no copy of the document is made.
    # NOTE(review): the text of <title> is included in the result; an
    # earlier, unused local listed 'title' as something to exclude, so
    # confirm whether that exclusion was intended.
    def get_plain_text
      @doc.search('//text()').text
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module JobParser
  # Parser for pages that carry microdata: every field is read from the
  # first element whose `itemprop` attribute matches the wanted property
  # (the apply link is still delegated to Facets::Apply).
  class ParseSchema < Parser
    private

    # Runs SALARY_GROUP_REGEX against the salary string after everything
    # matching CLEAN_SALARY_REGEX has been removed.
    #
    # Returns a two-element Array holding capture groups 1 and 2 converted
    # with to_i, or nil when the regex does not match.
    def job_salary
      cleaned = job_salary_string.gsub(CLEAN_SALARY_REGEX, "")
      SALARY_GROUP_REGEX.match(cleaned) do |match|
        [match[1].to_i, match[2].to_i]
      end
    end

    # Content of the "title" itemprop, or "" when the page has none.
    def job_title
      get_content_at_prop("title")
    end

    def apply_link
      Facets::Apply.new(@doc, @url, @plain_text).parse
    end

    # Content of the "baseSalary" itemprop, or "" when the page has none.
    def job_salary_string
      get_content_at_prop("baseSalary")
    end

    # Prefers the "addressLocality" itemprop and falls back to
    # "jobLocation", because some sites don't use the address markup
    # properly.
    def job_location
      if is_content_at_prop?("addressLocality")
        get_content_at_prop("addressLocality")
      else
        get_content_at_prop("jobLocation")
      end
    end

    # True when any element declares the schema.org JobPosting itemtype.
    def does_use_schema?
      @doc.css("*").any? do |elem|
        elem['itemtype'] == "http://schema.org/JobPosting"
      end
    end

    # Returns the whitespace-cleaned content of the first element carrying
    # the given itemprop, or "" when no such element exists.
    def get_content_at_prop(prop)
      elem = find_with_itemprop(prop)
      return "" unless elem

      # Reuse the node found above instead of scanning the document again.
      Cleaner.strip_all_white_space(elem.content)
    end

    # True when an element with the given itemprop exists and its content
    # is more than whitespace.
    #
    # The lookup uses the +prop+ argument; it previously searched for the
    # literal string "prop", which made this predicate falsy for every
    # real property. The emptiness test goes through the node's content
    # string; presumably "has some text" is what was meant here - confirm.
    def is_content_at_prop?(prop)
      elem = find_with_itemprop(prop)
      !elem.nil? && !elem.content.strip.empty?
    end

    # First element in the document whose itemprop attribute equals
    # +prop+, or nil when there is none.
    def find_with_itemprop(prop)
      @doc.css("*").find { |elem| elem['itemprop'] == prop }
    end
  end
end
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -106,6 +106,8 @@ files:
|
|
106
106
|
- lib/jobparser/facets/salarystring.rb
|
107
107
|
- lib/jobparser/facets/title.rb
|
108
108
|
- lib/jobparser/parsehtml.rb
|
109
|
+
- lib/jobparser/parser.rb
|
110
|
+
- lib/jobparser/parseschema.rb
|
109
111
|
- lib/jobparser/parseurl.rb
|
110
112
|
- lib/jobparser/regex.rb
|
111
113
|
- lib/jobparser/scorer.rb
|