jobparser 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/jobparser/facets/salary.rb +0 -2
- data/lib/jobparser/parsehtml.rb +1 -37
- data/lib/jobparser/parser.rb +41 -0
- data/lib/jobparser/parseschema.rb +60 -0
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +2 -0
- metadata +4 -2
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -1,45 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require "nokogiri"
|
3
2
|
module JobParser
|
4
|
-
class ParseHtml
|
5
|
-
ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
|
6
|
-
|
7
|
-
attr_reader :doc, :plain_text
|
8
|
-
|
9
|
-
def initialize(html, from_url)
|
10
|
-
@url = from_url
|
11
|
-
@doc = strip_bad_elements(Nokogiri::HTML(html))
|
12
|
-
@doc.css("br").each { |br| br.replace "\n" }
|
13
|
-
@plain_text = get_plain_text
|
14
|
-
end
|
15
|
-
|
16
|
-
def job
|
17
|
-
{ :url => @url,
|
18
|
-
:salary => job_salary,
|
19
|
-
:title => job_title,
|
20
|
-
:apply => apply_link,
|
21
|
-
:salary_string => job_salary_string,
|
22
|
-
:location => job_location
|
23
|
-
}
|
24
|
-
end
|
3
|
+
class ParseHtml < Parser
|
25
4
|
|
26
5
|
private
|
27
6
|
|
28
|
-
def strip_bad_elements(doc)
|
29
|
-
blacklist = ['script', 'style', 'button']
|
30
|
-
blacklist.each do |tag|
|
31
|
-
doc.xpath("//#{tag}").remove
|
32
|
-
end
|
33
|
-
doc
|
34
|
-
end
|
35
|
-
|
36
|
-
def get_plain_text
|
37
|
-
doc = @doc.dup
|
38
|
-
blacklist = ['title', 'script', 'style', 'button']
|
39
|
-
nodelist = doc.search('//text()')
|
40
|
-
nodelist.text
|
41
|
-
end
|
42
|
-
|
43
7
|
def job_location
|
44
8
|
Facets::Location.new(@doc, @url, @plain_text).parse
|
45
9
|
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
module JobParser
  # Shared base for the job-page parsers (ParseHtml and ParseSchema inherit
  # from it). It parses the raw HTML once, strips noise elements, caches the
  # plain text of the page, and assembles the result hash in #job.
  #
  # Subclasses are expected to provide the private facet methods that #job
  # calls: job_salary, job_title, apply_link, job_salary_string and
  # job_location. This class does not define them itself.
  class Parser
    # Frozen so the shared list cannot be mutated by accident.
    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}.freeze

    attr_reader :doc, :plain_text

    # html     - the page markup, handed to Nokogiri::HTML.
    # from_url - the URL the page came from; echoed back as :url in #job.
    def initialize(html, from_url)
      @url = from_url
      @doc = strip_bad_elements(Nokogiri::HTML(html))
      @plain_text = get_plain_text
    end

    # Returns a Hash describing the job, with the keys :url, :salary, :title,
    # :apply, :salary_string and :location. Every value except :url comes from
    # a facet method implemented by the subclass.
    def job
      {
        :url => @url,
        :salary => job_salary,
        :title => job_title,
        :apply => apply_link,
        :salary_string => job_salary_string,
        :location => job_location
      }
    end

    private

    # Removes <script>, <style> and <button> elements from the document and
    # replaces every <br> with a newline. Mutates and returns the same doc.
    def strip_bad_elements(doc)
      %w[script style button].each do |tag|
        doc.xpath("//#{tag}").remove
      end
      doc.css("br").each { |br| br.replace "\n" }
      doc
    end

    # Returns the concatenated content of every text node in the document.
    #
    # The previous version built a second blacklist array here (including
    # 'title') but never applied it, and duplicated the document without
    # modifying the copy. Both were dead code and are dropped; the returned
    # text is unchanged.
    def get_plain_text
      @doc.search('//text()').text
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module JobParser
  # Parser for pages that mark up the job with schema.org microdata. Each
  # facet is read from the first element whose `itemprop` attribute equals
  # the relevant property name.
  class ParseSchema < Parser
    private

    # Returns a two-element Array of Integers taken from capture groups 1 and
    # 2 of SALARY_GROUP_REGEX (presumably the lower and upper bound -- confirm
    # against the regex definition), or nil when the cleaned salary string
    # does not match.
    def job_salary
      cleaned = job_salary_string.gsub(CLEAN_SALARY_REGEX, "")
      SALARY_GROUP_REGEX.match(cleaned) do |match|
        [match[1].to_i, match[2].to_i]
      end
    end

    # Returns the cleaned content of the "title" itemprop, or "" if absent.
    def job_title
      get_content_at_prop("title")
    end

    # The apply link is not read from microdata; it is delegated to the
    # generic Facets::Apply facet.
    def apply_link
      Facets::Apply.new(@doc, @url, @plain_text).parse
    end

    # Returns the cleaned content of the "baseSalary" itemprop, or "" if
    # absent.
    def job_salary_string
      get_content_at_prop("baseSalary")
    end

    # Prefers the "addressLocality" itemprop and falls back to "jobLocation".
    def job_location
      # some sites don't use the address stuff properly
      if is_content_at_prop?("addressLocality")
        get_content_at_prop("addressLocality")
      else
        get_content_at_prop("jobLocation")
      end
    end

    # True when any element declares the schema.org JobPosting itemtype.
    def does_use_schema?
      @doc.css("*").any? do |elem|
        elem['itemtype'] == "http://schema.org/JobPosting"
      end
    end

    # Returns the whitespace-cleaned content of the first element carrying
    # the given itemprop, or "" when no such element exists.
    def get_content_at_prop(prop)
      elem = find_with_itemprop(prop)
      return "" unless elem

      # Reuse the element already found instead of searching the DOM again.
      Cleaner.strip_all_white_space(elem.content)
    end

    # True when an element with the given itemprop exists and has
    # non-whitespace content.
    #
    # Fixes two defects: the lookup used the literal string "prop" instead of
    # the argument, so it never matched a real property; and it called
    # `empty?` on the node -- Nokogiri nodes appear not to define it
    # (NOTE(review): confirm against the Nokogiri docs) -- so the element's
    # text content is checked instead.
    def is_content_at_prop?(prop)
      elem = find_with_itemprop(prop)
      !elem.nil? && !elem.content.strip.empty?
    end

    # Returns the first element whose itemprop attribute equals `prop`
    # exactly, or nil. `find` stops at the first match.
    def find_with_itemprop(prop)
      @doc.css("*").find { |elem| elem['itemprop'] == prop }
    end
  end
end
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -106,6 +106,8 @@ files:
|
|
106
106
|
- lib/jobparser/facets/salarystring.rb
|
107
107
|
- lib/jobparser/facets/title.rb
|
108
108
|
- lib/jobparser/parsehtml.rb
|
109
|
+
- lib/jobparser/parser.rb
|
110
|
+
- lib/jobparser/parseschema.rb
|
109
111
|
- lib/jobparser/parseurl.rb
|
110
112
|
- lib/jobparser/regex.rb
|
111
113
|
- lib/jobparser/scorer.rb
|