jobparser 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,8 +11,6 @@ module JobParser
11
11
  }
12
12
  final_salary = nil
13
13
  if salary && !salary.empty?
14
- p "got slaary"
15
- p salary
16
14
  SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
17
15
  final_salary = [match[1].to_i, match[2].to_i]
18
16
  }
@@ -1,45 +1,9 @@
1
1
  # encoding: utf-8
2
- require "nokogiri"
3
2
  module JobParser
4
- class ParseHtml
5
- ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
6
-
7
- attr_reader :doc, :plain_text
8
-
9
- def initialize(html, from_url)
10
- @url = from_url
11
- @doc = strip_bad_elements(Nokogiri::HTML(html))
12
- @doc.css("br").each { |br| br.replace "\n" }
13
- @plain_text = get_plain_text
14
- end
15
-
16
- def job
17
- { :url => @url,
18
- :salary => job_salary,
19
- :title => job_title,
20
- :apply => apply_link,
21
- :salary_string => job_salary_string,
22
- :location => job_location
23
- }
24
- end
3
+ class ParseHtml < Parser
25
4
 
26
5
  private
27
6
 
28
- def strip_bad_elements(doc)
29
- blacklist = ['script', 'style', 'button']
30
- blacklist.each do |tag|
31
- doc.xpath("//#{tag}").remove
32
- end
33
- doc
34
- end
35
-
36
- def get_plain_text
37
- doc = @doc.dup
38
- blacklist = ['title', 'script', 'style', 'button']
39
- nodelist = doc.search('//text()')
40
- nodelist.text
41
- end
42
-
43
7
  def job_location
44
8
  Facets::Location.new(@doc, @url, @plain_text).parse
45
9
  end
@@ -0,0 +1,41 @@
1
+ require "nokogiri"
2
module JobParser
  # Shared base for the job-page parsers.
  #
  # Parses the supplied HTML with Nokogiri, strips elements that should not
  # contribute text, and exposes the cleaned document plus its plain text.
  # `job` assembles the result hash from the facet methods (`job_salary`,
  # `job_title`, `apply_link`, `job_salary_string`, `job_location`); none of
  # those are defined in this class, so they must come from a subclass.
  class Parser
    # Not referenced inside this class.
    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}

    attr_reader :doc, :plain_text

    # html     - markup handed to Nokogiri::HTML.
    # from_url - stored as-is and returned under :url by `job`.
    def initialize(html, from_url)
      @url = from_url
      @doc = strip_bad_elements(Nokogiri::HTML(html))
      @plain_text = get_plain_text
    end

    # Returns a Hash with the keys :url, :salary, :title, :apply,
    # :salary_string and :location.
    def job
      {
        :url => @url,
        :salary => job_salary,
        :title => job_title,
        :apply => apply_link,
        :salary_string => job_salary_string,
        :location => job_location
      }
    end

    private

    # Removes every <script>, <style> and <button> element from +doc+ and
    # replaces each <br> with a newline. Mutates and returns the same +doc+.
    def strip_bad_elements(doc)
      %w[script style button].each do |tag|
        doc.xpath("//#{tag}").remove
      end
      doc.css("br").each { |br| br.replace "\n" }
      doc
    end

    # Returns the concatenated text of every text node in @doc.
    # The search only reads the document, so no defensive copy is made.
    def get_plain_text
      @doc.search('//text()').text
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # encoding: utf-8
2
module JobParser
  # Parser for pages that mark up a job posting with `itemprop` attributes
  # (the schema.org JobPosting item type is what `does_use_schema?` looks for).
  # Each facet is read from the first element carrying the matching itemprop.
  class ParseSchema < Parser
    private

    # Returns [first_capture, second_capture] of SALARY_GROUP_REGEX as
    # integers, or nil when the cleaned salary string does not match.
    def job_salary
      salary = job_salary_string
      SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
        [match[1].to_i, match[2].to_i]
      }
    end

    def job_title
      get_content_at_prop("title")
    end

    def apply_link
      Facets::Apply.new(@doc, @url, @plain_text).parse
    end

    def job_salary_string
      get_content_at_prop("baseSalary")
    end

    # Prefers the addressLocality property and falls back to the whole
    # jobLocation element, because some sites don't mark up the address.
    def job_location
      if is_content_at_prop?("addressLocality")
        get_content_at_prop("addressLocality")
      else
        get_content_at_prop("jobLocation")
      end
    end

    # True when any element declares the schema.org JobPosting item type.
    def does_use_schema?
      @doc.css("*").any? { |elem|
        elem['itemtype'] == "http://schema.org/JobPosting"
      }
    end

    # Returns the whitespace-cleaned content of the first element whose
    # itemprop equals +prop+, or "" when there is no such element.
    def get_content_at_prop(prop)
      elem = find_with_itemprop(prop)
      return "" unless elem

      # Reuse the element already found instead of scanning the document again.
      Cleaner.strip_all_white_space(elem.content)
    end

    # True when an element with itemprop +prop+ exists and has non-blank text.
    # Looks up the +prop+ argument (previously the literal string "prop") and
    # tests the element's text content rather than calling `empty?` on the node.
    def is_content_at_prop?(prop)
      elem = find_with_itemprop(prop)
      !elem.nil? && !elem.content.strip.empty?
    end

    # Returns the first element whose itemprop attribute equals +prop+, or nil.
    def find_with_itemprop(prop)
      @doc.css("*").find { |elem| elem['itemprop'] == prop }
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/jobparser.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  require "jobparser/version"
2
+ require "jobparser/parser"
2
3
  require "jobparser/parsehtml"
4
+ require "jobparser/parseschema"
3
5
  require "jobparser/parseurl"
4
6
  require "jobparser/cleaner"
5
7
  require "jobparser/scorer"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-25 00:00:00.000000000 Z
12
+ date: 2013-07-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -106,6 +106,8 @@ files:
106
106
  - lib/jobparser/facets/salarystring.rb
107
107
  - lib/jobparser/facets/title.rb
108
108
  - lib/jobparser/parsehtml.rb
109
+ - lib/jobparser/parser.rb
110
+ - lib/jobparser/parseschema.rb
109
111
  - lib/jobparser/parseurl.rb
110
112
  - lib/jobparser/regex.rb
111
113
  - lib/jobparser/scorer.rb