jobparser 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,8 +11,6 @@ module JobParser
11
11
  }
12
12
  final_salary = nil
13
13
  if salary && !salary.empty?
14
- p "got slaary"
15
- p salary
16
14
  SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
17
15
  final_salary = [match[1].to_i, match[2].to_i]
18
16
  }
@@ -1,45 +1,9 @@
1
1
  # encoding: utf-8
2
- require "nokogiri"
3
2
  module JobParser
4
- class ParseHtml
5
- ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
6
-
7
- attr_reader :doc, :plain_text
8
-
9
- def initialize(html, from_url)
10
- @url = from_url
11
- @doc = strip_bad_elements(Nokogiri::HTML(html))
12
- @doc.css("br").each { |br| br.replace "\n" }
13
- @plain_text = get_plain_text
14
- end
15
-
16
- def job
17
- { :url => @url,
18
- :salary => job_salary,
19
- :title => job_title,
20
- :apply => apply_link,
21
- :salary_string => job_salary_string,
22
- :location => job_location
23
- }
24
- end
3
+ class ParseHtml < Parser
25
4
 
26
5
  private
27
6
 
28
- def strip_bad_elements(doc)
29
- blacklist = ['script', 'style', 'button']
30
- blacklist.each do |tag|
31
- doc.xpath("//#{tag}").remove
32
- end
33
- doc
34
- end
35
-
36
- def get_plain_text
37
- doc = @doc.dup
38
- blacklist = ['title', 'script', 'style', 'button']
39
- nodelist = doc.search('//text()')
40
- nodelist.text
41
- end
42
-
43
7
  def job_location
44
8
  Facets::Location.new(@doc, @url, @plain_text).parse
45
9
  end
@@ -0,0 +1,41 @@
1
+ require "nokogiri"
2
module JobParser
  # Shared base for the job-page parsers.
  #
  # Parses the raw HTML with Nokogiri, removes elements that should not take
  # part in extraction, and caches the document's text. The individual facet
  # readers used by #job (+job_salary+, +job_title+, +apply_link+,
  # +job_salary_string+, +job_location+) are not defined here; subclasses are
  # expected to supply them.
  class Parser
    # Tag names exposed to the rest of the library. Frozen so the shared
    # array cannot be mutated by accident.
    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}.freeze
    attr_reader :doc, :plain_text

    # html     - String of HTML to parse.
    # from_url - the URL the HTML came from; echoed back under :url in #job.
    def initialize(html, from_url)
      @url = from_url
      @doc = strip_bad_elements(Nokogiri::HTML(html))
      @plain_text = get_plain_text
    end

    # Collects every facet into a single Hash with the keys :url, :salary,
    # :title, :apply, :salary_string and :location.
    def job
      { :url => @url,
        :salary => job_salary,
        :title => job_title,
        :apply => apply_link,
        :salary_string => job_salary_string,
        :location => job_location
      }
    end

    private

    # Removes every <script>, <style> and <button> element from +doc+ and
    # swaps each <br> for a newline. Mutates +doc+ in place and returns it.
    def strip_bad_elements(doc)
      %w[script style button].each do |tag|
        doc.xpath("//#{tag}").remove
      end
      doc.css("br").each { |br| br.replace "\n" }
      doc
    end

    # Concatenated text of every text node in the cleaned document.
    # The search only reads the document, so no defensive copy is made.
    def get_plain_text
      @doc.search('//text()').text
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # encoding: utf-8
2
module JobParser
  # Parser that reads job facets from elements carrying +itemprop+
  # attributes (the markup used by http://schema.org/JobPosting pages).
  # Document loading and the public #job method come from Parser.
  class ParseSchema < Parser
    private

    # Salary as a two-element Integer array built from capture groups 1 and 2
    # of SALARY_GROUP_REGEX, or nil when the cleaned salary string does not
    # match. Both regex constants are defined elsewhere in the library.
    def job_salary
      cleaned = job_salary_string.gsub(CLEAN_SALARY_REGEX, "")
      SALARY_GROUP_REGEX.match(cleaned) { |match|
        [match[1].to_i, match[2].to_i]
      }
    end

    # Content of the element marked itemprop="title" ("" when absent).
    def job_title
      get_content_at_prop("title")
    end

    # Delegates to the Apply facet, passing the document, URL and plain text.
    def apply_link
      Facets::Apply.new(@doc, @url, @plain_text).parse
    end

    # Content of the element marked itemprop="baseSalary" ("" when absent).
    def job_salary_string
      get_content_at_prop("baseSalary")
    end

    # Prefers the "addressLocality" property and falls back to the broader
    # "jobLocation" property, because some sites don't use the address
    # properties properly.
    def job_location
      if is_content_at_prop?("addressLocality")
        get_content_at_prop("addressLocality")
      else
        get_content_at_prop("jobLocation")
      end
    end

    # True when any element declares the schema.org JobPosting itemtype.
    # NOTE(review): this is private, so outside callers can only reach it via
    # send - confirm whether it was meant to be public.
    def does_use_schema?
      @doc.css("*").any? { |elem|
        elem['itemtype'] == "http://schema.org/JobPosting"
      }
    end

    # Cleaned content of the first element whose itemprop equals +prop+,
    # or "" when there is no such element.
    def get_content_at_prop(prop)
      elem = find_with_itemprop(prop)
      return "" unless elem

      # Reuse the element found above instead of searching the document again.
      Cleaner.strip_all_white_space(elem.content)
    end

    # True when an element with itemprop == +prop+ exists and its content is
    # not blank. The lookup uses the +prop+ argument (previously the literal
    # string "prop" was searched for, so this was effectively always falsy).
    def is_content_at_prop?(prop)
      elem = find_with_itemprop(prop)
      !elem.nil? && !elem.content.to_s.strip.empty?
    end

    # First element in the document whose itemprop attribute equals +prop+,
    # or nil when none does.
    def find_with_itemprop(prop)
      @doc.css("*").find { |elem| elem['itemprop'] == prop }
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/jobparser.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  require "jobparser/version"
2
+ require "jobparser/parser"
2
3
  require "jobparser/parsehtml"
4
+ require "jobparser/parseschema"
3
5
  require "jobparser/parseurl"
4
6
  require "jobparser/cleaner"
5
7
  require "jobparser/scorer"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-25 00:00:00.000000000 Z
12
+ date: 2013-07-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -106,6 +106,8 @@ files:
106
106
  - lib/jobparser/facets/salarystring.rb
107
107
  - lib/jobparser/facets/title.rb
108
108
  - lib/jobparser/parsehtml.rb
109
+ - lib/jobparser/parser.rb
110
+ - lib/jobparser/parseschema.rb
109
111
  - lib/jobparser/parseurl.rb
110
112
  - lib/jobparser/regex.rb
111
113
  - lib/jobparser/scorer.rb