RubyGems - jobparser - Versions diffs - 0.3.0 → 0.4.0 - Mend

jobparser 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/lib/jobparser/facets/salary.rb +0 -2
data/lib/jobparser/parsehtml.rb +1 -37
data/lib/jobparser/parser.rb +41 -0
data/lib/jobparser/parseschema.rb +60 -0
data/lib/jobparser/version.rb +1 -1
data/lib/jobparser.rb +2 -0
metadata +4 -2

data/lib/jobparser/facets/salary.rb CHANGED Viewed

@@ -11,8 +11,6 @@ module JobParser
         }
         final_salary = nil
         if salary && !salary.empty?
-          p "got slaary"
-          p salary
           SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
             final_salary = [match[1].to_i, match[2].to_i]
           }

data/lib/jobparser/parsehtml.rb CHANGED Viewed

@@ -1,45 +1,9 @@
 # encoding: utf-8
-require "nokogiri"
 module JobParser
-  class ParseHtml
-    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
-    attr_reader :doc, :plain_text
-    def initialize(html, from_url)
-      @url = from_url
-      @doc = strip_bad_elements(Nokogiri::HTML(html))
-      @doc.css("br").each { |br| br.replace "\n" }
-      @plain_text = get_plain_text
-    end
-    def job
-      { :url => @url,
-        :salary => job_salary,
-        :title => job_title,
-        :apply => apply_link,
-        :salary_string => job_salary_string,
-        :location => job_location
-      }
-    end
+  class ParseHtml < Parser
     private
-    def strip_bad_elements(doc)
-      blacklist = ['script', 'style', 'button']
-      blacklist.each do |tag|
-        doc.xpath("//#{tag}").remove
-      end
-      doc
-    end
-    def get_plain_text
-      doc = @doc.dup
-      blacklist = ['title', 'script', 'style', 'button']
-      nodelist = doc.search('//text()')
-      nodelist.text
-    end
     def job_location
       Facets::Location.new(@doc, @url, @plain_text).parse
     end

data/lib/jobparser/parser.rb ADDED Viewed

@@ -0,0 +1,41 @@
+require "nokogiri"
+module JobParser
+  class Parser
+    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
+    attr_reader :doc, :plain_text
+    def initialize(html, from_url)
+      @url = from_url
+      @doc = strip_bad_elements(Nokogiri::HTML(html))
+      @plain_text = get_plain_text
+    end
+    def job
+      { :url => @url,
+        :salary => job_salary,
+        :title => job_title,
+        :apply => apply_link,
+        :salary_string => job_salary_string,
+        :location => job_location
+      }
+    end
+    private
+    def strip_bad_elements(doc)
+      blacklist = ['script', 'style', 'button']
+      blacklist.each do |tag|
+        doc.xpath("//#{tag}").remove
+      end
+      doc.css("br").each { |br| br.replace "\n" }
+      doc
+    end
+    def get_plain_text
+      doc = @doc.dup
+      blacklist = ['title', 'script', 'style', 'button']
+      nodelist = doc.search('//text()')
+      nodelist.text
+    end
+  end
+end

data/lib/jobparser/parseschema.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# encoding: utf-8
+module JobParser
+  class ParseSchema < Parser
+    private
+    def job_salary
+      salary = job_salary_string
+      SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
+        [match[1].to_i, match[2].to_i]
+      }
+    end
+    def job_title
+      get_content_at_prop("title")
+    end
+    def apply_link
+      Facets::Apply.new(@doc, @url, @plain_text).parse
+    end
+    def job_salary_string
+      get_content_at_prop("baseSalary")
+    end
+    def job_location
+      # some sites don't use the address stuff properly
+      if is_content_at_prop?("addressLocality")
+        get_content_at_prop("addressLocality")
+      else
+        get_content_at_prop("jobLocation")
+      end
+    end
+    def does_use_schema?
+      @doc.css("*").any? { |elem|
+        elem['itemtype'] == "http://schema.org/JobPosting"
+      }
+    end
+    def get_content_at_prop(prop)
+      elem = find_with_itemprop(prop)
+      if elem
+        Cleaner.strip_all_white_space(find_with_itemprop(prop).content)
+      else
+        ""
+      end
+    end
+    def is_content_at_prop?(prop)
+      elem = find_with_itemprop("prop")
+      elem && !elem.empty?
+    end
+    def find_with_itemprop(prop)
+      @doc.css("*").select { |elem|
+        elem['itemprop'] == prop
+      }.first
+    end
+  end
+end

data/lib/jobparser/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module JobParser
-  VERSION = "0.3.0"
+  VERSION = "0.4.0"
 end

data/lib/jobparser.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 require "jobparser/version"
+require "jobparser/parser"
 require "jobparser/parsehtml"
+require "jobparser/parseschema"
 require "jobparser/parseurl"
 require "jobparser/cleaner"
 require "jobparser/scorer"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jobparser
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-25 00:00:00.000000000 Z
+date: 2013-07-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -106,6 +106,8 @@ files:
 - lib/jobparser/facets/salarystring.rb
 - lib/jobparser/facets/title.rb
 - lib/jobparser/parsehtml.rb
+- lib/jobparser/parser.rb
+- lib/jobparser/parseschema.rb
 - lib/jobparser/parseurl.rb
 - lib/jobparser/regex.rb
 - lib/jobparser/scorer.rb