RubyGems - jobparser - Versions diffs - 0.1.1 → 0.2.0 - Mend

jobparser 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

data/lib/jobparser/cleaner.rb +6 -37
data/lib/jobparser/facets/apply.rb +18 -0
data/lib/jobparser/facets/facet.rb +41 -0
data/lib/jobparser/facets/location.rb +15 -0
data/lib/jobparser/facets/salary.rb +20 -0
data/lib/jobparser/facets/salarystring.rb +18 -0
data/lib/jobparser/facets/title.rb +46 -0
data/lib/jobparser/parsehtml.rb +6 -111
data/lib/jobparser/regex.rb +2 -0
data/lib/jobparser/specialcases.rb +1 -1
data/lib/jobparser/version.rb +1 -1
data/lib/jobparser.rb +6 -0
metadata +8 -2

data/lib/jobparser/cleaner.rb CHANGED Viewed

@@ -5,30 +5,16 @@ require "jobparser/regex"
 module JobParser
   class Cleaner
-    SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
-    CLEAN_SALARY_REGEX = /,|\s/
-    def initialize(ary, opts = {})
-      @subject = ary
-      @type = opts[:type]
-    end
-    def clean
-      clean_array
+    # Removes newlines, non-breaking spaces, carriage returns and tabs from
+    # str, then trims leading and trailing whitespace.
+    #
+    # str - the String to clean.
+    #
+    # Returns a new String.
+    def self.strip_all_white_space(str)
+      # The newline must be written in double quotes: in single quotes '\n' is
+      # a literal backslash followed by "n", so nothing would be removed.
+      Cleaner.clean_text(Cleaner.remove_nbsp(str.gsub("\n", ""))).strip
     end
-    def self.clean_salary(salary_str)
-      SALARY_GROUP_REGEX.match(salary_str.gsub(CLEAN_SALARY_REGEX, "")) { |match|
-        [match[1].to_i, match[2].to_i]
-      }
-    end
-    def self.strip_string(str)
-      str.gsub('/n', '').gsub(NBSP, '').strip
+    def self.clean_text(str)
+      str.gsub(/\r|\t/, "").gsub(NBSP, " ")
     end
-    def self.clean_plain_text(str)
-      str.gsub(/\r|\t/, "").gsub(NBSP, " ")
+    def self.remove_nbsp(str)
+      str.gsub(NBSP, "")
     end
     def self.make_link_absolute(url, href)
@@ -45,22 +31,5 @@ module JobParser
       end
     end
-    private
-    def clean_array
-      @subject.select { |item|
-        not_whitespace_or_empty(item)
-      }.map { |item|
-        clean_string(item) if item.is_a?(String)
-      }.uniq
-    end
-    def clean_string(str)
-      self.class.strip_string(str)
-    end
-    def not_whitespace_or_empty(item)
-      /^\s+$/.match(item) == nil && !item.empty?
-    end
   end
 end

data/lib/jobparser/facets/apply.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module JobParser
+  module Facets
+    # Facet that finds the link a candidate should follow to apply for the job.
+    class Apply < Facet
+      # Looks at every anchor in the document and keeps the last one whose
+      # text matches APPLY_LINK_REGEX.
+      #
+      # Returns the result of Cleaner.make_link_absolute for that anchor's
+      # href (with spaces escaped as %20), or the page URL when no anchor
+      # matches.
+      def parse
+        apply_anchor = elements["a"].select { |anchor|
+          APPLY_LINK_REGEX.match(anchor.content)
+        }.last
+        return @url unless apply_anchor
+        href = apply_anchor.attributes["href"].to_s.gsub(" ", "%20")
+        Cleaner.make_link_absolute(@url, href)
+      end
+    end
+  end
+end

data/lib/jobparser/facets/facet.rb ADDED Viewed

@@ -0,0 +1,41 @@
+module JobParser
+  module Facets
+    # Base class for the individual facets (title, salary, location, ...)
+    # extracted from a job listing page. Subclasses implement #parse and use
+    # the private helpers defined here.
+    class Facet
+      # Tag names whose elements are inspected when scanning the document.
+      ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}.freeze
+      attr_reader :doc, :url, :plain_text
+      # doc        - the parsed document (must respond to #css).
+      # url        - the URL the document was fetched from.
+      # plain_text - the text of the page as prepared by the caller.
+      def initialize(doc, url, plain_text)
+        @doc = doc
+        @url = url
+        @plain_text = plain_text
+      end
+      private
+      # Runs the site-specific handler for the facet +name+, if the page URL
+      # has a special case registered.
+      #
+      # Returns the handler's result, or nil when there is no special case for
+      # the URL or the special case defines no handler for +name+.
+      def use_special_case(name)
+        special_case = SpecialCases.case_for_url(@url)
+        return nil unless special_case
+        # A special case may not cover every facet; treat a missing handler as
+        # "no special case" instead of calling nil.
+        handler = special_case[name]
+        handler.call(@doc) if handler
+      end
+      # Yields the tag name and element for every accepted element in the
+      # document, grouped by tag in ACCEPTED_ELEMENTS order.
+      def loop_over_elements(&block)
+        elements.each do |name, elems|
+          elems.each do |elem|
+            yield name, elem
+          end
+        end
+      end
+      # Returns a Hash mapping each accepted tag name to an Array of the
+      # matching elements. The document is queried again on every call.
+      def elements
+        ACCEPTED_ELEMENTS.each_with_object({}) do |tag, found|
+          found[tag] = @doc.css(tag).to_a
+        end
+      end
+    end
+  end
+end

data/lib/jobparser/facets/location.rb ADDED Viewed

@@ -0,0 +1,15 @@
+module JobParser
+  module Facets
+    # Facet that extracts the job location from the page text.
+    class Location < Facet
+      # Returns the site-specific result when one exists; otherwise the first
+      # capture of LOCATION_REGEX in the cleaned plain text with whitespace
+      # stripped, or "" when the pattern does not match.
+      def parse
+        special_case_result = use_special_case(:location)
+        return special_case_result unless special_case_result.nil?
+        found = LOCATION_REGEX.match(Cleaner.clean_text(@plain_text))
+        return "" unless found
+        Cleaner.strip_all_white_space(found[1].to_s)
+      end
+    end
+  end
+end

data/lib/jobparser/facets/salary.rb ADDED Viewed

@@ -0,0 +1,20 @@
+module JobParser
+  module Facets
+    # Facet that extracts a numeric salary range from the page text.
+    class Salary < Facet
+      # Returns the site-specific result when one exists; otherwise a
+      # two-element Array of Integers [first, second] taken from the two
+      # "£" amounts in the SALARY_REGEX match, or nil when no range is found.
+      def parse
+        special_case_result = use_special_case(:salary)
+        return special_case_result unless special_case_result.nil?
+        # The pattern is matched against the whole plain text, so one match is
+        # enough; iterating over the document's elements only repeated it.
+        salary = SALARY_REGEX.match(@plain_text).to_s
+        # Commas and whitespace are removed before the amounts are captured.
+        SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
+          [match[1].to_i, match[2].to_i]
+        }
+      end
+    end
+  end
+end

data/lib/jobparser/facets/salarystring.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module JobParser
+  module Facets
+    # Facet that extracts the salary as it is written on the page.
+    class SalaryString < Facet
+      # Returns the site-specific result when one exists; otherwise the text
+      # matched by SALARY_STRING_REGEX with whitespace stripped, or "" when
+      # the pattern does not match.
+      def parse
+        special_case_result = use_special_case(:salary_string)
+        return special_case_result unless special_case_result.nil?
+        # The pattern is matched against the whole plain text, so one match is
+        # enough; iterating over the document's elements only repeated it.
+        salary = SALARY_STRING_REGEX.match(@plain_text).to_s
+        Cleaner.strip_all_white_space(salary)
+      end
+    end
+  end
+end

data/lib/jobparser/facets/title.rb ADDED Viewed

@@ -0,0 +1,46 @@
+module JobParser
+  module Facets
+    # Facet that picks the most likely job title by scoring candidate strings
+    # found in the document.
+    class Title < Facet
+      # Returns the site-specific result when one exists; otherwise the
+      # scorer's top match with surrounding whitespace and non-breaking spaces
+      # removed.
+      #
+      # Candidates and their weights:
+      #   20 - the page <title>
+      #   60 - an element whose id matches JOB_TITLE_ID_REGEX
+      #   40 - a heading whose text is contained in the page title
+      #   20 - text matching JOB_TITLE_WORDS
+      #   30 - the element following one that matches VACANCY_TITLE_REGEX
+      def parse
+        special_case_result = use_special_case(:title)
+        return special_case_result unless special_case_result.nil?
+        title_scorer = Scorer.new
+        # at_css returns nil when the page has no <title>; fall back to an
+        # empty string rather than calling #content on nil.
+        title_node = @doc.at_css("title")
+        page_title = title_node ? title_node.content : ""
+        title_scorer.store(page_title, 20).and_score_now
+        loop_over_elements do |name, elem|
+          raw_content = elem.content
+          # skip blank elements and anything longer than ten words
+          next if raw_content.strip.empty? || raw_content.split(" ").length > 10
+          content = Cleaner.strip_all_white_space(raw_content)
+          # check the ID of the element for matches
+          title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
+          # or if a heading element matches the page title
+          if elem_is_heading?(name)
+            title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
+          end
+          title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
+          # a label such as "Vacancy title" is usually followed by the title itself
+          if VACANCY_TITLE_REGEX.match(content) && elem.next_element
+            next_content = Cleaner.strip_all_white_space(elem.next_element.content)
+            unless next_content.empty?
+              title_scorer.store(next_content, 30).if_block_true {
+                ACCEPTED_ELEMENTS.include?(elem.next_element.name)
+              }
+            end
+          end
+        end
+        title_scorer.top_match.strip.gsub(NBSP, "")
+      end
+      private
+      # True when +name+ is one of the heading tags h1-h5.
+      def elem_is_heading?(name)
+        %w{h1 h2 h3 h4 h5}.include?(name)
+      end
+    end
+  end
+end

data/lib/jobparser/parsehtml.rb CHANGED Viewed

@@ -1,5 +1,4 @@
 # encoding: utf-8
-require 'jobparser/regex.rb'
 require "nokogiri"
 module JobParser
   class ParseHtml
@@ -26,15 +25,6 @@ module JobParser
     private
-    def job_location
-      special_case_result = use_special_case(:location)
-      return special_case_result unless special_case_result.nil?
-      LOCATION_REGEX.match(Cleaner.clean_plain_text(@plain_text)) { |m|
-        Cleaner.strip_string(m[1].to_s)
-      } || ""
-    end
     def strip_html
       doc = @doc.dup
       blacklist = ['title', 'script', 'style', 'button']
@@ -45,119 +35,24 @@ module JobParser
       nodelist.text
     end
-    def loop_over_elements(&block)
-      elements.each do |name, elems|
-        elems.each do |elem|
-          yield name, elem
-        end
-      end
-    end
-    def clean_array(ary, type = nil)
-      Cleaner.new(ary, :type => type).clean
-    end
-    def use_special_case(name)
-      if special_case = SpecialCases.case_for_url(@url)
-        special_case[name].call(@doc)
-      end
+    def job_location
+      Facets::Location.new(@doc, @url, @plain_text).parse
     end
     def job_salary_string
-      special_case_result = use_special_case(:salary_string)
-      return special_case_result unless special_case_result.nil?
-      salary = ""
-      loop_over_elements do |name, elem|
-        SALARY_STRING_REGEX.match(@plain_text) { |m|
-          salary = m.to_s
-        }
-      end
-      Cleaner.strip_string(salary)
+      Facets::SalaryString.new(@doc, @url, @plain_text).parse
     end
     def job_salary
-      special_case_result = use_special_case(:salary)
-      return special_case_result unless special_case_result.nil?
-      salary = ""
-      loop_over_elements do |name, elem|
-        SALARY_REGEX.match(@plain_text) { |m|
-          salary = m.to_s
-        }
-      end
-      salary.empty? ? nil : Cleaner.clean_salary(salary)
+      Facets::Salary.new(@doc, @url, @plain_text).parse
     end
     def job_title
-      special_case_result = use_special_case(:title)
-      return special_case_result unless special_case_result.nil?
-      title_scorer = Scorer.new
-      page_title = @doc.at_css("title").content
-      title_scorer.store(page_title, 20).and_score_now
-      # http://stackoverflow.com/questions/4476047/how-to-make-nokogiri-not-to-convert-nbsp-to-space
-      nbsp = Nokogiri::HTML("&nbsp;").text
-      # first see if we find something with a matching id
-      loop_over_elements do |name, elem|
-        # check the ID of the elements for matches
-        next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
-        content = Cleaner.strip_string(elem.content)
-        title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
-        # or if a heading element matches the page title
-        if elem_is_heading?(name)
-          title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
-        end
-        title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
-        VACANCY_TITLE_REGEX.match(content) {
-          if elem.next_element && !Cleaner.strip_string(elem.next_element.content).empty?
-            next_content = Cleaner.strip_string(elem.next_element.content)
-            title_scorer.store(next_content, 30).if_block_true {
-              ACCEPTED_ELEMENTS.include?(elem.next_element.name)
-            }
-          end
-        }
-      end
-      title_scorer.top_match.strip.gsub(nbsp, "")
+      # Title extraction lives in Facets::Title; pass the same document, URL
+      # and plain text that the other facets receive.
+      Facets::Title.new(@doc, @url, @plain_text).parse
     end
     def apply_link
-      link = nil
-      anchor_elements.each do |anchor|
-        APPLY_LINK_REGEX.match(anchor.content) { link = anchor }
-      end
-      if link
-        Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
-      else
-        @url
-      end
-    end
-    def elem_is_heading?(name)
-      %w{h1 h2 h3 h4 h5}.include?(name)
-    end
-    def heading_elements
-      elements.select { |elem| elem_is_heading?(elem) }
-    end
-    def anchor_elements
-      elements["a"]
-    end
-    def elements
-      {}.tap do |response|
-        ACCEPTED_ELEMENTS.each do |elem|
-          response[elem] = doc.css(elem).to_a
-        end
-      end
+      Facets::Apply.new(@doc, @url, @plain_text).parse
     end
   end
 end

data/lib/jobparser/regex.rb CHANGED Viewed

@@ -9,6 +9,8 @@ module JobParser
   APPLY_LINK_REGEX = /^apply|submit an application|application form/i
   NBSP = Nokogiri::HTML("&nbsp;").text
   LOCATION_REGEX = /(?:location: )([\D]*)$/i
+  SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
+  CLEAN_SALARY_REGEX = /,|\s/
   # words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
   # could scope this regex just to headers

data/lib/jobparser/specialcases.rb CHANGED Viewed

@@ -21,7 +21,7 @@ module JobParser
             listings = doc.css(".jobViewSummary dl dt")
             listings.each do |dt|
               if dt.content == "Salary"
-                salary = Cleaner.strip_string(dt.next_element.content)
+                salary = Cleaner.remove_nbsp(dt.next_element.content)
                 break
               end
             end

data/lib/jobparser/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module JobParser
-  VERSION = "0.1.1"
+  VERSION = "0.2.0"
 end

data/lib/jobparser.rb CHANGED Viewed

@@ -4,6 +4,12 @@ require "jobparser/parseurl"
 require "jobparser/cleaner"
 require "jobparser/scorer"
 require "jobparser/specialcases"
+require "jobparser/facets/facet"
+require "jobparser/facets/salary"
+require "jobparser/facets/salarystring"
+require "jobparser/facets/location"
+require "jobparser/facets/apply"
+require "jobparser/facets/title"
 module JobParser
   # Your code goes here...

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jobparser
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-23 00:00:00.000000000 Z
+date: 2013-07-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -99,6 +99,12 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/jobparser/cleaner.rb
+- lib/jobparser/facets/apply.rb
+- lib/jobparser/facets/facet.rb
+- lib/jobparser/facets/location.rb
+- lib/jobparser/facets/salary.rb
+- lib/jobparser/facets/salarystring.rb
+- lib/jobparser/facets/title.rb
 - lib/jobparser/parsehtml.rb
 - lib/jobparser/parseurl.rb
 - lib/jobparser/regex.rb