RubyGems - jobparser - Versions diffs - 0.13.9 → 0.13.10 - Mend

jobparser 0.13.9 → 0.13.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/lib/jobparser/facets/title.rb +51 -18
data/lib/jobparser/parser.rb +11 -11
data/lib/jobparser/parseschema.rb +1 -5
data/lib/jobparser/version.rb +1 -1
data/lib/jobparser.rb +1 -1
metadata +2 -2

data/lib/jobparser/facets/title.rb CHANGED Viewed

@@ -1,45 +1,78 @@
 module JobParser
   module Facets
     class Title < Facet
+      def initialize(*args)
+        super(*args)
+        @scorer = Scorer.new
+      end
       def parse
         special_case_result = use_special_case(:title)
         return special_case_result unless special_case_result.nil?
-        title_scorer = Scorer.new
-        page_title = @doc.at_css("title").content
-        title_scorer.store_and_score(page_title, 10)
+        score_page_title
         # first see if we find something with a matching id
         loop_over_elements do |name, elem|
-          next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
+          next if elem_not_suitable_as_title?(elem)
           content = Cleaner.strip_all_white_space(elem.content)
           # does the element have an id that means it might store the title?
-          title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
+          elem_has_job_title_id_score(elem, content)
           # or if a heading element matches the page title
-          if elem_is_heading?(name)
-            title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
-          end
+          elem_heading_matches_page_title(elem, content)
-          title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
+          # if it has some common words that feature in job titles
+          elem_matches_job_title_words(content)
-          VACANCY_TITLE_REGEX.match(content) {
-            if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
-              next_content = Cleaner.strip_all_white_space(elem.next_element.content)
-              title_scorer.store(next_content, 30).if_block_true {
-                ACCEPTED_ELEMENTS.include?(elem.next_element.name)
-              }
-            end
-          }
+          # if it's the title, get the content of the next element
+          elem_is_vacancy_title(elem, content)
         end
-        clean_title(title_scorer.top_match.strip.gsub(NBSP, ""))
+        clean_title(@scorer.top_match.strip.gsub(NBSP, ""))
       end
       private
+      def elem_is_vacancy_title(elem, content)
+        VACANCY_TITLE_REGEX.match(content) {
+          if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
+            next_content = Cleaner.strip_all_white_space(elem.next_element.content)
+            @scorer.store(next_content, 30).if_block_true {
+              ACCEPTED_ELEMENTS.include?(elem.next_element.name)
+            }
+          end
+        }
+      end
+      def elem_matches_job_title_words(content)
+        @scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
+      end
+      def elem_heading_matches_page_title(elem, content)
+        if elem_is_heading?(elem.name)
+          @scorer.store(content, 40).if_block_true { page_title.include?(content) }
+        end
+      end
+      def elem_has_job_title_id_score(elem, content)
+        @scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
+      end
+      def page_title
+        @doc.at_css("title").content
+      end
+      def score_page_title
+        @scorer.store_and_score(page_title, 10)
+      end
+      def elem_not_suitable_as_title?(elem)
+        elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
+      end
       def elem_is_heading?(name)
         %w{h1 h2 h3 h4 h5}.include?(name)
       end

data/lib/jobparser/parser.rb CHANGED Viewed

@@ -12,18 +12,18 @@ module JobParser
     def job
       if JobParser.cache.valid_for_url?(@url)
-        return JobParser.cache.fetch_result_for_url(@url)
+        JobParser.cache.fetch_result_for_url(@url)
+      else
+        { :url => @url,
+          :salary => job_salary,
+          :title => job_title,
+          :apply => apply_link,
+          :salary_string => job_salary_string,
+          :location => job_location,
+          :deadline => deadline,
+          :postcode => job_postcode
+        }
       end
-      { :url => @url,
-        :salary => job_salary,
-        :title => job_title,
-        :apply => apply_link,
-        :salary_string => job_salary_string,
-        :location => job_location,
-        :deadline => deadline,
-        :postcode => job_postcode
-      }
     end
     private

data/lib/jobparser/parseschema.rb CHANGED Viewed

@@ -96,11 +96,7 @@ module JobParser
     def get_content_at_prop(prop)
       elem = find_with_itemprop(prop)
-      if elem
-        Cleaner.strip_all_white_space(elem.content)
-      else
-        ""
-      end
+      elem ? Cleaner.strip_all_white_space(elem.content) : ""
     end
     def is_content_at_prop?(prop)

data/lib/jobparser/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module JobParser
-  VERSION = "0.13.9"
+  VERSION = "0.13.10"
 end

data/lib/jobparser.rb CHANGED Viewed

@@ -40,7 +40,7 @@ module JobParser
           ParseHtml.new(html, url)
         end
       rescue URI::InvalidURIError
-        raise JobParser::Error::InvalidUrl, "The URI given was not valid"
+        raise JobParser::Error::InvalidUrl, "The URI given (\"#{url}\") was not valid"
       end
     end
   end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jobparser
 version: !ruby/object:Gem::Version
-  version: 0.13.9
+  version: 0.13.10
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-08-23 00:00:00.000000000 Z
+date: 2013-08-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler