RubyGems - jobparser - Versions diffs - 0.2.0 → 0.3.0 - Mend

jobparser 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

data/lib/jobparser/facets/salary.rb +15 -6
data/lib/jobparser/facets/salarystring.rb +5 -4
data/lib/jobparser/facets/title.rb +4 -2
data/lib/jobparser/parsehtml.rb +12 -7
data/lib/jobparser/regex.rb +1 -0
data/lib/jobparser/scorer.rb +5 -0
data/lib/jobparser/version.rb +1 -1
metadata +2 -2

data/lib/jobparser/facets/salary.rb CHANGED

@@ -6,14 +6,23 @@ module JobParser
         return special_case_result unless special_case_result.nil?
         salary = ""
-        loop_over_elements do |name, elem|
-          SALARY_REGEX.match(@plain_text) { |m|
-            salary = m.to_s
+        SALARY_REGEX.match(@plain_text) { |m|
+          salary = m.to_s
+        }
+        final_salary = nil
+        if salary && !salary.empty?
+          p "got slaary"
+          p salary
+          SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
+            final_salary = [match[1].to_i, match[2].to_i]
+          }
+        else
+          SALARY_UP_TO_REGEX.match(@plain_text) { |m|
+            final_salary = [nil, m[3].gsub(",","").to_i]
           }
         end
-        SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
-          [match[1].to_i, match[2].to_i]
-        } || nil
+        final_salary
       end
     end
   end

data/lib/jobparser/facets/salarystring.rb CHANGED

@@ -6,11 +6,12 @@ module JobParser
         return special_case_result unless special_case_result.nil?
         salary = ""
-        loop_over_elements do |name, elem|
-          SALARY_STRING_REGEX.match(@plain_text) { |m|
-            salary = m.to_s
-          }
+        SALARY_STRING_REGEX.match(@plain_text) { |m| salary = m.to_s }
+        if salary.empty?
+          SALARY_UP_TO_REGEX.match(@plain_text) { |m| salary = m.to_s }
         end
         Cleaner.strip_all_white_space(salary)
       end
     end

data/lib/jobparser/facets/title.rb CHANGED

@@ -7,13 +7,15 @@ module JobParser
         title_scorer = Scorer.new
         page_title = @doc.at_css("title").content
-        title_scorer.store(page_title, 20).and_score_now
+        title_scorer.store_and_score(page_title, 20)
         # first see if we find something with a matching id
         loop_over_elements do |name, elem|
-          # check the ID of the elements for matches
           next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
           content = Cleaner.strip_all_white_space(elem.content)
+          # does the element have an id that means it might store the title?
           title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
           # or if a heading element matches the page title

data/lib/jobparser/parsehtml.rb CHANGED

@@ -4,13 +4,13 @@ module JobParser
   class ParseHtml
     ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
-    attr_reader :doc
+    attr_reader :doc, :plain_text
     def initialize(html, from_url)
       @url = from_url
-      @doc = Nokogiri::HTML(html)
+      @doc = strip_bad_elements(Nokogiri::HTML(html))
       @doc.css("br").each { |br| br.replace "\n" }
-      @plain_text = strip_html
+      @plain_text = get_plain_text
     end
     def job
@@ -25,13 +25,18 @@ module JobParser
     private
-    def strip_html
+    def strip_bad_elements(doc)
+      blacklist = ['script', 'style', 'button']
+      blacklist.each do |tag|
+        doc.xpath("//#{tag}").remove
+      end
+      doc
+    end
+    def get_plain_text
       doc = @doc.dup
       blacklist = ['title', 'script', 'style', 'button']
       nodelist = doc.search('//text()')
-      blacklist.each do |tag|
-        nodelist -= doc.search('//' + tag + '/text()')
-      end
       nodelist.text
     end

data/lib/jobparser/regex.rb CHANGED

@@ -3,6 +3,7 @@ require 'nokogiri'
 module JobParser
   SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
   SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
+  SALARY_UP_TO_REGEX = /(up to)(.+)£([\d,]*)/
   SALARY_TITLE_REGEX = /salary|\srate/i
   VACANCY_TITLE_REGEX = /vacancy|job title/i
   JOB_TITLE_ID_REGEX = /job(.?)title|title/i

data/lib/jobparser/scorer.rb CHANGED

@@ -24,6 +24,11 @@ module JobParser
     def top_match
       @matches.select { |k, v| v.score > 0 }.max_by { |k, v| v.score }.first
     end
+    def store_and_score(str, worth)
+      store(str, worth).and_score_now
+    end
   end
   class Match

data/lib/jobparser/version.rb CHANGED

@@ -1,3 +1,3 @@
 module JobParser
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jobparser
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-24 00:00:00.000000000 Z
+date: 2013-07-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler