RubyGems - jobparser - Versions diffs - 0.0.2 - Mend

jobparser 0.0.2

Files changed (8) hide show

data/lib/jobparser.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require "jobparser/version"
+require "jobparser/parsehtml"
+require "jobparser/parseurl"
+require "jobparser/cleaner"
+require "jobparser/scorer"
+# Top-level namespace for the jobparser gem.
+module JobParser
+  # Intentionally empty: this file only loads the components required above
+  # (version, HTML/URL parsers, cleaner, scorer), each of which reopens this module.
+end

data/lib/jobparser/cleaner.rb ADDED Viewed

@@ -0,0 +1,63 @@
+# encoding: UTF-8
+require "nokogiri"
+require "jobparser/regex"
+module JobParser
+  class Cleaner
+    SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
+    CLEAN_SALARY_REGEX = /,|\s/
+    NBSP = Nokogiri::HTML("&nbsp;").text
+    def initialize(ary, opts = {})
+      @subject = ary
+      @type = opts[:type]
+    end
+    def clean
+      clean_array
+    end
+    def self.clean_salary(salary_str)
+      SALARY_GROUP_REGEX.match(salary_str.gsub(CLEAN_SALARY_REGEX, "")) { |match|
+        [match[1].to_i, match[2].to_i]
+      }
+    end
+    def self.strip_string(str)
+      str.gsub('/n', '').gsub(NBSP, '').strip
+    end
+    def self.make_link_absolute(url, href)
+      if href.include?("http")
+        href
+      else
+        uri = URI.parse(url)
+        base = "#{uri.scheme}://#{uri.host}"
+        if base[-1] == "/" || href[0] == "/"
+          base + href
+        else
+          "#{base}/#{href}"
+        end
+      end
+    end
+    private
+    def clean_array
+      @subject.select { |item|
+        not_whitespace_or_empty(item)
+      }.map { |item|
+        clean_string(item) if item.is_a?(String)
+      }.uniq
+    end
+    def clean_string(str)
+      self.class.strip_string(str)
+    end
+    def not_whitespace_or_empty(item)
+      /^\s+$/.match(item) == nil && !item.empty?
+    end
+  end
+end

data/lib/jobparser/parsehtml.rb ADDED Viewed

@@ -0,0 +1,145 @@
+# encoding: utf-8
+require 'jobparser/regex.rb'
+require "nokogiri"
+module JobParser
+  class ParseHtml
+    ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
+    attr_reader :doc
+    def initialize(html, from_url)
+      @url = from_url
+      @doc = Nokogiri::HTML(html)
+      @doc.css("br").each { |br| br.replace "\n" }
+      @plain_text = strip_html
+    end
+    def job
+      { :url => @url,
+        :salary => job_salary,
+        :title => job_title,
+        :apply => apply_link,
+        :salary_string => job_salary_string,
+        :location => job_location
+      }
+    end
+    private
+    def job_location
+      LOCATION_REGEX.match(@plain_text.gsub(/\r|\t/, "")) { |m|
+        Cleaner.strip_string(m[1].to_s)
+      } || ""
+    end
+    def strip_html
+      doc = @doc.dup
+      blacklist = ['title', 'script', 'style', 'button']
+      nodelist = doc.search('//text()')
+      blacklist.each do |tag|
+        nodelist -= doc.search('//' + tag + '/text()')
+      end
+      nodelist.text
+    end
+    def loop_over_elements(&block)
+      elements.each do |name, elems|
+        elems.each do |elem|
+          yield name, elem
+        end
+      end
+    end
+    def clean_array(ary, type = nil)
+      Cleaner.new(ary, :type => type).clean
+    end
+    def job_salary_string
+      salary = ""
+      loop_over_elements do |name, elem|
+        SALARY_STRING_REGEX.match(@plain_text) { |m|
+          salary = m.to_s
+        }
+      end
+      Cleaner.strip_string(salary)
+    end
+    def job_salary
+      salary = ""
+      loop_over_elements do |name, elem|
+        SALARY_REGEX.match(@plain_text) { |m|
+          salary = m.to_s
+        }
+      end
+      salary.empty? ? nil : Cleaner.clean_salary(salary)
+    end
+    def job_title
+      title_scorer = Scorer.new
+      page_title = @doc.at_css("title").content
+      title_scorer.store(page_title, 20).and_score_now
+      # http://stackoverflow.com/questions/4476047/how-to-make-nokogiri-not-to-convert-nbsp-to-space
+      nbsp = Nokogiri::HTML("&nbsp;").text
+      # first see if we find something with a matching id
+      loop_over_elements do |name, elem|
+        # check the ID of the elements for matches
+        next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
+        content = Cleaner.strip_string(elem.content)
+        title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
+        # or if a heading element matches the page title
+        if elem_is_heading?(name)
+          title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
+        end
+        title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
+        VACANCY_TITLE_REGEX.match(content) {
+          if elem.next_element && !Cleaner.strip_string(elem.next_element.content).empty?
+            next_content = Cleaner.strip_string(elem.next_element.content)
+            title_scorer.store(next_content, 30).if_block_true {
+              ACCEPTED_ELEMENTS.include?(elem.next_element.name)
+            }
+          end
+        }
+      end
+      title_scorer.top_match.strip.gsub(nbsp, "")
+    end
+    def apply_link
+      link = nil
+      anchor_elements.each do |anchor|
+        APPLY_LINK_REGEX.match(anchor.content) { link = anchor }
+      end
+      if link
+        Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
+      else
+        @url
+      end
+    end
+    def elem_is_heading?(name)
+      %w{h1 h2 h3 h4 h5}.include?(name)
+    end
+    def heading_elements
+      elements.select { |elem| elem_is_heading?(elem) }
+    end
+    def anchor_elements
+      elements["a"]
+    end
+    def elements
+      {}.tap do |response|
+        ACCEPTED_ELEMENTS.each do |elem|
+          response[elem] = doc.css(elem).to_a
+        end
+      end
+    end
+  end
+end

data/lib/jobparser/parseurl.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require 'open_uri_redirections'
+module JobParser
+  class ParseUrl
+    def initialize(url)
+      ParseHtml.new(open(url, :allow_redirections => :safe).read, url)
+    end
+  end
+end

data/lib/jobparser/regex.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# encoding: utf-8
+require 'nokogiri'
+# Regular expressions and text constants shared by the jobparser classes.
+module JobParser
+  # Two "£" amounts on the same line with some text between them, e.g. a salary range.
+  SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
+  # Same range as SALARY_REGEX, optionally followed by whitespace and the text
+  # up to the end of that line.
+  SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
+  # Text containing "salary" or " rate" (case-insensitive).
+  SALARY_TITLE_REGEX = /salary|\srate/i
+  # Text containing "vacancy" or "job title" (case-insensitive).
+  VACANCY_TITLE_REGEX = /vacancy|job title/i
+  # Matched against element ids; any id containing "title" matches, so the
+  # first alternative is already covered by the second.
+  JOB_TITLE_ID_REGEX = /job(.?)title|title/i
+  # Link text that starts a line with "apply", or contains
+  # "submit an application" or "application form".
+  APPLY_LINK_REGEX = /^apply|submit an application|application form/i
+  # The text Nokogiri produces for the "&nbsp;" entity.
+  NBSP = Nokogiri::HTML("&nbsp;").text
+  # Captures the word, whitespace and "&" characters that follow "location: "
+  # up to the end of a line. \s also matches newlines, so the capture may
+  # extend over following lines.
+  LOCATION_REGEX = /(?:location: )([\w\s&]*)$/i
+  # Words commonly used in job listings. The author was unsure whether this is
+  # a good approach but thought it worth trying.
+  # The regex could be scoped to headers only.
+  JOB_TITLE_WORDS = /representative|sales|nurse|manager/i
+end

data/lib/jobparser/scorer.rb ADDED Viewed

@@ -0,0 +1,57 @@
+module JobParser
+  class Scorer
+    attr_reader :matches
+    def initialize
+      @matches = {}
+    end
+    def store(str, worth)
+      match = nil
+      if match = @matches[str]
+        match = Match.new(str, worth, match.score)
+      else
+        match = Match.new(str, worth)
+      end
+      @matches[str] = match
+      match
+    end
+    def score_for(str)
+      @matches[str].nil? ? 0 : @matches[str].score
+    end
+    def top_match
+      @matches.select { |k, v| v.score > 0 }.max_by { |k, v| v.score }.first
+    end
+  end
+  class Match
+    attr_accessor :str, :worth
+    attr_reader :score
+    def initialize(str, worth, score = 0)
+      @str = str
+      @score = score
+      @worth = worth
+    end
+    def if_regex_match(reg, str)
+      reg.match(str) {
+        @score += @worth
+        true
+      }
+      false
+    end
+    def if_block_true(&block)
+      res = yield
+      @score += @worth if res
+      res
+    end
+    def and_score_now
+      @score += @worth
+      self
+    end
+  end
+end

data/lib/jobparser/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module JobParser
+  # Version string of the jobparser gem.
+  VERSION = "0.0.2"
+end

metadata ADDED Viewed

@@ -0,0 +1,133 @@
+--- !ruby/object:Gem::Specification
+name: jobparser
+version: !ruby/object:Gem::Version
+  version: 0.0.2
+  prerelease:
+platform: ruby
+authors:
+- Jack Franklin
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-07-22 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: open_uri_redirections
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A parser for Job sites
+email:
+- jack@jackfranklin.net
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/jobparser/cleaner.rb
+- lib/jobparser/parsehtml.rb
+- lib/jobparser/parseurl.rb
+- lib/jobparser/regex.rb
+- lib/jobparser/scorer.rb
+- lib/jobparser/version.rb
+- lib/jobparser.rb
+homepage: ''
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: Parsing job sites
+test_files: []