jobparser 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jobparser/cache.rb +57 -0
- data/lib/jobparser/parser.rb +20 -4
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +25 -0
- metadata +19 -2
| @@ -0,0 +1,57 @@ | |
| 1 | 
            +
            require 'digest/md5'
         | 
| 2 | 
            +
            require 'json'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module JobParser
         | 
| 5 | 
            +
              class Cache
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                def has_cache_for_url?(url)
         | 
| 8 | 
            +
                  path = path_for_url(url)
         | 
| 9 | 
            +
                  File.exist?(path)
         | 
| 10 | 
            +
                end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                def fetch_result_for_url(url)
         | 
| 13 | 
            +
                  path = path_for_url(url)
         | 
| 14 | 
            +
                  obj = JSON.parse(IO.read(path))
         | 
| 15 | 
            +
                  sym_obj = {}
         | 
| 16 | 
            +
                  obj.each { |k, v| sym_obj[k.to_sym] = v }
         | 
| 17 | 
            +
                  sym_obj[:from_hash] = true
         | 
| 18 | 
            +
                  sym_obj
         | 
| 19 | 
            +
                end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                def store_to_file(job_hash)
         | 
| 22 | 
            +
                  url = job_hash[:url]
         | 
| 23 | 
            +
                  write_to_file(path_for_url(url), job_hash.to_json)
         | 
| 24 | 
            +
                end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                def cache_expired?(url)
         | 
| 27 | 
            +
                  !cache_not_expired?(url)
         | 
| 28 | 
            +
                end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                def cache_not_expired?(url)
         | 
| 31 | 
            +
                  time = File.mtime(path_for_url(url))
         | 
| 32 | 
            +
                  expire_time = time + JobParser.config[:cache_expire]
         | 
| 33 | 
            +
                  Time.now < expire_time
         | 
| 34 | 
            +
                end
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                def clear_all
         | 
| 37 | 
            +
                  files = Dir[File.join(JobParser.config[:cache_location], "*.txt")]
         | 
| 38 | 
            +
                  files.each { |f| File.delete(f) }
         | 
| 39 | 
            +
                end
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                private
         | 
| 42 | 
            +
             | 
| 43 | 
            +
             | 
| 44 | 
            +
                def write_to_file(path, contents)
         | 
| 45 | 
            +
                  File.open(path, "w") { |f| f.puts(contents) }
         | 
| 46 | 
            +
                end
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                def path_for_url(url)
         | 
| 49 | 
            +
                  cache_dir = JobParser.config[:cache_location]
         | 
| 50 | 
            +
                  File.join(cache_dir, md5_url(url))
         | 
| 51 | 
            +
                end
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                def md5_url(url)
         | 
| 54 | 
            +
                  "#{Digest::MD5.hexdigest(url)}.txt"
         | 
| 55 | 
            +
                end
         | 
| 56 | 
            +
              end
         | 
| 57 | 
            +
            end
         | 
    
        data/lib/jobparser/parser.rb
    CHANGED
    
    | @@ -2,16 +2,25 @@ require "nokogiri" | |
| 2 2 | 
             
            module JobParser
         | 
| 3 3 | 
             
              class Parser
         | 
| 4 4 | 
             
                ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
         | 
| 5 | 
            -
                attr_reader :doc, :plain_text
         | 
| 6 5 |  | 
| 7 6 | 
             
                def initialize(html, from_url)
         | 
| 8 7 | 
             
                  @url = from_url
         | 
| 9 | 
            -
                  @doc = strip_bad_elements(Nokogiri::HTML(html))
         | 
| 10 | 
            -
                  @plain_text = get_plain_text
         | 
| 8 | 
            +
                  @html = html
         | 
| 11 9 | 
             
                end
         | 
| 12 10 |  | 
| 13 11 | 
             
                def job
         | 
| 14 | 
            -
                   | 
| 12 | 
            +
                  if JobParser.config[:cache_on]
         | 
| 13 | 
            +
                    if JobParser.cache.has_cache_for_url?(@url)
         | 
| 14 | 
            +
                      if JobParser.cache.cache_not_expired?(@url)
         | 
| 15 | 
            +
                        return JobParser.cache.fetch_result_for_url(@url)
         | 
| 16 | 
            +
                      end
         | 
| 17 | 
            +
                    end
         | 
| 18 | 
            +
                  end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                  @doc = strip_bad_elements(Nokogiri::HTML(@html))
         | 
| 21 | 
            +
                  @plain_text = get_plain_text
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                  result = { :url => @url,
         | 
| 15 24 | 
             
                    :salary => job_salary,
         | 
| 16 25 | 
             
                    :title => job_title,
         | 
| 17 26 | 
             
                    :apply => apply_link,
         | 
| @@ -19,10 +28,17 @@ module JobParser | |
| 19 28 | 
             
                    :location => job_location,
         | 
| 20 29 | 
             
                    :deadline => deadline
         | 
| 21 30 | 
             
                  }
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  store_result_to_cache(result) if JobParser.config[:cache_on]
         | 
| 33 | 
            +
                  result
         | 
| 22 34 | 
             
                end
         | 
| 23 35 |  | 
| 24 36 | 
             
                private
         | 
| 25 37 |  | 
| 38 | 
            +
                def store_result_to_cache(result)
         | 
| 39 | 
            +
                  JobParser.cache.store_to_file(result)
         | 
| 40 | 
            +
                end
         | 
| 41 | 
            +
             | 
| 26 42 | 
             
                def strip_bad_elements(doc)
         | 
| 27 43 | 
             
                  blacklist = ['script', 'style', 'button']
         | 
| 28 44 | 
             
                  blacklist.each do |tag|
         | 
    
        data/lib/jobparser/version.rb
    CHANGED
    
    
    
        data/lib/jobparser.rb
    CHANGED
    
    | @@ -6,6 +6,7 @@ require "jobparser/parseurl" | |
| 6 6 | 
             
            require "jobparser/cleaner"
         | 
| 7 7 | 
             
            require "jobparser/scorer"
         | 
| 8 8 | 
             
            require "jobparser/specialcases"
         | 
| 9 | 
            +
            require "jobparser/cache"
         | 
| 9 10 | 
             
            require "jobparser/facets/facet"
         | 
| 10 11 | 
             
            require "jobparser/facets/salary"
         | 
| 11 12 | 
             
            require "jobparser/facets/salarystring"
         | 
| @@ -19,10 +20,34 @@ require "open-uri" | |
| 19 20 | 
             
            module JobParser
         | 
| 20 21 | 
             
              def self.parser(url)
         | 
| 21 22 | 
             
                html = open(url, :allow_redirections => :safe).read
         | 
| 23 | 
            +
             | 
| 22 24 | 
             
                if html.include?("http://schema.org/JobPosting")
         | 
| 23 25 | 
             
                  ParseSchema.new(html, url)
         | 
| 24 26 | 
             
                else
         | 
| 25 27 | 
             
                  ParseHtml.new(html, url)
         | 
| 26 28 | 
             
                end
         | 
| 27 29 | 
             
              end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
              def self.config
         | 
| 32 | 
            +
                @config
         | 
| 33 | 
            +
              end
         | 
| 34 | 
            +
             | 
| 35 | 
            +
              def self.cache
         | 
| 36 | 
            +
                @cache
         | 
| 37 | 
            +
              end
         | 
| 38 | 
            +
             | 
| 39 | 
            +
              def self.configure(opts = {})
         | 
| 40 | 
            +
                opts.each do |key, val|
         | 
| 41 | 
            +
                  @config[key.to_sym] = val if @config.keys.include?(key.to_sym)
         | 
| 42 | 
            +
                end
         | 
| 43 | 
            +
              end
         | 
| 44 | 
            +
             | 
| 45 | 
            +
              @cache = Cache.new
         | 
| 46 | 
            +
             | 
| 47 | 
            +
              @config = {
         | 
| 48 | 
            +
                :cache_on => false,
         | 
| 49 | 
            +
                :cache_expire => (1 * 60 * 60), # an hour
         | 
| 50 | 
            +
                :cache_location => "cache"
         | 
| 51 | 
            +
              }
         | 
| 52 | 
            +
             | 
| 28 53 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: jobparser
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.5.2
         | 
| 4 | 
            +
              version: 0.6.0
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -9,7 +9,7 @@ authors: | |
| 9 9 | 
             
            autorequire: 
         | 
| 10 10 | 
             
            bindir: bin
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date: 2013-08- | 
| 12 | 
            +
            date: 2013-08-05 00:00:00.000000000 Z
         | 
| 13 13 | 
             
            dependencies:
         | 
| 14 14 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 15 | 
             
              name: bundler
         | 
| @@ -59,6 +59,22 @@ dependencies: | |
| 59 59 | 
             
                - - ! '>='
         | 
| 60 60 | 
             
                  - !ruby/object:Gem::Version
         | 
| 61 61 | 
             
                    version: '0'
         | 
| 62 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 63 | 
            +
              name: timecop
         | 
| 64 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 65 | 
            +
                none: false
         | 
| 66 | 
            +
                requirements:
         | 
| 67 | 
            +
                - - ! '>='
         | 
| 68 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 69 | 
            +
                    version: '0'
         | 
| 70 | 
            +
              type: :development
         | 
| 71 | 
            +
              prerelease: false
         | 
| 72 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 73 | 
            +
                none: false
         | 
| 74 | 
            +
                requirements:
         | 
| 75 | 
            +
                - - ! '>='
         | 
| 76 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 77 | 
            +
                    version: '0'
         | 
| 62 78 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 63 79 | 
             
              name: open_uri_redirections
         | 
| 64 80 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -98,6 +114,7 @@ executables: [] | |
| 98 114 | 
             
            extensions: []
         | 
| 99 115 | 
             
            extra_rdoc_files: []
         | 
| 100 116 | 
             
            files:
         | 
| 117 | 
            +
            - lib/jobparser/cache.rb
         | 
| 101 118 | 
             
            - lib/jobparser/cleaner.rb
         | 
| 102 119 | 
             
            - lib/jobparser/facets/apply.rb
         | 
| 103 120 | 
             
            - lib/jobparser/facets/deadline.rb
         |