jobparser 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ require 'digest/md5'
2
+ require 'json'
3
+
4
require 'fileutils'

module JobParser
  # File-backed cache of parsed job hashes.
  #
  # Each entry is a JSON document stored in the directory named by
  # JobParser.config[:cache_location]. The file name is the MD5 hex digest of
  # the job's URL with a ".txt" extension. An entry's age is taken from the
  # file's modification time and compared against
  # JobParser.config[:cache_expire] (a number of seconds).
  class Cache
    # @param url [String] URL the job was parsed from
    # @return [Boolean] true when a cache file exists for +url+
    def has_cache_for_url?(url)
      File.exist?(path_for_url(url))
    end

    # Loads the cached hash for +url+.
    #
    # Only the top-level keys are converted to symbols; nested structures keep
    # the string keys produced by JSON.parse. The returned hash additionally
    # carries :from_hash => true to flag that it was read from the cache file.
    #
    # @param url [String]
    # @return [Hash]
    # @raise [Errno::ENOENT] when no cache file exists for +url+
    # @raise [JSON::ParserError] when the cache file is not valid JSON
    def fetch_result_for_url(url)
      # File.read instead of IO.read: IO.read may treat a path beginning with
      # "|" as a command to run, File.read always treats it as a file name.
      cached = JSON.parse(File.read(path_for_url(url)))
      result = cached.each_with_object({}) do |(key, value), memo|
        memo[key.to_sym] = value
      end
      result[:from_hash] = true
      result
    end

    # Serialises +job_hash+ to JSON and writes it to the cache file derived
    # from job_hash[:url], creating the cache directory when it is missing.
    #
    # @param job_hash [Hash] parsed job; must contain a :url entry
    def store_to_file(job_hash)
      path = path_for_url(job_hash[:url])
      # Without this the very first write fails with Errno::ENOENT whenever
      # the configured cache directory has not been created yet.
      FileUtils.mkdir_p(File.dirname(path))
      write_to_file(path, job_hash.to_json)
    end

    # @param url [String]
    # @return [Boolean] true when the entry is missing or older than
    #   JobParser.config[:cache_expire] seconds
    def cache_expired?(url)
      !cache_not_expired?(url)
    end

    # @param url [String]
    # @return [Boolean] true when a cache file exists for +url+ and was
    #   modified less than JobParser.config[:cache_expire] seconds ago
    def cache_not_expired?(url)
      Time.now < File.mtime(path_for_url(url)) + JobParser.config[:cache_expire]
    rescue Errno::ENOENT
      # No file on disk: report the entry as expired instead of raising.
      false
    end

    # Deletes every "*.txt" file in the cache directory.
    #
    # @return [Array<String>] the paths that were deleted
    def clear_all
      files = Dir[File.join(JobParser.config[:cache_location], "*.txt")]
      files.each { |f| File.delete(f) }
    end

    private

    # Overwrites +path+ with +contents+ (puts appends a trailing newline).
    def write_to_file(path, contents)
      File.open(path, "w") { |f| f.puts(contents) }
    end

    # Absolute or relative path (depending on the configured cache location)
    # of the cache file for +url+.
    def path_for_url(url)
      File.join(JobParser.config[:cache_location], md5_url(url))
    end

    # File name for +url+: its MD5 hex digest plus ".txt".
    def md5_url(url)
      "#{Digest::MD5.hexdigest(url)}.txt"
    end
  end
end
@@ -2,16 +2,25 @@ require "nokogiri"
2
2
  module JobParser
3
3
  class Parser
4
4
  ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
5
- attr_reader :doc, :plain_text
6
5
 
7
6
  def initialize(html, from_url)
8
7
  @url = from_url
9
- @doc = strip_bad_elements(Nokogiri::HTML(html))
10
- @plain_text = get_plain_text
8
+ @html = html
11
9
  end
12
10
 
13
11
  def job
14
- { :url => @url,
12
+ if JobParser.config[:cache_on]
13
+ if JobParser.cache.has_cache_for_url?(@url)
14
+ if JobParser.cache.cache_not_expired?(@url)
15
+ return JobParser.cache.fetch_result_for_url(@url)
16
+ end
17
+ end
18
+ end
19
+
20
+ @doc = strip_bad_elements(Nokogiri::HTML(@html))
21
+ @plain_text = get_plain_text
22
+
23
+ result = { :url => @url,
15
24
  :salary => job_salary,
16
25
  :title => job_title,
17
26
  :apply => apply_link,
@@ -19,10 +28,17 @@ module JobParser
19
28
  :location => job_location,
20
29
  :deadline => deadline
21
30
  }
31
+
32
+ store_result_to_cache(result) if JobParser.config[:cache_on]
33
+ result
22
34
  end
23
35
 
24
36
  private
25
37
 
38
+ def store_result_to_cache(result)
39
+ JobParser.cache.store_to_file(result)
40
+ end
41
+
26
42
  def strip_bad_elements(doc)
27
43
  blacklist = ['script', 'style', 'button']
28
44
  blacklist.each do |tag|
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.5.2"
2
+ VERSION = "0.6.0"
3
3
  end
data/lib/jobparser.rb CHANGED
@@ -6,6 +6,7 @@ require "jobparser/parseurl"
6
6
  require "jobparser/cleaner"
7
7
  require "jobparser/scorer"
8
8
  require "jobparser/specialcases"
9
+ require "jobparser/cache"
9
10
  require "jobparser/facets/facet"
10
11
  require "jobparser/facets/salary"
11
12
  require "jobparser/facets/salarystring"
@@ -19,10 +20,34 @@ require "open-uri"
19
20
  module JobParser
20
21
  def self.parser(url)
21
22
  html = open(url, :allow_redirections => :safe).read
23
+
22
24
  if html.include?("http://schema.org/JobPosting")
23
25
  ParseSchema.new(html, url)
24
26
  else
25
27
  ParseHtml.new(html, url)
26
28
  end
27
29
  end
30
+
31
+ def self.config
32
+ @config
33
+ end
34
+
35
+ def self.cache
36
+ @cache
37
+ end
38
+
39
+ def self.configure(opts = {})
40
+ opts.each do |key, val|
41
+ @config[key.to_sym] = val if @config.keys.include?(key.to_sym)
42
+ end
43
+ end
44
+
45
+ @cache = Cache.new
46
+
47
+ @config = {
48
+ :cache_on => false,
49
+ :cache_expire => (1 * 60 * 60), # an hour
50
+ :cache_location => "cache"
51
+ }
52
+
28
53
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-02 00:00:00.000000000 Z
12
+ date: 2013-08-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -59,6 +59,22 @@ dependencies:
59
59
  - - ! '>='
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: timecop
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
62
78
  - !ruby/object:Gem::Dependency
63
79
  name: open_uri_redirections
64
80
  requirement: !ruby/object:Gem::Requirement
@@ -98,6 +114,7 @@ executables: []
98
114
  extensions: []
99
115
  extra_rdoc_files: []
100
116
  files:
117
+ - lib/jobparser/cache.rb
101
118
  - lib/jobparser/cleaner.rb
102
119
  - lib/jobparser/facets/apply.rb
103
120
  - lib/jobparser/facets/deadline.rb