jobparser 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/jobparser/cache.rb ADDED
@@ -0,0 +1,57 @@
1
+ require 'digest/md5'
2
+ require 'json'
3
+
4
+ module JobParser
5
+ class Cache
6
+
7
+ def has_cache_for_url?(url)
8
+ path = path_for_url(url)
9
+ File.exist?(path)
10
+ end
11
+
12
+ def fetch_result_for_url(url)
13
+ path = path_for_url(url)
14
+ obj = JSON.parse(IO.read(path))
15
+ sym_obj = {}
16
+ obj.each { |k, v| sym_obj[k.to_sym] = v }
17
+ sym_obj[:from_hash] = true
18
+ sym_obj
19
+ end
20
+
21
+ def store_to_file(job_hash)
22
+ url = job_hash[:url]
23
+ write_to_file(path_for_url(url), job_hash.to_json)
24
+ end
25
+
26
+ def cache_expired?(url)
27
+ !cache_not_expired?(url)
28
+ end
29
+
30
+ def cache_not_expired?(url)
31
+ time = File.mtime(path_for_url(url))
32
+ expire_time = time + JobParser.config[:cache_expire]
33
+ Time.now < expire_time
34
+ end
35
+
36
+ def clear_all
37
+ files = Dir[File.join(JobParser.config[:cache_location], "*.txt")]
38
+ files.each { |f| File.delete(f) }
39
+ end
40
+
41
+ private
42
+
43
+
44
+ def write_to_file(path, contents)
45
+ File.open(path, "w") { |f| f.puts(contents) }
46
+ end
47
+
48
+ def path_for_url(url)
49
+ cache_dir = JobParser.config[:cache_location]
50
+ File.join(cache_dir, md5_url(url))
51
+ end
52
+
53
+ def md5_url(url)
54
+ "#{Digest::MD5.hexdigest(url)}.txt"
55
+ end
56
+ end
57
+ end
@@ -2,16 +2,25 @@ require "nokogiri"
2
2
  module JobParser
3
3
  class Parser
4
4
  ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
5
- attr_reader :doc, :plain_text
6
5
 
7
6
  def initialize(html, from_url)
8
7
  @url = from_url
9
- @doc = strip_bad_elements(Nokogiri::HTML(html))
10
- @plain_text = get_plain_text
8
+ @html = html
11
9
  end
12
10
 
13
11
  def job
14
- { :url => @url,
12
+ if JobParser.config[:cache_on]
13
+ if JobParser.cache.has_cache_for_url?(@url)
14
+ if JobParser.cache.cache_not_expired?(@url)
15
+ return JobParser.cache.fetch_result_for_url(@url)
16
+ end
17
+ end
18
+ end
19
+
20
+ @doc = strip_bad_elements(Nokogiri::HTML(@html))
21
+ @plain_text = get_plain_text
22
+
23
+ result = { :url => @url,
15
24
  :salary => job_salary,
16
25
  :title => job_title,
17
26
  :apply => apply_link,
@@ -19,10 +28,17 @@ module JobParser
19
28
  :location => job_location,
20
29
  :deadline => deadline
21
30
  }
31
+
32
+ store_result_to_cache(result) if JobParser.config[:cache_on]
33
+ result
22
34
  end
23
35
 
24
36
  private
25
37
 
38
+ def store_result_to_cache(result)
39
+ JobParser.cache.store_to_file(result)
40
+ end
41
+
26
42
  def strip_bad_elements(doc)
27
43
  blacklist = ['script', 'style', 'button']
28
44
  blacklist.each do |tag|
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.5.2"
2
+ VERSION = "0.6.0"
3
3
  end
data/lib/jobparser.rb CHANGED
@@ -6,6 +6,7 @@ require "jobparser/parseurl"
6
6
  require "jobparser/cleaner"
7
7
  require "jobparser/scorer"
8
8
  require "jobparser/specialcases"
9
+ require "jobparser/cache"
9
10
  require "jobparser/facets/facet"
10
11
  require "jobparser/facets/salary"
11
12
  require "jobparser/facets/salarystring"
@@ -19,10 +20,34 @@ require "open-uri"
19
20
  module JobParser
20
21
  def self.parser(url)
21
22
  html = open(url, :allow_redirections => :safe).read
23
+
22
24
  if html.include?("http://schema.org/JobPosting")
23
25
  ParseSchema.new(html, url)
24
26
  else
25
27
  ParseHtml.new(html, url)
26
28
  end
27
29
  end
30
+
31
+ def self.config
32
+ @config
33
+ end
34
+
35
+ def self.cache
36
+ @cache
37
+ end
38
+
39
+ def self.configure(opts = {})
40
+ opts.each do |key, val|
41
+ @config[key.to_sym] = val if @config.keys.include?(key.to_sym)
42
+ end
43
+ end
44
+
45
+ @cache = Cache.new
46
+
47
+ @config = {
48
+ :cache_on => false,
49
+ :cache_expire => (1 * 60 * 60), # an hour
50
+ :cache_location => "cache"
51
+ }
52
+
28
53
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-02 00:00:00.000000000 Z
12
+ date: 2013-08-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -59,6 +59,22 @@ dependencies:
59
59
  - - ! '>='
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: timecop
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
62
78
  - !ruby/object:Gem::Dependency
63
79
  name: open_uri_redirections
64
80
  requirement: !ruby/object:Gem::Requirement
@@ -98,6 +114,7 @@ executables: []
98
114
  extensions: []
99
115
  extra_rdoc_files: []
100
116
  files:
117
+ - lib/jobparser/cache.rb
101
118
  - lib/jobparser/cleaner.rb
102
119
  - lib/jobparser/facets/apply.rb
103
120
  - lib/jobparser/facets/deadline.rb