jobparser 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/jobparser/cache.rb +57 -0
- data/lib/jobparser/parser.rb +20 -4
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +25 -0
- metadata +19 -2
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
module JobParser
  # File-backed cache of parsed job results, one JSON file per URL.
  #
  # Entries are stored in the directory named by
  # JobParser.config[:cache_location]. Each file is named after the MD5 hex
  # digest of the URL with a ".txt" suffix. An entry's age is derived from the
  # file's modification time and compared against
  # JobParser.config[:cache_expire] (a number of seconds).
  class Cache

    # @param url [String] the job page URL
    # @return [Boolean] true when a cache file exists for +url+
    def has_cache_for_url?(url)
      File.exist?(path_for_url(url))
    end

    # Loads the cached result for +url+.
    #
    # Only the top-level keys are converted to symbols; nested structures are
    # returned exactly as JSON.parse produced them. The returned hash always
    # carries :from_hash => true so callers can tell it came from the cache.
    #
    # @param url [String] the job page URL
    # @return [Hash] the stored result with symbol keys
    # @raise [Errno::ENOENT] when no cache file exists for +url+
    # @raise [JSON::ParserError] when the cache file is not valid JSON
    def fetch_result_for_url(url)
      stored = JSON.parse(File.read(path_for_url(url)))
      result = stored.each_with_object({}) do |(key, value), memo|
        memo[key.to_sym] = value
      end
      result[:from_hash] = true
      result
    end

    # Serialises +job_hash+ as JSON into the cache file for job_hash[:url].
    #
    # @param job_hash [Hash] a parse result; must contain the :url key
    def store_to_file(job_hash)
      write_to_file(path_for_url(job_hash[:url]), job_hash.to_json)
    end

    # @param url [String] the job page URL
    # @return [Boolean] true when the entry is too old or does not exist
    def cache_expired?(url)
      !cache_not_expired?(url)
    end

    # @param url [String] the job page URL
    # @return [Boolean] true when a cache file exists and was written less
    #   than JobParser.config[:cache_expire] seconds ago
    def cache_not_expired?(url)
      written_at = File.mtime(path_for_url(url))
      Time.now < written_at + JobParser.config[:cache_expire]
    rescue Errno::ENOENT
      # A missing file (never cached, or removed between an existence check
      # and this call) cannot be fresh, so report it as expired instead of
      # raising.
      false
    end

    # Deletes every "*.txt" file in the cache directory.
    #
    # @return [Array<String>] the paths that were deleted
    def clear_all
      pattern = File.join(JobParser.config[:cache_location], "*.txt")
      Dir[pattern].each { |file| File.delete(file) }
    end

    private

    # Overwrites +path+ with +contents+ followed by a newline.
    def write_to_file(path, contents)
      File.open(path, "w") { |file| file.puts(contents) }
    end

    # Absolute or relative path of the cache file belonging to +url+.
    def path_for_url(url)
      File.join(JobParser.config[:cache_location], md5_url(url))
    end

    # File name used for +url+: MD5 hex digest plus ".txt".
    def md5_url(url)
      "#{Digest::MD5.hexdigest(url)}.txt"
    end
  end
end
|
data/lib/jobparser/parser.rb
CHANGED
@@ -2,16 +2,25 @@ require "nokogiri"
|
|
2
2
|
module JobParser
|
3
3
|
class Parser
|
4
4
|
ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
|
5
|
-
attr_reader :doc, :plain_text
|
6
5
|
|
7
6
|
def initialize(html, from_url)
|
8
7
|
@url = from_url
|
9
|
-
@
|
10
|
-
@plain_text = get_plain_text
|
8
|
+
@html = html
|
11
9
|
end
|
12
10
|
|
13
11
|
def job
|
14
|
-
|
12
|
+
if JobParser.config[:cache_on]
|
13
|
+
if JobParser.cache.has_cache_for_url?(@url)
|
14
|
+
if JobParser.cache.cache_not_expired?(@url)
|
15
|
+
return JobParser.cache.fetch_result_for_url(@url)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
@doc = strip_bad_elements(Nokogiri::HTML(@html))
|
21
|
+
@plain_text = get_plain_text
|
22
|
+
|
23
|
+
result = { :url => @url,
|
15
24
|
:salary => job_salary,
|
16
25
|
:title => job_title,
|
17
26
|
:apply => apply_link,
|
@@ -19,10 +28,17 @@ module JobParser
|
|
19
28
|
:location => job_location,
|
20
29
|
:deadline => deadline
|
21
30
|
}
|
31
|
+
|
32
|
+
store_result_to_cache(result) if JobParser.config[:cache_on]
|
33
|
+
result
|
22
34
|
end
|
23
35
|
|
24
36
|
private
|
25
37
|
|
38
|
+
# Hands a freshly built result hash to the shared JobParser cache so it is
# written to its cache file.
def store_result_to_cache(result)
  shared_cache = JobParser.cache
  shared_cache.store_to_file(result)
end
|
41
|
+
|
26
42
|
def strip_bad_elements(doc)
|
27
43
|
blacklist = ['script', 'style', 'button']
|
28
44
|
blacklist.each do |tag|
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
@@ -6,6 +6,7 @@ require "jobparser/parseurl"
|
|
6
6
|
require "jobparser/cleaner"
|
7
7
|
require "jobparser/scorer"
|
8
8
|
require "jobparser/specialcases"
|
9
|
+
require "jobparser/cache"
|
9
10
|
require "jobparser/facets/facet"
|
10
11
|
require "jobparser/facets/salary"
|
11
12
|
require "jobparser/facets/salarystring"
|
@@ -19,10 +20,34 @@ require "open-uri"
|
|
19
20
|
module JobParser
  # Downloads +url+ and returns the parser matching the page's markup.
  #
  # Pages whose HTML mentions the schema.org JobPosting vocabulary are handed
  # to ParseSchema; every other page goes to ParseHtml.
  #
  # NOTE(review): the page is always downloaded here, before any cache lookup
  # the returned parser may perform — confirm this is intended.
  # NOTE(review): Kernel#open also accepts local paths and "|command"
  # strings, so +url+ should only come from trusted input — confirm callers.
  #
  # @param url [String] address of the job posting
  # @return [ParseSchema, ParseHtml] a parser built from the page and its URL
  def self.parser(url)
    html = open(url, :allow_redirections => :safe).read

    parser_class = html.include?("http://schema.org/JobPosting") ? ParseSchema : ParseHtml
    parser_class.new(html, url)
  end

  # @return [Hash] the live configuration hash (:cache_on, :cache_expire,
  #   :cache_location)
  def self.config
    @config
  end

  # @return [Cache] the cache instance shared by all parsers
  def self.cache
    @cache
  end

  # Overrides configuration values.
  #
  # Keys may be strings or symbols. Keys that are not already present in the
  # configuration are ignored, so only the known options can be set.
  #
  # @param opts [Hash] option name => new value
  # @return [Hash] +opts+, unchanged
  def self.configure(opts = {})
    opts.each do |key, val|
      name = key.to_sym
      # Hash#key? is a direct lookup; no intermediate key array is built.
      @config[name] = val if @config.key?(name)
    end
  end

  @cache = Cache.new

  # Defaults; change them through JobParser.configure.
  @config = {
    :cache_on => false,
    :cache_expire => (1 * 60 * 60), # an hour
    :cache_location => "cache"
  }

end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -59,6 +59,22 @@ dependencies:
|
|
59
59
|
- - ! '>='
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: timecop
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
62
78
|
- !ruby/object:Gem::Dependency
|
63
79
|
name: open_uri_redirections
|
64
80
|
requirement: !ruby/object:Gem::Requirement
|
@@ -98,6 +114,7 @@ executables: []
|
|
98
114
|
extensions: []
|
99
115
|
extra_rdoc_files: []
|
100
116
|
files:
|
117
|
+
- lib/jobparser/cache.rb
|
101
118
|
- lib/jobparser/cleaner.rb
|
102
119
|
- lib/jobparser/facets/apply.rb
|
103
120
|
- lib/jobparser/facets/deadline.rb
|