jobparser 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jobparser/cache.rb +57 -0
- data/lib/jobparser/parser.rb +20 -4
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +25 -0
- metadata +19 -2
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'digest/md5'
require 'fileutils'
require 'json'
|
3
|
+
|
4
|
+
module JobParser
  # File-backed cache of parsed job results.
  #
  # Each result is serialised as JSON into a file named "<md5 of url>.txt"
  # inside the directory given by JobParser.config[:cache_location]. An entry
  # is considered fresh while the file's mtime plus
  # JobParser.config[:cache_expire] (added to a Time, i.e. seconds) is still
  # in the future.
  class Cache
    # @param url [String] URL the job was parsed from
    # @return [Boolean] true when a cache file exists for +url+
    def has_cache_for_url?(url)
      File.exist?(path_for_url(url))
    end

    # Loads the cached result for +url+.
    #
    # Top-level keys are converted back to symbols (JSON stores them as
    # strings) and :from_hash is set to true to mark the result as cached.
    #
    # @param url [String]
    # @return [Hash{Symbol => Object}]
    # @raise [Errno::ENOENT] when no cache file exists for +url+
    # @raise [JSON::ParserError] when the cache file is not valid JSON
    def fetch_result_for_url(url)
      stored = JSON.parse(IO.read(path_for_url(url)))
      result = {}
      stored.each { |key, value| result[key.to_sym] = value }
      result[:from_hash] = true
      result
    end

    # Writes +job_hash+ as JSON to the cache file derived from its :url.
    #
    # The cache directory is created on demand so that the first write does
    # not fail with Errno::ENOENT when the configured location is missing.
    #
    # @param job_hash [Hash] parsed job; must contain a String under :url
    def store_to_file(job_hash)
      path = path_for_url(job_hash[:url])
      FileUtils.mkdir_p(File.dirname(path))
      write_to_file(path, job_hash.to_json)
    end

    # @param url [String]
    # @return [Boolean] true when the entry is stale or does not exist
    def cache_expired?(url)
      !cache_not_expired?(url)
    end

    # @param url [String]
    # @return [Boolean] true only when a cache file exists and is still
    #   within the configured expiry window
    def cache_not_expired?(url)
      written_at = File.mtime(path_for_url(url))
      Time.now < written_at + JobParser.config[:cache_expire]
    rescue Errno::ENOENT
      # No cache file (or it vanished between checks): nothing fresh to serve.
      false
    end

    # Deletes every "*.txt" file in the cache directory.
    #
    # @return [Array<String>] the paths that were deleted
    def clear_all
      files = Dir[File.join(JobParser.config[:cache_location], "*.txt")]
      files.each { |f| File.delete(f) }
    end

    private

    # Overwrites +path+ with +contents+ followed by a newline.
    def write_to_file(path, contents)
      File.open(path, "w") { |f| f.puts(contents) }
    end

    # Absolute or relative path of the cache file for +url+.
    def path_for_url(url)
      File.join(JobParser.config[:cache_location], md5_url(url))
    end

    # File name for +url+: hex MD5 digest with a ".txt" extension.
    def md5_url(url)
      "#{Digest::MD5.hexdigest(url)}.txt"
    end
  end
end
|
data/lib/jobparser/parser.rb
CHANGED
@@ -2,16 +2,25 @@ require "nokogiri"
|
|
2
2
|
module JobParser
|
3
3
|
class Parser
|
4
4
|
ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
|
5
|
-
attr_reader :doc, :plain_text
|
6
5
|
|
7
6
|
def initialize(html, from_url)
|
8
7
|
@url = from_url
|
9
|
-
@
|
10
|
-
@plain_text = get_plain_text
|
8
|
+
@html = html
|
11
9
|
end
|
12
10
|
|
13
11
|
def job
|
14
|
-
|
12
|
+
if JobParser.config[:cache_on]
|
13
|
+
if JobParser.cache.has_cache_for_url?(@url)
|
14
|
+
if JobParser.cache.cache_not_expired?(@url)
|
15
|
+
return JobParser.cache.fetch_result_for_url(@url)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
@doc = strip_bad_elements(Nokogiri::HTML(@html))
|
21
|
+
@plain_text = get_plain_text
|
22
|
+
|
23
|
+
result = { :url => @url,
|
15
24
|
:salary => job_salary,
|
16
25
|
:title => job_title,
|
17
26
|
:apply => apply_link,
|
@@ -19,10 +28,17 @@ module JobParser
|
|
19
28
|
:location => job_location,
|
20
29
|
:deadline => deadline
|
21
30
|
}
|
31
|
+
|
32
|
+
store_result_to_cache(result) if JobParser.config[:cache_on]
|
33
|
+
result
|
22
34
|
end
|
23
35
|
|
24
36
|
private
|
25
37
|
|
38
|
+
# Hands a freshly parsed job hash to the shared JobParser cache for storage.
def store_result_to_cache(result)
  cache = JobParser.cache
  cache.store_to_file(result)
end
|
41
|
+
|
26
42
|
def strip_bad_elements(doc)
|
27
43
|
blacklist = ['script', 'style', 'button']
|
28
44
|
blacklist.each do |tag|
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
@@ -6,6 +6,7 @@ require "jobparser/parseurl"
|
|
6
6
|
require "jobparser/cleaner"
|
7
7
|
require "jobparser/scorer"
|
8
8
|
require "jobparser/specialcases"
|
9
|
+
require "jobparser/cache"
|
9
10
|
require "jobparser/facets/facet"
|
10
11
|
require "jobparser/facets/salary"
|
11
12
|
require "jobparser/facets/salarystring"
|
@@ -19,10 +20,34 @@ require "open-uri"
|
|
19
20
|
module JobParser
  # Shared cache instance handed out by JobParser.cache.
  @cache = Cache.new

  # Default settings. Only the keys listed here can be changed via configure.
  @config = {
    :cache_on => false,
    :cache_expire => (1 * 60 * 60), # an hour
    :cache_location => "cache"
  }

  # Downloads +url+ and picks a parser for the page: ParseSchema when the
  # HTML mentions the schema.org JobPosting vocabulary, ParseHtml otherwise.
  #
  # NOTE(review): this relies on Kernel#open being extended for URLs (the
  # file requires "open-uri"); Kernel#open can also spawn a subprocess for
  # strings starting with "|", so confirm +url+ is never untrusted input.
  #
  # @param url [String] address of the job page
  # @return [ParseSchema, ParseHtml]
  def self.parser(url)
    html = open(url, :allow_redirections => :safe).read

    if html.include?("http://schema.org/JobPosting")
      ParseSchema.new(html, url)
    else
      ParseHtml.new(html, url)
    end
  end

  # @return [Hash{Symbol => Object}] the live settings hash
  def self.config
    @config
  end

  # @return [Cache] the shared cache instance
  def self.cache
    @cache
  end

  # Overrides known settings. Keys may be given as strings or symbols;
  # keys that are not already present in the defaults are ignored.
  #
  # @param opts [Hash] settings to override
  # @return [Hash] +opts+, unchanged
  def self.configure(opts = {})
    opts.each do |key, val|
      name = key.to_sym
      @config[name] = val if @config.key?(name)
    end
  end

end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -59,6 +59,22 @@ dependencies:
|
|
59
59
|
- - ! '>='
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: timecop
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
62
78
|
- !ruby/object:Gem::Dependency
|
63
79
|
name: open_uri_redirections
|
64
80
|
requirement: !ruby/object:Gem::Requirement
|
@@ -98,6 +114,7 @@ executables: []
|
|
98
114
|
extensions: []
|
99
115
|
extra_rdoc_files: []
|
100
116
|
files:
|
117
|
+
- lib/jobparser/cache.rb
|
101
118
|
- lib/jobparser/cleaner.rb
|
102
119
|
- lib/jobparser/facets/apply.rb
|
103
120
|
- lib/jobparser/facets/deadline.rb
|