jobparser 0.13.8 → 0.13.9
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- data/lib/jobparser/cache/mongostore.rb +28 -30
- data/lib/jobparser/cache/textfile.rb +17 -14
- data/lib/jobparser/parsehtml.rb +7 -7
- data/lib/jobparser/parser.rb +7 -5
- data/lib/jobparser/version.rb +1 -1
- metadata +2 -2
@@ -4,51 +4,46 @@ module JobParser
|
|
4
4
|
class MongoStore
|
5
5
|
|
6
6
|
def has_cache_for_url?(url)
|
7
|
-
|
7
|
+
job_for_url(url).count > 0
|
8
8
|
end
|
9
9
|
|
10
10
|
def store(hash)
|
11
|
-
|
12
|
-
hash =
|
11
|
+
job_for_url(hash[:url]).delete
|
12
|
+
hash = strip_fields_not_stored(hash)
|
13
13
|
Job.create(hash)
|
14
14
|
end
|
15
15
|
|
16
16
|
def cache_expired?(url)
|
17
|
-
job =
|
17
|
+
job = job_for_url(url).first
|
18
18
|
expire_time = (job.created_at + JobParser.config[:cache_expire])
|
19
19
|
Time.now > expire_time
|
20
20
|
end
|
21
21
|
|
22
22
|
def get(url)
|
23
|
-
job =
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
new_obj[k.to_sym] = v
|
23
|
+
job = job_for_url(url).first
|
24
|
+
{}.tap do |job_obj|
|
25
|
+
job.attributes.each do |k, v|
|
26
|
+
job_obj[k.to_sym] = v unless %w{created_at _id updated_at}.include?(k)
|
28
27
|
end
|
29
|
-
|
30
|
-
|
31
|
-
new_obj
|
28
|
+
job_obj[:from_cache] = true
|
29
|
+
end
|
32
30
|
end
|
33
31
|
|
34
32
|
def clear_all
|
35
|
-
MongoStore::Job.each do |job|
|
36
|
-
job.delete
|
37
|
-
end
|
33
|
+
MongoStore::Job.each(&:delete)
|
38
34
|
end
|
39
35
|
|
40
36
|
def view_cache
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
37
|
+
[].tap do |res|
|
38
|
+
Job.each do |job|
|
39
|
+
res.push({
|
40
|
+
:url => job.url,
|
41
|
+
:created => job.created_at
|
42
|
+
})
|
43
|
+
end
|
47
44
|
end
|
48
|
-
res
|
49
45
|
end
|
50
46
|
|
51
|
-
|
52
47
|
class Job
|
53
48
|
include Mongoid::Document
|
54
49
|
include Mongoid::Timestamps
|
@@ -57,6 +52,7 @@ module JobParser
|
|
57
52
|
benefits education_requirements incentives industry
|
58
53
|
occupational_category qualifications responsibilities skills special_commitments work_hours
|
59
54
|
}
|
55
|
+
|
60
56
|
EXTRA_SCHEMA_TEXT_FIELDS.each do |f|
|
61
57
|
field f.to_sym, :type => String
|
62
58
|
end
|
@@ -68,20 +64,22 @@ module JobParser
|
|
68
64
|
field :salary_string, :type => String
|
69
65
|
field :location, :type => String
|
70
66
|
field :deadline, :type => String
|
71
|
-
field :postcode,
|
67
|
+
field :postcode, :type => String
|
72
68
|
field :schema, :type => Boolean, :default => false
|
73
|
-
|
74
69
|
end
|
75
70
|
|
76
71
|
private
|
77
72
|
|
78
|
-
def
|
79
|
-
|
80
|
-
excluded_fields = [:from_cache]
|
81
|
-
hash.each { |k, v| new_hash[k] = v unless excluded_fields.include?(k) }
|
82
|
-
new_hash
|
73
|
+
def job_for_url(url)
|
74
|
+
Job.where(:url => url)
|
83
75
|
end
|
84
76
|
|
77
|
+
def strip_fields_not_stored(hash)
|
78
|
+
{}.tap do |new_hash|
|
79
|
+
excluded_fields = [:from_cache]
|
80
|
+
hash.each { |k, v| new_hash[k] = v unless excluded_fields.include?(k) }
|
81
|
+
end
|
82
|
+
end
|
85
83
|
end
|
86
84
|
end
|
87
85
|
end
|
@@ -4,20 +4,17 @@ module JobParser
|
|
4
4
|
class Cache
|
5
5
|
class TextFile
|
6
6
|
def has_cache_for_url?(url)
|
7
|
-
|
8
|
-
File.exist?(path)
|
7
|
+
File.exist?(path_for_url(url))
|
9
8
|
end
|
10
9
|
|
11
10
|
def store(job_hash)
|
12
|
-
|
13
|
-
write_to_file(path_for_url(url), job_hash.to_json)
|
11
|
+
write_to_file(path_for_url(job_hash[:url]), job_hash.to_json)
|
14
12
|
end
|
15
13
|
|
16
14
|
def get(url)
|
17
15
|
path = path_for_url(url)
|
18
16
|
obj = JSON.parse(IO.read(path))
|
19
|
-
sym_obj =
|
20
|
-
obj.each { |k, v| sym_obj[k.to_sym] = v }
|
17
|
+
sym_obj = make_object_keys_symbols(obj)
|
21
18
|
sym_obj[:from_cache] = true
|
22
19
|
sym_obj
|
23
20
|
end
|
@@ -33,19 +30,25 @@ module JobParser
|
|
33
30
|
end
|
34
31
|
|
35
32
|
def view_cache
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
33
|
+
[].tap do |res|
|
34
|
+
cache_files.each do |f|
|
35
|
+
contents = JSON.parse(IO.read(f))
|
36
|
+
res.push({
|
37
|
+
:url => contents["url"],
|
38
|
+
:created => File.mtime(f)
|
39
|
+
})
|
40
|
+
end
|
43
41
|
end
|
44
|
-
res
|
45
42
|
end
|
46
43
|
|
47
44
|
private
|
48
45
|
|
46
|
+
def make_object_keys_symbols(obj)
|
47
|
+
{}.tap do |sym_obj|
|
48
|
+
obj.each { |k, v| sym_obj[k.to_sym] = v }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
49
52
|
def cache_files
|
50
53
|
Dir[File.join(JobParser.config[:cache_location], "*.txt")]
|
51
54
|
end
|
data/lib/jobparser/parsehtml.rb
CHANGED
@@ -10,31 +10,31 @@ module JobParser
|
|
10
10
|
private
|
11
11
|
|
12
12
|
def job_location
|
13
|
-
Facets::Location.new(
|
13
|
+
Facets::Location.new(*facet_args).parse
|
14
14
|
end
|
15
15
|
|
16
16
|
def job_salary_string
|
17
|
-
Facets::SalaryString.new(
|
17
|
+
Facets::SalaryString.new(*facet_args).parse
|
18
18
|
end
|
19
19
|
|
20
20
|
def job_salary
|
21
|
-
Facets::Salary.new(
|
21
|
+
Facets::Salary.new(*facet_args).parse
|
22
22
|
end
|
23
23
|
|
24
24
|
def job_title
|
25
|
-
Facets::Title.new(
|
25
|
+
Facets::Title.new(*facet_args).parse
|
26
26
|
end
|
27
27
|
|
28
28
|
def apply_link
|
29
|
-
Facets::Apply.new(
|
29
|
+
Facets::Apply.new(*facet_args).parse
|
30
30
|
end
|
31
31
|
|
32
32
|
def deadline
|
33
|
-
Facets::Deadline.new(
|
33
|
+
Facets::Deadline.new(*facet_args).parse
|
34
34
|
end
|
35
35
|
|
36
36
|
def job_postcode
|
37
|
-
Facets::Postcode.new(
|
37
|
+
Facets::Postcode.new(*facet_args).parse
|
38
38
|
end
|
39
39
|
end
|
40
40
|
end
|
data/lib/jobparser/parser.rb
CHANGED
@@ -6,6 +6,8 @@ module JobParser
|
|
6
6
|
def initialize(html, from_url)
|
7
7
|
@url = from_url
|
8
8
|
@html = html
|
9
|
+
@doc = strip_bad_elements(Nokogiri::HTML(@html))
|
10
|
+
@plain_text = get_plain_text
|
9
11
|
end
|
10
12
|
|
11
13
|
def job
|
@@ -13,10 +15,7 @@ module JobParser
|
|
13
15
|
return JobParser.cache.fetch_result_for_url(@url)
|
14
16
|
end
|
15
17
|
|
16
|
-
|
17
|
-
@plain_text = get_plain_text
|
18
|
-
|
19
|
-
result = { :url => @url,
|
18
|
+
{ :url => @url,
|
20
19
|
:salary => job_salary,
|
21
20
|
:title => job_title,
|
22
21
|
:apply => apply_link,
|
@@ -25,11 +24,14 @@ module JobParser
|
|
25
24
|
:deadline => deadline,
|
26
25
|
:postcode => job_postcode
|
27
26
|
}
|
28
|
-
result
|
29
27
|
end
|
30
28
|
|
31
29
|
private
|
32
30
|
|
31
|
+
def facet_args
|
32
|
+
[@doc, @url, @plain_text]
|
33
|
+
end
|
34
|
+
|
33
35
|
def cache(result)
|
34
36
|
if JobParser.config[:cache_on]
|
35
37
|
store_result_to_cache(result)
|
data/lib/jobparser/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.8
|
4
|
+
version: 0.13.9
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|