kabutops 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9f6f70b8bec88cff6caf4d04cb1e24e058e411c0
|
4
|
+
data.tar.gz: c7c77ccc1785e79e2b18b7c00aeae9d8d80954d3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b85f06cbc5310797e30d7cbf4397a4ae08843e4377f478fffecb4a28db4f80884c3a9bf77542e257480fdb2c6a9ca940d35274d215b6eb0fbbf7004fad4ed39
|
7
|
+
data.tar.gz: f77ef30437460ca717e9e87dcdae84ff5509ff25c3983783bec803d624f7721ca796e3ec222934549b82a68c07821cbb8f6af0f74ee977fe8a91a78557bca4a4
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -64,22 +64,13 @@ module Kabutops
|
|
64
64
|
resource = Hashie::Mash.new(resource)
|
65
65
|
|
66
66
|
adapters = self.class.adapters.select do |adapter|
|
67
|
-
|
68
|
-
adapter.find(resource).nil?
|
69
|
-
else
|
70
|
-
true
|
71
|
-
end
|
67
|
+
params.skip_existing ? adapter.find(resource).nil? : true
|
72
68
|
end
|
73
69
|
|
74
70
|
return if adapters.nil?
|
75
|
-
|
76
71
|
page = crawl(resource)
|
77
|
-
|
78
72
|
return if page.nil?
|
79
|
-
|
80
|
-
save = (self.class.notify(:store_if, resource, page) || []).all?
|
81
|
-
|
82
|
-
return unless save
|
73
|
+
return unless (self.class.notify(:store_if, resource, page) || []).all?
|
83
74
|
|
84
75
|
adapters.each do |adapter|
|
85
76
|
adapter.process(resource, page)
|
@@ -105,19 +96,7 @@ module Kabutops
|
|
105
96
|
end
|
106
97
|
|
107
98
|
def crawl resource
|
108
|
-
page = nil
|
109
|
-
cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
|
110
|
-
|
111
|
-
content = Cachy.cache_if(params.cache, cache_key) do
|
112
|
-
sleep params[:wait] || 0 # wait only if value is not from cache
|
113
|
-
body = agent.get(resource[:url]).body
|
114
|
-
body.encode!('utf-8', params[:encoding]) if params[:encoding]
|
115
|
-
page = Nokogiri::HTML(body)
|
116
|
-
self.class.notify(:before_cache, resource, page)
|
117
|
-
body
|
118
|
-
end
|
119
|
-
|
120
|
-
page = Nokogiri::HTML(content) if page.nil?
|
99
|
+
page = get_cache_or_hit(resource)
|
121
100
|
self.class.notify(:after_crawl, resource, page)
|
122
101
|
page
|
123
102
|
rescue Mechanize::ResponseCodeError => e
|
@@ -129,6 +108,26 @@ module Kabutops
|
|
129
108
|
end
|
130
109
|
end
|
131
110
|
|
111
|
+
def get_cache_or_hit resource
|
112
|
+
cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
|
113
|
+
page = nil
|
114
|
+
|
115
|
+
content = Cachy.cache_if(params.cache, cache_key) do
|
116
|
+
sleep params[:wait] || 0 # wait only if value is not from cache
|
117
|
+
page = get_page(resource[:url])
|
118
|
+
self.class.notify(:before_cache, resource, page)
|
119
|
+
page.to_s
|
120
|
+
end
|
121
|
+
|
122
|
+
page ? page : Nokogiri::HTML(content)
|
123
|
+
end
|
124
|
+
|
125
|
+
def get_page url
|
126
|
+
body = agent.get(url).body
|
127
|
+
body.encode!('utf-8', params[:encoding]) if params[:encoding]
|
128
|
+
Nokogiri::HTML(body)
|
129
|
+
end
|
130
|
+
|
132
131
|
def agent
|
133
132
|
if params[:agent].is_a?(Proc)
|
134
133
|
@agent = params[:agent].call
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-09-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 2.6.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 2.6.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: cachy
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -249,3 +249,4 @@ specification_version: 4
|
|
249
249
|
summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
|
250
250
|
for an anonymity.
|
251
251
|
test_files: []
|
252
|
+
has_rdoc:
|