kabutops 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9f6f70b8bec88cff6caf4d04cb1e24e058e411c0
|
4
|
+
data.tar.gz: c7c77ccc1785e79e2b18b7c00aeae9d8d80954d3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b85f06cbc5310797e30d7cbf4397a4ae08843e4377f478fffecb4a28db4f80884c3a9bf77542e257480fdb2c6a9ca940d35274d215b6eb0fbbf7004fad4ed39
|
7
|
+
data.tar.gz: f77ef30437460ca717e9e87dcdae84ff5509ff25c3983783bec803d624f7721ca796e3ec222934549b82a68c07821cbb8f6af0f74ee977fe8a91a78557bca4a4
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -64,22 +64,13 @@ module Kabutops
|
|
64
64
|
resource = Hashie::Mash.new(resource)
|
65
65
|
|
66
66
|
adapters = self.class.adapters.select do |adapter|
|
67
|
-
if params.skip_existing
|
68
|
-
adapter.find(resource).nil?
|
69
|
-
else
|
70
|
-
true
|
71
|
-
end
|
67
|
+
params.skip_existing ? adapter.find(resource).nil? : true
|
72
68
|
end
|
73
69
|
|
74
70
|
return if adapters.nil?
|
75
|
-
|
76
71
|
page = crawl(resource)
|
77
|
-
|
78
72
|
return if page.nil?
|
79
|
-
|
80
|
-
save = (self.class.notify(:store_if, resource, page) || []).all?
|
81
|
-
|
82
|
-
return unless save
|
73
|
+
return unless (self.class.notify(:store_if, resource, page) || []).all?
|
83
74
|
|
84
75
|
adapters.each do |adapter|
|
85
76
|
adapter.process(resource, page)
|
@@ -105,19 +96,7 @@ module Kabutops
|
|
105
96
|
end
|
106
97
|
|
107
98
|
def crawl resource
|
108
|
-
page = nil
|
109
|
-
cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
|
110
|
-
|
111
|
-
content = Cachy.cache_if(params.cache, cache_key) do
|
112
|
-
sleep params[:wait] || 0 # wait only if value is not from cache
|
113
|
-
body = agent.get(resource[:url]).body
|
114
|
-
body.encode!('utf-8', params[:encoding]) if params[:encoding]
|
115
|
-
page = Nokogiri::HTML(body)
|
116
|
-
self.class.notify(:before_cache, resource, page)
|
117
|
-
body
|
118
|
-
end
|
119
|
-
|
120
|
-
page = Nokogiri::HTML(content) if page.nil?
|
99
|
+
page = get_cache_or_hit(resource)
|
121
100
|
self.class.notify(:after_crawl, resource, page)
|
122
101
|
page
|
123
102
|
rescue Mechanize::ResponseCodeError => e
|
@@ -129,6 +108,26 @@ module Kabutops
|
|
129
108
|
end
|
130
109
|
end
|
131
110
|
|
111
|
+
def get_cache_or_hit resource
|
112
|
+
cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
|
113
|
+
page = nil
|
114
|
+
|
115
|
+
content = Cachy.cache_if(params.cache, cache_key) do
|
116
|
+
sleep params[:wait] || 0 # wait only if value is not from cache
|
117
|
+
page = get_page(resource[:url])
|
118
|
+
self.class.notify(:before_cache, resource, page)
|
119
|
+
page.to_s
|
120
|
+
end
|
121
|
+
|
122
|
+
page ? page : Nokogiri::HTML(content)
|
123
|
+
end
|
124
|
+
|
125
|
+
def get_page url
|
126
|
+
body = agent.get(url).body
|
127
|
+
body.encode!('utf-8', params[:encoding]) if params[:encoding]
|
128
|
+
Nokogiri::HTML(body)
|
129
|
+
end
|
130
|
+
|
132
131
|
def agent
|
133
132
|
if params[:agent].is_a?(Proc)
|
134
133
|
@agent = params[:agent].call
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-09-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 2.6.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 2.6.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: cachy
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -249,3 +249,4 @@ specification_version: 4
|
|
249
249
|
summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
|
250
250
|
for an anonymity.
|
251
251
|
test_files: []
|
252
|
+
has_rdoc:
|