kabutops 0.0.11 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/kabutops/crawler.rb +9 -4
- data/lib/kabutops/spider.rb +6 -2
- data/lib/kabutops/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b4f48640fb6de941353f09dee76176c4ce9ff07
|
4
|
+
data.tar.gz: 41173f3f0826601540cdcd783da88c4c4dae9169
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72097b1fa2b063c628fe40c1f02557c40ee4eba0c25e020140511e6154b1e3bcf5b05d89d07c1cd10aaeec6b1b54f03fccd9b25415bf7bfa11661c2403d1a995
|
7
|
+
data.tar.gz: c1d42e0efdcc0ba81dfd3fe7c885786c7e639d7a022c00d9999bfa48cd1c75d7d0cc7b8be13c143ce9e84ee017ce3571f24ecc024967c3fd9859af628d296abd
|
data/README.md
CHANGED
data/lib/kabutops/crawler.rb
CHANGED
@@ -17,7 +17,7 @@ module Kabutops
|
|
17
17
|
|
18
18
|
params :collection, :proxy, :cache, :wait,
|
19
19
|
:skip_existing, :agent
|
20
|
-
callbacks :after_crawl
|
20
|
+
callbacks :after_crawl, :before_cache
|
21
21
|
|
22
22
|
def adapters
|
23
23
|
@adapters ||= []
|
@@ -99,13 +99,18 @@ module Kabutops
|
|
99
99
|
end
|
100
100
|
|
101
101
|
def crawl resource
|
102
|
-
|
102
|
+
page = nil
|
103
|
+
cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
|
104
|
+
|
103
105
|
content = Cachy.cache_if(params.cache, cache_key) do
|
104
106
|
sleep params[:wait] || 0 # wait only if value is not from cache
|
105
|
-
agent.get(resource[:url]).body
|
107
|
+
body = agent.get(resource[:url]).body
|
108
|
+
page = Nokogiri::HTML(body)
|
109
|
+
self.class.notify(:before_cache, resource, page)
|
110
|
+
body
|
106
111
|
end
|
107
112
|
|
108
|
-
page = Nokogiri::HTML(content)
|
113
|
+
page = Nokogiri::HTML(content) if page.nil?
|
109
114
|
self.class.notify(:after_crawl, resource, page)
|
110
115
|
page
|
111
116
|
end
|
data/lib/kabutops/spider.rb
CHANGED
@@ -5,7 +5,7 @@ module Kabutops
|
|
5
5
|
class Spider < Crawler
|
6
6
|
class << self
|
7
7
|
params :url
|
8
|
-
callbacks :after_crawl, :follow_if
|
8
|
+
callbacks :after_crawl, :before_cache, :follow_if
|
9
9
|
|
10
10
|
def debug_spider
|
11
11
|
enable_debug
|
@@ -14,6 +14,10 @@ module Kabutops
|
|
14
14
|
})
|
15
15
|
end
|
16
16
|
|
17
|
+
def crawl collection=nil
|
18
|
+
super(collection || [{ url: params.url, }])
|
19
|
+
end
|
20
|
+
|
17
21
|
def << resource
|
18
22
|
if resource_status(resource).nil?
|
19
23
|
resource_status(resource, 'new')
|
@@ -72,7 +76,7 @@ module Kabutops
|
|
72
76
|
follow = self.class.notify(:follow_if, a['href']).any?
|
73
77
|
if follow
|
74
78
|
self << {
|
75
|
-
url: a['href']
|
79
|
+
url: URI.join(params.url, URI.escape(a['href'])).to_s
|
76
80
|
}
|
77
81
|
end
|
78
82
|
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-07-
|
11
|
+
date: 2014-07-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|