kabutops 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/kabutops/crawler.rb +9 -4
- data/lib/kabutops/spider.rb +6 -2
- data/lib/kabutops/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b4f48640fb6de941353f09dee76176c4ce9ff07
|
4
|
+
data.tar.gz: 41173f3f0826601540cdcd783da88c4c4dae9169
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72097b1fa2b063c628fe40c1f02557c40ee4eba0c25e020140511e6154b1e3bcf5b05d89d07c1cd10aaeec6b1b54f03fccd9b25415bf7bfa11661c2403d1a995
|
7
|
+
data.tar.gz: c1d42e0efdcc0ba81dfd3fe7c885786c7e639d7a022c00d9999bfa48cd1c75d7d0cc7b8be13c143ce9e84ee017ce3571f24ecc024967c3fd9859af628d296abd
|
data/README.md
CHANGED
data/lib/kabutops/crawler.rb
CHANGED
@@ -17,7 +17,7 @@ module Kabutops
|
|
17
17
|
|
18
18
|
params :collection, :proxy, :cache, :wait,
|
19
19
|
:skip_existing, :agent
|
20
|
-
callbacks :after_crawl
|
20
|
+
callbacks :after_crawl, :before_cache
|
21
21
|
|
22
22
|
def adapters
|
23
23
|
@adapters ||= []
|
@@ -99,13 +99,18 @@ module Kabutops
|
|
99
99
|
end
|
100
100
|
|
101
101
|
def crawl resource
|
102
|
-
|
102
|
+
page = nil
|
103
|
+
cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
|
104
|
+
|
103
105
|
content = Cachy.cache_if(params.cache, cache_key) do
|
104
106
|
sleep params[:wait] || 0 # wait only if value is not from cache
|
105
|
-
agent.get(resource[:url]).body
|
107
|
+
body = agent.get(resource[:url]).body
|
108
|
+
page = Nokogiri::HTML(body)
|
109
|
+
self.class.notify(:before_cache, resource, page)
|
110
|
+
body
|
106
111
|
end
|
107
112
|
|
108
|
-
page = Nokogiri::HTML(content)
|
113
|
+
page = Nokogiri::HTML(content) if page.nil?
|
109
114
|
self.class.notify(:after_crawl, resource, page)
|
110
115
|
page
|
111
116
|
end
|
data/lib/kabutops/spider.rb
CHANGED
@@ -5,7 +5,7 @@ module Kabutops
|
|
5
5
|
class Spider < Crawler
|
6
6
|
class << self
|
7
7
|
params :url
|
8
|
-
callbacks :after_crawl, :follow_if
|
8
|
+
callbacks :after_crawl, :before_cache, :follow_if
|
9
9
|
|
10
10
|
def debug_spider
|
11
11
|
enable_debug
|
@@ -14,6 +14,10 @@ module Kabutops
|
|
14
14
|
})
|
15
15
|
end
|
16
16
|
|
17
|
+
def crawl collection=nil
|
18
|
+
super(collection || [{ url: params.url, }])
|
19
|
+
end
|
20
|
+
|
17
21
|
def << resource
|
18
22
|
if resource_status(resource).nil?
|
19
23
|
resource_status(resource, 'new')
|
@@ -72,7 +76,7 @@ module Kabutops
|
|
72
76
|
follow = self.class.notify(:follow_if, a['href']).any?
|
73
77
|
if follow
|
74
78
|
self << {
|
75
|
-
url: a['href']
|
79
|
+
url: URI.join(params.url, URI.escape(a['href'])).to_s
|
76
80
|
}
|
77
81
|
end
|
78
82
|
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-07-
|
11
|
+
date: 2014-07-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|