kabutops 0.0.15 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/kabutops/adapters/database_adapter.rb +13 -13
- data/lib/kabutops/crawler.rb +5 -1
- data/lib/kabutops/recipe.rb +13 -2
- data/lib/kabutops/spider.rb +14 -5
- data/lib/kabutops/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 05df1a21e10c723541c9cc4514174959b794dc27
|
4
|
+
data.tar.gz: 6d60b8c1b4acf024f50b8c363b905c8d167f503c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5cf88297902defb328c85d39233af7a8f11fc20935c193a4a9cc568d38c844f3657973ea58425db4d719b6faa943df7edc71e179e06425ef39911cadf5599d21
|
7
|
+
data.tar.gz: 47e08167bb5129ef665310c2d332c57ae071d9755a2c5f133b03b158d1ae1520126ef1ab8ff58c2f70564bd93ac6ccac0a613e2eae49e8ca657d63bfe6024814
|
data/README.md
CHANGED
data/lib/kabutops/adapters/database_adapter.rb
CHANGED
@@ -12,25 +12,25 @@ module Kabutops
|
|
12
12
|
|
13
13
|
callbacks :after_save, :save_if
|
14
14
|
|
15
|
-
def data &block
|
16
|
-
@recipe = Recipe.new
|
15
|
+
def data params={}, &block
|
16
|
+
@recipe = Recipe.new(params)
|
17
17
|
@recipe.instance_eval &block
|
18
18
|
end
|
19
19
|
|
20
20
|
def process resource, page
|
21
21
|
raise 'data block not defined' unless @recipe
|
22
22
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
23
|
+
[@recipe.process(resource, page)].flatten.each do |result|
|
24
|
+
result.update(updated_at: Time.now.to_i)
|
25
|
+
save = (notify(:save_if, resource, page, result) || []).all?
|
26
|
+
|
27
|
+
if debug
|
28
|
+
logger.info("#{self.class.to_s} outputs:")
|
29
|
+
logger.info(save ? result.to_hash : 'not valid for save')
|
30
|
+
elsif save
|
31
|
+
store(result)
|
32
|
+
notify(:after_save, result)
|
33
|
+
end
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -17,7 +17,7 @@ module Kabutops
|
|
17
17
|
|
18
18
|
params :collection, :proxy, :cache, :wait,
|
19
19
|
:skip_existing, :agent
|
20
|
-
callbacks :after_crawl, :before_cache
|
20
|
+
callbacks :after_crawl, :before_cache, :store_if
|
21
21
|
|
22
22
|
def adapters
|
23
23
|
@adapters ||= []
|
@@ -77,6 +77,10 @@ module Kabutops
|
|
77
77
|
|
78
78
|
return if page.nil?
|
79
79
|
|
80
|
+
save = (self.class.notify(:store_if, resource, page) || []).all?
|
81
|
+
|
82
|
+
return unless save
|
83
|
+
|
80
84
|
adapters.each do |adapter|
|
81
85
|
adapter.process(resource, page)
|
82
86
|
end
|
data/lib/kabutops/recipe.rb
CHANGED
@@ -5,7 +5,8 @@ module Kabutops
|
|
5
5
|
class Recipe
|
6
6
|
attr_reader :items
|
7
7
|
|
8
|
-
def initialize
|
8
|
+
def initialize params={}
|
9
|
+
@params = Hashie::Mash.new(params)
|
9
10
|
@items = Hashie::Mash.new
|
10
11
|
@nested = false
|
11
12
|
end
|
@@ -23,10 +24,20 @@ module Kabutops
|
|
23
24
|
end
|
24
25
|
|
25
26
|
def process resource, page
|
27
|
+
if @params[:each]
|
28
|
+
page.xpath(@params[:each]).map{ |n| process_one(resource, n) }
|
29
|
+
elsif @params[:each_css]
|
30
|
+
page.css(@params[:each_css]).map{ |n| process_one(resource, n) }
|
31
|
+
else
|
32
|
+
process_one(resource, page)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def process_one resource, node
|
26
37
|
result = Hashie::Mash.new
|
27
38
|
|
28
39
|
@items.each do |name, item|
|
29
|
-
result[name] = item.process(resource, page)
|
40
|
+
result[name] = item.process(resource, node)
|
30
41
|
end
|
31
42
|
|
32
43
|
result
|
data/lib/kabutops/spider.rb
CHANGED
@@ -18,6 +18,11 @@ module Kabutops
|
|
18
18
|
super(collection || [{ url: params.url, }])
|
19
19
|
end
|
20
20
|
|
21
|
+
def reset!
|
22
|
+
super
|
23
|
+
redis.keys.each{ |k| redis.del(k) }
|
24
|
+
end
|
25
|
+
|
21
26
|
def << resource
|
22
27
|
if resource_status(resource).nil?
|
23
28
|
resource_status(resource, 'new')
|
@@ -25,6 +30,12 @@ module Kabutops
|
|
25
30
|
end
|
26
31
|
end
|
27
32
|
|
33
|
+
def follow link
|
34
|
+
self << {
|
35
|
+
url: URI.join(params.url, URI.escape(link)).to_s
|
36
|
+
}
|
37
|
+
end
|
38
|
+
|
28
39
|
def resource_status resource, status=nil
|
29
40
|
url_status(resource[:url], status)
|
30
41
|
end
|
@@ -73,12 +84,10 @@ module Kabutops
|
|
73
84
|
|
74
85
|
def after_crawl resource, page
|
75
86
|
page.css('a').each do |a|
|
87
|
+
next if a['href'].nil?
|
88
|
+
|
76
89
|
follow = self.class.notify(:follow_if, a['href']).any?
|
77
|
-
if follow
|
78
|
-
self << {
|
79
|
-
url: URI.join(params.url, URI.escape(a['href'])).to_s
|
80
|
-
}
|
81
|
-
end
|
90
|
+
self.class.follow(a['href']) if follow
|
82
91
|
end
|
83
92
|
end
|
84
93
|
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.15
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-07-
|
11
|
+
date: 2014-07-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|