kabutops 0.0.15 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/kabutops/adapters/database_adapter.rb +13 -13
- data/lib/kabutops/crawler.rb +5 -1
- data/lib/kabutops/recipe.rb +13 -2
- data/lib/kabutops/spider.rb +14 -5
- data/lib/kabutops/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 05df1a21e10c723541c9cc4514174959b794dc27
|
4
|
+
data.tar.gz: 6d60b8c1b4acf024f50b8c363b905c8d167f503c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5cf88297902defb328c85d39233af7a8f11fc20935c193a4a9cc568d38c844f3657973ea58425db4d719b6faa943df7edc71e179e06425ef39911cadf5599d21
|
7
|
+
data.tar.gz: 47e08167bb5129ef665310c2d332c57ae071d9755a2c5f133b03b158d1ae1520126ef1ab8ff58c2f70564bd93ac6ccac0a613e2eae49e8ca657d63bfe6024814
|
data/README.md
CHANGED
data/lib/kabutops/adapters/database_adapter.rb
CHANGED
@@ -12,25 +12,25 @@ module Kabutops
|
|
12
12
|
|
13
13
|
callbacks :after_save, :save_if
|
14
14
|
|
15
|
-
def data &block
|
16
|
-
@recipe = Recipe.new
|
15
|
+
def data params={}, &block
|
16
|
+
@recipe = Recipe.new(params)
|
17
17
|
@recipe.instance_eval &block
|
18
18
|
end
|
19
19
|
|
20
20
|
def process resource, page
|
21
21
|
raise 'data block not defined' unless @recipe
|
22
22
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
23
|
+
[@recipe.process(resource, page)].flatten.each do |result|
|
24
|
+
result.update(updated_at: Time.now.to_i)
|
25
|
+
save = (notify(:save_if, resource, page, result) || []).all?
|
26
|
+
|
27
|
+
if debug
|
28
|
+
logger.info("#{self.class.to_s} outputs:")
|
29
|
+
logger.info(save ? result.to_hash : 'not valid for save')
|
30
|
+
elsif save
|
31
|
+
store(result)
|
32
|
+
notify(:after_save, result)
|
33
|
+
end
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -17,7 +17,7 @@ module Kabutops
|
|
17
17
|
|
18
18
|
params :collection, :proxy, :cache, :wait,
|
19
19
|
:skip_existing, :agent
|
20
|
-
callbacks :after_crawl, :before_cache
|
20
|
+
callbacks :after_crawl, :before_cache, :store_if
|
21
21
|
|
22
22
|
def adapters
|
23
23
|
@adapters ||= []
|
@@ -77,6 +77,10 @@ module Kabutops
|
|
77
77
|
|
78
78
|
return if page.nil?
|
79
79
|
|
80
|
+
save = (self.class.notify(:store_if, resource, page) || []).all?
|
81
|
+
|
82
|
+
return unless save
|
83
|
+
|
80
84
|
adapters.each do |adapter|
|
81
85
|
adapter.process(resource, page)
|
82
86
|
end
|
data/lib/kabutops/recipe.rb
CHANGED
@@ -5,7 +5,8 @@ module Kabutops
|
|
5
5
|
class Recipe
|
6
6
|
attr_reader :items
|
7
7
|
|
8
|
-
def initialize
|
8
|
+
def initialize params={}
|
9
|
+
@params = Hashie::Mash.new(params)
|
9
10
|
@items = Hashie::Mash.new
|
10
11
|
@nested = false
|
11
12
|
end
|
@@ -23,10 +24,20 @@ module Kabutops
|
|
23
24
|
end
|
24
25
|
|
25
26
|
def process resource, page
|
27
|
+
if @params[:each]
|
28
|
+
page.xpath(@params[:each]).map{ |n| process_one(resource, n) }
|
29
|
+
elsif @params[:each_css]
|
30
|
+
page.css(@params[:each_css]).map{ |n| process_one(resource, n) }
|
31
|
+
else
|
32
|
+
process_one(resource, page)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def process_one resource, node
|
26
37
|
result = Hashie::Mash.new
|
27
38
|
|
28
39
|
@items.each do |name, item|
|
29
|
-
result[name] = item.process(resource, page)
|
40
|
+
result[name] = item.process(resource, node)
|
30
41
|
end
|
31
42
|
|
32
43
|
result
|
data/lib/kabutops/spider.rb
CHANGED
@@ -18,6 +18,11 @@ module Kabutops
|
|
18
18
|
super(collection || [{ url: params.url, }])
|
19
19
|
end
|
20
20
|
|
21
|
+
def reset!
|
22
|
+
super
|
23
|
+
redis.keys.each{ |k| redis.del(k) }
|
24
|
+
end
|
25
|
+
|
21
26
|
def << resource
|
22
27
|
if resource_status(resource).nil?
|
23
28
|
resource_status(resource, 'new')
|
@@ -25,6 +30,12 @@ module Kabutops
|
|
25
30
|
end
|
26
31
|
end
|
27
32
|
|
33
|
+
def follow link
|
34
|
+
self << {
|
35
|
+
url: URI.join(params.url, URI.escape(link)).to_s
|
36
|
+
}
|
37
|
+
end
|
38
|
+
|
28
39
|
def resource_status resource, status=nil
|
29
40
|
url_status(resource[:url], status)
|
30
41
|
end
|
@@ -73,12 +84,10 @@ module Kabutops
|
|
73
84
|
|
74
85
|
def after_crawl resource, page
|
75
86
|
page.css('a').each do |a|
|
87
|
+
next if a['href'].nil?
|
88
|
+
|
76
89
|
follow = self.class.notify(:follow_if, a['href']).any?
|
77
|
-
if follow
|
78
|
-
self << {
|
79
|
-
url: URI.join(params.url, URI.escape(a['href'])).to_s
|
80
|
-
}
|
81
|
-
end
|
90
|
+
self.class.follow(a['href']) if follow
|
82
91
|
end
|
83
92
|
end
|
84
93
|
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.15
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-07-
|
11
|
+
date: 2014-07-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|