kabutops 0.0.10 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +22 -5
- data/lib/kabutops.rb +1 -3
- data/lib/kabutops/configuration.rb +5 -0
- data/lib/kabutops/crawler.rb +24 -20
- data/lib/kabutops/spider.rb +82 -0
- data/lib/kabutops/version.rb +1 -1
- metadata +3 -18
- data/lib/kabutops/adapters/sequel.rb +0 -47
- data/lib/kabutops/crawler_extensions/sequel.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 491dade458c54b02dbcdd36da92010fcaa1a5463
|
4
|
+
data.tar.gz: 9d00eb9b6ad5e61ddb27dfa7137f0734e9eda03f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9d00a7d31dfa94e85c9f2d31c49d56b3df2da5fd1b0580c769724f64bad12b331bc8c87a3432071944bc427543458ffe0257743ba5f6dd1840e7e0a8674e7e88
|
7
|
+
data.tar.gz: aa6a67c2dcbaf8e14cf6d4a38e94b78979ef9f594ad492e5618bd3d1db36fbca305895e1c756cae2f355d21045fd82489ee315eb7150a2cac7f17d6fbc744fdb
|
data/README.md
CHANGED
@@ -8,7 +8,6 @@ With Kabutops you can save data easily to:
|
|
8
8
|
* ElasticSearch
|
9
9
|
* MongoDB
|
10
10
|
* Redis
|
11
|
-
* SQL Databases (via Sequel)
|
12
11
|
|
13
12
|
Examples for every kind of database are located
|
14
13
|
in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
|
@@ -25,9 +24,11 @@ gem install kabutops
|
|
25
24
|
Or you can put it in your Gemfile
|
26
25
|
|
27
26
|
```ruby
|
28
|
-
gem 'kabutops', '~> 0.0.
|
27
|
+
gem 'kabutops', '~> 0.0.11'
|
29
28
|
```
|
30
29
|
|
30
|
+
You will also need a Redis database installed and running.
|
31
|
+
|
31
32
|
Basic example
|
32
33
|
-------------
|
33
34
|
|
@@ -199,12 +200,28 @@ class MyCrawler < Kabutops::Crawler
|
|
199
200
|
end
|
200
201
|
```
|
201
202
|
|
203
|
+
JavaScript-heavy site
|
204
|
+
---------------------
|
205
|
+
|
206
|
+
Crawling this kind of site can be achieved by using a non-default agent
|
207
|
+
(default is Mechanize.new).
|
208
|
+
|
209
|
+
```ruby
|
210
|
+
class MyCrawler < Kabutops::Crawler
|
211
|
+
...
|
212
|
+
agent Bogeyman::Client.new
|
213
|
+
...
|
214
|
+
end
|
215
|
+
```
|
216
|
+
|
217
|
+
[Bogeyman](https://github.com/reneklacan/bogeyman-ruby-client)
|
218
|
+
is a wrapper built upon PhantomJS.
|
219
|
+
|
202
220
|
TODO
|
203
221
|
----
|
204
222
|
|
205
|
-
* Watchdog for Mongo
|
206
|
-
* skip_existing for Mongo, Redis
|
207
|
-
* Spider
|
223
|
+
* Watchdog for Mongo
|
224
|
+
* skip_existing feature for Mongo, Redis
|
208
225
|
|
209
226
|
License
|
210
227
|
-------
|
data/lib/kabutops.rb
CHANGED
@@ -11,7 +11,6 @@ require 'elasticsearch'
|
|
11
11
|
require 'redis'
|
12
12
|
require 'redis-namespace'
|
13
13
|
require 'mongo'
|
14
|
-
require 'sequel'
|
15
14
|
require 'mysql2'
|
16
15
|
require 'logger'
|
17
16
|
|
@@ -30,12 +29,11 @@ require 'kabutops/adapters/database_adapter'
|
|
30
29
|
require 'kabutops/adapters/elastic_search'
|
31
30
|
require 'kabutops/adapters/redis'
|
32
31
|
require 'kabutops/adapters/mongo'
|
33
|
-
require 'kabutops/adapters/sequel'
|
34
32
|
require 'kabutops/crawler_extensions/elastic_search'
|
35
33
|
require 'kabutops/crawler_extensions/redis'
|
36
34
|
require 'kabutops/crawler_extensions/mongo'
|
37
|
-
require 'kabutops/crawler_extensions/sequel'
|
38
35
|
require 'kabutops/crawler_extensions/pstore_storage'
|
39
36
|
require 'kabutops/crawler_extensions/debugging'
|
40
37
|
require 'kabutops/crawler'
|
41
38
|
require 'kabutops/watchdog'
|
39
|
+
require 'kabutops/spider'
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -9,14 +9,14 @@ module Kabutops
|
|
9
9
|
include CrawlerExtensions::ElasticSearch
|
10
10
|
include CrawlerExtensions::Redis
|
11
11
|
include CrawlerExtensions::Mongo
|
12
|
-
include CrawlerExtensions::Sequel
|
13
12
|
include Sidekiq::Worker
|
14
13
|
|
15
14
|
class << self
|
16
15
|
include Extensions::Parameterable
|
17
16
|
include Extensions::CallbackSupport
|
18
17
|
|
19
|
-
params :collection, :proxy, :cache, :wait,
|
18
|
+
params :collection, :proxy, :cache, :wait,
|
19
|
+
:skip_existing, :agent
|
20
20
|
callbacks :after_crawl
|
21
21
|
|
22
22
|
def adapters
|
@@ -62,29 +62,29 @@ module Kabutops
|
|
62
62
|
|
63
63
|
def perform resource
|
64
64
|
resource = Hashie::Mash.new(resource)
|
65
|
-
adapters = self.class.adapters
|
66
|
-
|
67
|
-
if self.class.params.skip_existing
|
68
|
-
adapters = self.class.adapters.select do |adapter|
|
69
|
-
if adapter.respond_to? :find
|
70
|
-
adapter.find(resource).nil?
|
71
|
-
else
|
72
|
-
true
|
73
|
-
end
|
74
|
-
end
|
75
65
|
|
76
|
-
|
66
|
+
adapters = self.class.adapters.select do |adapter|
|
67
|
+
if params.skip_existing && adapter.respond_to?(:find)
|
68
|
+
adapter.find(resource).nil?
|
69
|
+
else
|
70
|
+
true
|
71
|
+
end
|
77
72
|
end
|
78
73
|
|
74
|
+
return if adapters.nil?
|
75
|
+
|
79
76
|
page = crawl(resource)
|
80
77
|
|
81
78
|
adapters.each do |adapter|
|
82
79
|
adapter.process(resource, page)
|
83
80
|
end
|
84
81
|
rescue Exception => e
|
85
|
-
|
86
|
-
|
87
|
-
|
82
|
+
unless self.class.debug
|
83
|
+
logger.error(e.message)
|
84
|
+
logger.error(e.backtrace.join("\n"))
|
85
|
+
end
|
86
|
+
|
87
|
+
sleep params[:wait] || 0
|
88
88
|
raise e
|
89
89
|
end
|
90
90
|
|
@@ -94,10 +94,14 @@ module Kabutops
|
|
94
94
|
|
95
95
|
protected
|
96
96
|
|
97
|
+
def params
|
98
|
+
self.class.params
|
99
|
+
end
|
100
|
+
|
97
101
|
def crawl resource
|
98
102
|
cache_key = (resource[:id] || resource[:url]).to_s
|
99
|
-
content = Cachy.cache_if(
|
100
|
-
sleep
|
103
|
+
content = Cachy.cache_if(params.cache, cache_key) do
|
104
|
+
sleep params[:wait] || 0 # wait only if value is not from cache
|
101
105
|
agent.get(resource[:url]).body
|
102
106
|
end
|
103
107
|
|
@@ -108,8 +112,8 @@ module Kabutops
|
|
108
112
|
|
109
113
|
def agent
|
110
114
|
unless @agent
|
111
|
-
@agent = Mechanize.new
|
112
|
-
@agent.set_proxy(*
|
115
|
+
@agent = params[:agent] || Mechanize.new
|
116
|
+
@agent.set_proxy(*params[:proxy]) if params[:proxy]
|
113
117
|
end
|
114
118
|
|
115
119
|
@agent
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module Kabutops
|
4
|
+
|
5
|
+
class Spider < Crawler
|
6
|
+
class << self
|
7
|
+
params :url
|
8
|
+
callbacks :after_crawl, :follow_if
|
9
|
+
|
10
|
+
def debug_spider
|
11
|
+
enable_debug
|
12
|
+
self.new.perform({
|
13
|
+
url: params[:url]
|
14
|
+
})
|
15
|
+
end
|
16
|
+
|
17
|
+
def << resource
|
18
|
+
if resource_status(resource).nil?
|
19
|
+
resource_status(resource, 'new')
|
20
|
+
super
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def resource_status resource, status=nil
|
25
|
+
url_status(resource[:url], status)
|
26
|
+
end
|
27
|
+
|
28
|
+
def url_status url, status=nil
|
29
|
+
key = redis_key(url)
|
30
|
+
|
31
|
+
if status
|
32
|
+
redis.set(
|
33
|
+
key,
|
34
|
+
JSON.dump({
|
35
|
+
url: url,
|
36
|
+
status: status,
|
37
|
+
})
|
38
|
+
)
|
39
|
+
else
|
40
|
+
item = redis.get(key)
|
41
|
+
item ? JSON.parse(item)['status'] : nil
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
protected
|
46
|
+
|
47
|
+
def redis_key string
|
48
|
+
Digest::SHA256.hexdigest(string)
|
49
|
+
end
|
50
|
+
|
51
|
+
def redis
|
52
|
+
@redis ||= ::Redis::Namespace.new(
|
53
|
+
self.to_s,
|
54
|
+
redis: ::Redis.new(
|
55
|
+
host: Configuration[:redis][:host],
|
56
|
+
port: Configuration[:redis][:port],
|
57
|
+
db: Configuration[:redis][:db],
|
58
|
+
)
|
59
|
+
)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def crawl resource
|
64
|
+
page = super
|
65
|
+
after_crawl(resource, page)
|
66
|
+
self.class.resource_status(resource, 'done')
|
67
|
+
page
|
68
|
+
end
|
69
|
+
|
70
|
+
def after_crawl resource, page
|
71
|
+
page.css('a').each do |a|
|
72
|
+
follow = self.class.notify(:follow_if, a['href']).any?
|
73
|
+
if follow
|
74
|
+
self << {
|
75
|
+
url: a['href'],
|
76
|
+
}
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.10
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-07-
|
11
|
+
date: 2014-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -150,20 +150,6 @@ dependencies:
|
|
150
150
|
- - "~>"
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: '1.10'
|
153
|
-
- !ruby/object:Gem::Dependency
|
154
|
-
name: sequel
|
155
|
-
requirement: !ruby/object:Gem::Requirement
|
156
|
-
requirements:
|
157
|
-
- - "~>"
|
158
|
-
- !ruby/object:Gem::Version
|
159
|
-
version: '4.11'
|
160
|
-
type: :runtime
|
161
|
-
prerelease: false
|
162
|
-
version_requirements: !ruby/object:Gem::Requirement
|
163
|
-
requirements:
|
164
|
-
- - "~>"
|
165
|
-
- !ruby/object:Gem::Version
|
166
|
-
version: '4.11'
|
167
153
|
- !ruby/object:Gem::Dependency
|
168
154
|
name: bson_ext
|
169
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -235,7 +221,6 @@ files:
|
|
235
221
|
- lib/kabutops/adapters/elastic_search.rb
|
236
222
|
- lib/kabutops/adapters/mongo.rb
|
237
223
|
- lib/kabutops/adapters/redis.rb
|
238
|
-
- lib/kabutops/adapters/sequel.rb
|
239
224
|
- lib/kabutops/configuration.rb
|
240
225
|
- lib/kabutops/crawler.rb
|
241
226
|
- lib/kabutops/crawler_extensions/debugging.rb
|
@@ -243,13 +228,13 @@ files:
|
|
243
228
|
- lib/kabutops/crawler_extensions/mongo.rb
|
244
229
|
- lib/kabutops/crawler_extensions/pstore_storage.rb
|
245
230
|
- lib/kabutops/crawler_extensions/redis.rb
|
246
|
-
- lib/kabutops/crawler_extensions/sequel.rb
|
247
231
|
- lib/kabutops/extensions/callback_support.rb
|
248
232
|
- lib/kabutops/extensions/includable.rb
|
249
233
|
- lib/kabutops/extensions/logging.rb
|
250
234
|
- lib/kabutops/extensions/parameterable.rb
|
251
235
|
- lib/kabutops/recipe.rb
|
252
236
|
- lib/kabutops/recipe_item.rb
|
237
|
+
- lib/kabutops/spider.rb
|
253
238
|
- lib/kabutops/version.rb
|
254
239
|
- lib/kabutops/watchdog.rb
|
255
240
|
homepage: https://github.com/reneklacan/kabutops
|
@@ -1,47 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
module Adapters
|
6
|
-
|
7
|
-
class Sequel < DatabaseAdapter
|
8
|
-
include Extensions::Parameterable
|
9
|
-
|
10
|
-
params :connect, :type, :host, :port,
|
11
|
-
:db, :user, :password, :table
|
12
|
-
|
13
|
-
def store result
|
14
|
-
client_table.insert(result)
|
15
|
-
end
|
16
|
-
|
17
|
-
def nested?
|
18
|
-
false
|
19
|
-
end
|
20
|
-
|
21
|
-
protected
|
22
|
-
|
23
|
-
def client
|
24
|
-
return @@client if defined?(@@client)
|
25
|
-
|
26
|
-
if params[:connect]
|
27
|
-
@@client ||= ::Sequel.connect(params[:connect])
|
28
|
-
else
|
29
|
-
@@client ||= ::Sequel.connect(
|
30
|
-
adapter: params[:type] || 'mysql2',
|
31
|
-
user: params[:user] || 'root',
|
32
|
-
password: params[:password] || 'root',
|
33
|
-
host: params[:host] || 'localhost',
|
34
|
-
port: params[:port] || 3306,
|
35
|
-
database: params[:db] || params[:database] || 'kabutops',
|
36
|
-
)
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
def client_table
|
41
|
-
@@client_table ||= client[params[:table]]
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
module CrawlerExtensions
|
6
|
-
|
7
|
-
module Sequel
|
8
|
-
extend Extensions::Includable
|
9
|
-
|
10
|
-
module ClassMethods
|
11
|
-
def sequel &block
|
12
|
-
adapters << Adapters::Sequel.new(&block)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
end
|