kabutops 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -5
- data/lib/kabutops.rb +1 -3
- data/lib/kabutops/configuration.rb +5 -0
- data/lib/kabutops/crawler.rb +24 -20
- data/lib/kabutops/spider.rb +82 -0
- data/lib/kabutops/version.rb +1 -1
- metadata +3 -18
- data/lib/kabutops/adapters/sequel.rb +0 -47
- data/lib/kabutops/crawler_extensions/sequel.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 491dade458c54b02dbcdd36da92010fcaa1a5463
|
4
|
+
data.tar.gz: 9d00eb9b6ad5e61ddb27dfa7137f0734e9eda03f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9d00a7d31dfa94e85c9f2d31c49d56b3df2da5fd1b0580c769724f64bad12b331bc8c87a3432071944bc427543458ffe0257743ba5f6dd1840e7e0a8674e7e88
|
7
|
+
data.tar.gz: aa6a67c2dcbaf8e14cf6d4a38e94b78979ef9f594ad492e5618bd3d1db36fbca305895e1c756cae2f355d21045fd82489ee315eb7150a2cac7f17d6fbc744fdb
|
data/README.md
CHANGED
@@ -8,7 +8,6 @@ With Kabutops you can save data easily to:
|
|
8
8
|
* ElasticSearch
|
9
9
|
* MongoDB
|
10
10
|
* Redis
|
11
|
-
* SQL Databases (via Sequel)
|
12
11
|
|
13
12
|
Examples for every kind of database are located
|
14
13
|
in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
|
@@ -25,9 +24,11 @@ gem install kabutops
|
|
25
24
|
Or you can put it in your Gemfile
|
26
25
|
|
27
26
|
```ruby
|
28
|
-
gem 'kabutops', '~> 0.0.10'
|
27
|
+
gem 'kabutops', '~> 0.0.11'
|
29
28
|
```
|
30
29
|
|
30
|
+
You will also need a Redis database installed and running.
|
31
|
+
|
31
32
|
Basic example
|
32
33
|
-------------
|
33
34
|
|
@@ -199,12 +200,28 @@ class MyCrawler < Kabutops::Crawler
|
|
199
200
|
end
|
200
201
|
```
|
201
202
|
|
203
|
+
Javascript heavy site
|
204
|
+
---------------------
|
205
|
+
|
206
|
+
Crawling this kind of site can be achieved by using a non-default agent
|
207
|
+
(default is Mechanize.new).
|
208
|
+
|
209
|
+
```ruby
|
210
|
+
class MyCrawler < Kabutops::Crawler
|
211
|
+
...
|
212
|
+
agent Bogeyman::Client.new
|
213
|
+
...
|
214
|
+
end
|
215
|
+
```
|
216
|
+
|
217
|
+
[Bogeyman](https://github.com/reneklacan/bogeyman-ruby-client)
|
218
|
+
is a wrapper built upon PhantomJS.
|
219
|
+
|
202
220
|
TODO
|
203
221
|
----
|
204
222
|
|
205
|
-
* Watchdog for Mongo
|
206
|
-
* skip_existing for Mongo, Redis
|
207
|
-
* Spider
|
223
|
+
* Watchdog for Mongo
|
224
|
+
* skip_existing feature for Mongo, Redis
|
208
225
|
|
209
226
|
License
|
210
227
|
-------
|
data/lib/kabutops.rb
CHANGED
@@ -11,7 +11,6 @@ require 'elasticsearch'
|
|
11
11
|
require 'redis'
|
12
12
|
require 'redis-namespace'
|
13
13
|
require 'mongo'
|
14
|
-
require 'sequel'
|
15
14
|
require 'mysql2'
|
16
15
|
require 'logger'
|
17
16
|
|
@@ -30,12 +29,11 @@ require 'kabutops/adapters/database_adapter'
|
|
30
29
|
require 'kabutops/adapters/elastic_search'
|
31
30
|
require 'kabutops/adapters/redis'
|
32
31
|
require 'kabutops/adapters/mongo'
|
33
|
-
require 'kabutops/adapters/sequel'
|
34
32
|
require 'kabutops/crawler_extensions/elastic_search'
|
35
33
|
require 'kabutops/crawler_extensions/redis'
|
36
34
|
require 'kabutops/crawler_extensions/mongo'
|
37
|
-
require 'kabutops/crawler_extensions/sequel'
|
38
35
|
require 'kabutops/crawler_extensions/pstore_storage'
|
39
36
|
require 'kabutops/crawler_extensions/debugging'
|
40
37
|
require 'kabutops/crawler'
|
41
38
|
require 'kabutops/watchdog'
|
39
|
+
require 'kabutops/spider'
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -9,14 +9,14 @@ module Kabutops
|
|
9
9
|
include CrawlerExtensions::ElasticSearch
|
10
10
|
include CrawlerExtensions::Redis
|
11
11
|
include CrawlerExtensions::Mongo
|
12
|
-
include CrawlerExtensions::Sequel
|
13
12
|
include Sidekiq::Worker
|
14
13
|
|
15
14
|
class << self
|
16
15
|
include Extensions::Parameterable
|
17
16
|
include Extensions::CallbackSupport
|
18
17
|
|
19
|
-
params :collection, :proxy, :cache, :wait,
|
18
|
+
params :collection, :proxy, :cache, :wait,
|
19
|
+
:skip_existing, :agent
|
20
20
|
callbacks :after_crawl
|
21
21
|
|
22
22
|
def adapters
|
@@ -62,29 +62,29 @@ module Kabutops
|
|
62
62
|
|
63
63
|
def perform resource
|
64
64
|
resource = Hashie::Mash.new(resource)
|
65
|
-
adapters = self.class.adapters
|
66
|
-
|
67
|
-
if self.class.params.skip_existing
|
68
|
-
adapters = self.class.adapters.select do |adapter|
|
69
|
-
if adapter.respond_to? :find
|
70
|
-
adapter.find(resource).nil?
|
71
|
-
else
|
72
|
-
true
|
73
|
-
end
|
74
|
-
end
|
75
65
|
|
76
|
-
|
66
|
+
adapters = self.class.adapters.select do |adapter|
|
67
|
+
if params.skip_existing && adapter.respond_to?(:find)
|
68
|
+
adapter.find(resource).nil?
|
69
|
+
else
|
70
|
+
true
|
71
|
+
end
|
77
72
|
end
|
78
73
|
|
74
|
+
return if adapters.nil?
|
75
|
+
|
79
76
|
page = crawl(resource)
|
80
77
|
|
81
78
|
adapters.each do |adapter|
|
82
79
|
adapter.process(resource, page)
|
83
80
|
end
|
84
81
|
rescue Exception => e
|
85
|
-
|
86
|
-
|
87
|
-
|
82
|
+
unless self.class.debug
|
83
|
+
logger.error(e.message)
|
84
|
+
logger.error(e.backtrace.join("\n"))
|
85
|
+
end
|
86
|
+
|
87
|
+
sleep params[:wait] || 0
|
88
88
|
raise e
|
89
89
|
end
|
90
90
|
|
@@ -94,10 +94,14 @@ module Kabutops
|
|
94
94
|
|
95
95
|
protected
|
96
96
|
|
97
|
+
def params
|
98
|
+
self.class.params
|
99
|
+
end
|
100
|
+
|
97
101
|
def crawl resource
|
98
102
|
cache_key = (resource[:id] || resource[:url]).to_s
|
99
|
-
content = Cachy.cache_if(
|
100
|
-
sleep
|
103
|
+
content = Cachy.cache_if(params.cache, cache_key) do
|
104
|
+
sleep params[:wait] || 0 # wait only if value is not from cache
|
101
105
|
agent.get(resource[:url]).body
|
102
106
|
end
|
103
107
|
|
@@ -108,8 +112,8 @@ module Kabutops
|
|
108
112
|
|
109
113
|
def agent
|
110
114
|
unless @agent
|
111
|
-
@agent = Mechanize.new
|
112
|
-
@agent.set_proxy(*
|
115
|
+
@agent = params[:agent] || Mechanize.new
|
116
|
+
@agent.set_proxy(*params[:proxy]) if params[:proxy]
|
113
117
|
end
|
114
118
|
|
115
119
|
@agent
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module Kabutops
|
4
|
+
|
5
|
+
class Spider < Crawler
|
6
|
+
class << self
|
7
|
+
params :url
|
8
|
+
callbacks :after_crawl, :follow_if
|
9
|
+
|
10
|
+
def debug_spider
|
11
|
+
enable_debug
|
12
|
+
self.new.perform({
|
13
|
+
url: params[:url]
|
14
|
+
})
|
15
|
+
end
|
16
|
+
|
17
|
+
def << resource
|
18
|
+
if resource_status(resource).nil?
|
19
|
+
resource_status(resource, 'new')
|
20
|
+
super
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def resource_status resource, status=nil
|
25
|
+
url_status(resource[:url], status)
|
26
|
+
end
|
27
|
+
|
28
|
+
def url_status url, status=nil
|
29
|
+
key = redis_key(url)
|
30
|
+
|
31
|
+
if status
|
32
|
+
redis.set(
|
33
|
+
key,
|
34
|
+
JSON.dump({
|
35
|
+
url: url,
|
36
|
+
status: status,
|
37
|
+
})
|
38
|
+
)
|
39
|
+
else
|
40
|
+
item = redis.get(key)
|
41
|
+
item ? JSON.parse(item)['status'] : nil
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
protected
|
46
|
+
|
47
|
+
def redis_key string
|
48
|
+
Digest::SHA256.hexdigest(string)
|
49
|
+
end
|
50
|
+
|
51
|
+
def redis
|
52
|
+
@redis ||= ::Redis::Namespace.new(
|
53
|
+
self.to_s,
|
54
|
+
redis: ::Redis.new(
|
55
|
+
host: Configuration[:redis][:host],
|
56
|
+
port: Configuration[:redis][:port],
|
57
|
+
db: Configuration[:redis][:db],
|
58
|
+
)
|
59
|
+
)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def crawl resource
|
64
|
+
page = super
|
65
|
+
after_crawl(resource, page)
|
66
|
+
self.class.resource_status(resource, 'done')
|
67
|
+
page
|
68
|
+
end
|
69
|
+
|
70
|
+
def after_crawl resource, page
|
71
|
+
page.css('a').each do |a|
|
72
|
+
follow = self.class.notify(:follow_if, a['href']).any?
|
73
|
+
if follow
|
74
|
+
self << {
|
75
|
+
url: a['href'],
|
76
|
+
}
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.10
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-07-
|
11
|
+
date: 2014-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -150,20 +150,6 @@ dependencies:
|
|
150
150
|
- - "~>"
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: '1.10'
|
153
|
-
- !ruby/object:Gem::Dependency
|
154
|
-
name: sequel
|
155
|
-
requirement: !ruby/object:Gem::Requirement
|
156
|
-
requirements:
|
157
|
-
- - "~>"
|
158
|
-
- !ruby/object:Gem::Version
|
159
|
-
version: '4.11'
|
160
|
-
type: :runtime
|
161
|
-
prerelease: false
|
162
|
-
version_requirements: !ruby/object:Gem::Requirement
|
163
|
-
requirements:
|
164
|
-
- - "~>"
|
165
|
-
- !ruby/object:Gem::Version
|
166
|
-
version: '4.11'
|
167
153
|
- !ruby/object:Gem::Dependency
|
168
154
|
name: bson_ext
|
169
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -235,7 +221,6 @@ files:
|
|
235
221
|
- lib/kabutops/adapters/elastic_search.rb
|
236
222
|
- lib/kabutops/adapters/mongo.rb
|
237
223
|
- lib/kabutops/adapters/redis.rb
|
238
|
-
- lib/kabutops/adapters/sequel.rb
|
239
224
|
- lib/kabutops/configuration.rb
|
240
225
|
- lib/kabutops/crawler.rb
|
241
226
|
- lib/kabutops/crawler_extensions/debugging.rb
|
@@ -243,13 +228,13 @@ files:
|
|
243
228
|
- lib/kabutops/crawler_extensions/mongo.rb
|
244
229
|
- lib/kabutops/crawler_extensions/pstore_storage.rb
|
245
230
|
- lib/kabutops/crawler_extensions/redis.rb
|
246
|
-
- lib/kabutops/crawler_extensions/sequel.rb
|
247
231
|
- lib/kabutops/extensions/callback_support.rb
|
248
232
|
- lib/kabutops/extensions/includable.rb
|
249
233
|
- lib/kabutops/extensions/logging.rb
|
250
234
|
- lib/kabutops/extensions/parameterable.rb
|
251
235
|
- lib/kabutops/recipe.rb
|
252
236
|
- lib/kabutops/recipe_item.rb
|
237
|
+
- lib/kabutops/spider.rb
|
253
238
|
- lib/kabutops/version.rb
|
254
239
|
- lib/kabutops/watchdog.rb
|
255
240
|
homepage: https://github.com/reneklacan/kabutops
|
@@ -1,47 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
module Adapters
|
6
|
-
|
7
|
-
class Sequel < DatabaseAdapter
|
8
|
-
include Extensions::Parameterable
|
9
|
-
|
10
|
-
params :connect, :type, :host, :port,
|
11
|
-
:db, :user, :password, :table
|
12
|
-
|
13
|
-
def store result
|
14
|
-
client_table.insert(result)
|
15
|
-
end
|
16
|
-
|
17
|
-
def nested?
|
18
|
-
false
|
19
|
-
end
|
20
|
-
|
21
|
-
protected
|
22
|
-
|
23
|
-
def client
|
24
|
-
return @@client if defined?(@@client)
|
25
|
-
|
26
|
-
if params[:connect]
|
27
|
-
@@client ||= ::Sequel.connect(params[:connect])
|
28
|
-
else
|
29
|
-
@@client ||= ::Sequel.connect(
|
30
|
-
adapter: params[:type] || 'mysql2',
|
31
|
-
user: params[:user] || 'root',
|
32
|
-
password: params[:password] || 'root',
|
33
|
-
host: params[:host] || 'localhost',
|
34
|
-
port: params[:port] || 3306,
|
35
|
-
database: params[:db] || params[:database] || 'kabutops',
|
36
|
-
)
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
def client_table
|
41
|
-
@@client_table ||= client[params[:table]]
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
module CrawlerExtensions
|
6
|
-
|
7
|
-
module Sequel
|
8
|
-
extend Extensions::Includable
|
9
|
-
|
10
|
-
module ClassMethods
|
11
|
-
def sequel &block
|
12
|
-
adapters << Adapters::Sequel.new(&block)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
end
|