kabutops 0.0.10 → 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 41d09fa8638097298ed1c230a255086949651188
4
- data.tar.gz: 3b9c0f19b9272332c940ac649b184c84e02740b2
3
+ metadata.gz: 491dade458c54b02dbcdd36da92010fcaa1a5463
4
+ data.tar.gz: 9d00eb9b6ad5e61ddb27dfa7137f0734e9eda03f
5
5
  SHA512:
6
- metadata.gz: fbdb727ea93bf2048de9a0773a202d7dedcafab67f2f12e378bed53557d9b1b1855a0af45dbf877f68d69d618fc91fc12c9143e3ae8a339aa0a30636b64e1d73
7
- data.tar.gz: 26ad39464ae24c5082d94b043ce4162e2de70aa025a315d3ba3d2d936325dfa200429dc5e4763f4f43b1659dbac9503d7e1c5a9f9b6e9e4cb1f4709aba1226d9
6
+ metadata.gz: 9d00a7d31dfa94e85c9f2d31c49d56b3df2da5fd1b0580c769724f64bad12b331bc8c87a3432071944bc427543458ffe0257743ba5f6dd1840e7e0a8674e7e88
7
+ data.tar.gz: aa6a67c2dcbaf8e14cf6d4a38e94b78979ef9f594ad492e5618bd3d1db36fbca305895e1c756cae2f355d21045fd82489ee315eb7150a2cac7f17d6fbc744fdb
data/README.md CHANGED
@@ -8,7 +8,6 @@ With Kabutops you can save data easily to:
8
8
  * ElasticSearch
9
9
  * MongoDB
10
10
  * Redis
11
- * SQL Databases (via Sequel)
12
11
 
13
12
  Example for every kind of database are located
14
13
  in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
@@ -25,9 +24,11 @@ gem install kabutops
25
24
  Or you can put it in your Gemfile
26
25
 
27
26
  ```ruby
28
- gem 'kabutops', '~> 0.0.10'
27
+ gem 'kabutops', '~> 0.0.11'
29
28
  ```
30
29
 
30
+ You will also need a Redis database installed and running.
31
+
31
32
  Basic example
32
33
  -------------
33
34
 
@@ -199,12 +200,28 @@ class MyCrawler < Kabutops::Crawler
199
200
  end
200
201
  ```
201
202
 
203
+ Javascript heavy site
204
+ ---------------------
205
+
206
+ Crawling this kind of site can be achieved by using a non-default agent
207
+ (the default is Mechanize.new).
208
+
209
+ ```ruby
210
+ class MyCrawler < Kabutops::Crawler
211
+ ...
212
+ agent Bogeyman::Client.new
213
+ ...
214
+ end
215
+ ```
216
+
217
+ [Bogeyman](https://github.com/reneklacan/bogeyman-ruby-client)
218
+ is a wrapper built upon PhantomJS.
219
+
202
220
  TODO
203
221
  ----
204
222
 
205
- * Watchdog for Mongo and Sequel
206
- * skip_existing for Mongo, Redis and Sequel
207
- * Spider
223
+ * Watchdog for Mongo
224
+ * skip_existing feature for Mongo and Redis
208
225
 
209
226
  License
210
227
  -------
@@ -11,7 +11,6 @@ require 'elasticsearch'
11
11
  require 'redis'
12
12
  require 'redis-namespace'
13
13
  require 'mongo'
14
- require 'sequel'
15
14
  require 'mysql2'
16
15
  require 'logger'
17
16
 
@@ -30,12 +29,11 @@ require 'kabutops/adapters/database_adapter'
30
29
  require 'kabutops/adapters/elastic_search'
31
30
  require 'kabutops/adapters/redis'
32
31
  require 'kabutops/adapters/mongo'
33
- require 'kabutops/adapters/sequel'
34
32
  require 'kabutops/crawler_extensions/elastic_search'
35
33
  require 'kabutops/crawler_extensions/redis'
36
34
  require 'kabutops/crawler_extensions/mongo'
37
- require 'kabutops/crawler_extensions/sequel'
38
35
  require 'kabutops/crawler_extensions/pstore_storage'
39
36
  require 'kabutops/crawler_extensions/debugging'
40
37
  require 'kabutops/crawler'
41
38
  require 'kabutops/watchdog'
39
+ require 'kabutops/spider'
@@ -24,6 +24,11 @@ module Kabutops
24
24
  dev: STDOUT,
25
25
  level: Logger::DEBUG
26
26
  },
27
+ redis: {
28
+ host: 'localhost',
29
+ port: 6379,
30
+ db: 0
31
+ },
27
32
  )
28
33
  end
29
34
  end
@@ -9,14 +9,14 @@ module Kabutops
9
9
  include CrawlerExtensions::ElasticSearch
10
10
  include CrawlerExtensions::Redis
11
11
  include CrawlerExtensions::Mongo
12
- include CrawlerExtensions::Sequel
13
12
  include Sidekiq::Worker
14
13
 
15
14
  class << self
16
15
  include Extensions::Parameterable
17
16
  include Extensions::CallbackSupport
18
17
 
19
- params :collection, :proxy, :cache, :wait, :skip_existing
18
+ params :collection, :proxy, :cache, :wait,
19
+ :skip_existing, :agent
20
20
  callbacks :after_crawl
21
21
 
22
22
  def adapters
@@ -62,29 +62,29 @@ module Kabutops
62
62
 
63
63
  def perform resource
64
64
  resource = Hashie::Mash.new(resource)
65
- adapters = self.class.adapters
66
-
67
- if self.class.params.skip_existing
68
- adapters = self.class.adapters.select do |adapter|
69
- if adapter.respond_to? :find
70
- adapter.find(resource).nil?
71
- else
72
- true
73
- end
74
- end
75
65
 
76
- return if adapters.nil?
66
+ adapters = self.class.adapters.select do |adapter|
67
+ if params.skip_existing && adapter.respond_to?(:find)
68
+ adapter.find(resource).nil?
69
+ else
70
+ true
71
+ end
77
72
  end
78
73
 
74
+ return if adapters.nil?
75
+
79
76
  page = crawl(resource)
80
77
 
81
78
  adapters.each do |adapter|
82
79
  adapter.process(resource, page)
83
80
  end
84
81
  rescue Exception => e
85
- logger.error(e.message)
86
- logger.error(e.backtrace.join("\n"))
87
- sleep self.class.params[:wait] || 0
82
+ unless self.class.debug
83
+ logger.error(e.message)
84
+ logger.error(e.backtrace.join("\n"))
85
+ end
86
+
87
+ sleep params[:wait] || 0
88
88
  raise e
89
89
  end
90
90
 
@@ -94,10 +94,14 @@ module Kabutops
94
94
 
95
95
  protected
96
96
 
97
+ def params
98
+ self.class.params
99
+ end
100
+
97
101
  def crawl resource
98
102
  cache_key = (resource[:id] || resource[:url]).to_s
99
- content = Cachy.cache_if(self.class.params.cache, cache_key) do
100
- sleep self.class.params[:wait] || 0 # wait only if value is not from cache
103
+ content = Cachy.cache_if(params.cache, cache_key) do
104
+ sleep params[:wait] || 0 # wait only if value is not from cache
101
105
  agent.get(resource[:url]).body
102
106
  end
103
107
 
@@ -108,8 +112,8 @@ module Kabutops
108
112
 
109
113
  def agent
110
114
  unless @agent
111
- @agent = Mechanize.new
112
- @agent.set_proxy(*self.class.params[:proxy]) if self.class.params[:proxy]
115
+ @agent = params[:agent] || Mechanize.new
116
+ @agent.set_proxy(*params[:proxy]) if params[:proxy]
113
117
  end
114
118
 
115
119
  @agent
@@ -0,0 +1,82 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module Kabutops
4
+
5
+ class Spider < Crawler
6
+ class << self
7
+ params :url
8
+ callbacks :after_crawl, :follow_if
9
+
10
+ def debug_spider
11
+ enable_debug
12
+ self.new.perform({
13
+ url: params[:url]
14
+ })
15
+ end
16
+
17
+ def << resource
18
+ if resource_status(resource).nil?
19
+ resource_status(resource, 'new')
20
+ super
21
+ end
22
+ end
23
+
24
+ def resource_status resource, status=nil
25
+ url_status(resource[:url], status)
26
+ end
27
+
28
+ def url_status url, status=nil
29
+ key = redis_key(url)
30
+
31
+ if status
32
+ redis.set(
33
+ key,
34
+ JSON.dump({
35
+ url: url,
36
+ status: status,
37
+ })
38
+ )
39
+ else
40
+ item = redis.get(key)
41
+ item ? JSON.parse(item)['status'] : nil
42
+ end
43
+ end
44
+
45
+ protected
46
+
47
+ def redis_key string
48
+ Digest::SHA256.hexdigest(string)
49
+ end
50
+
51
+ def redis
52
+ @redis ||= ::Redis::Namespace.new(
53
+ self.to_s,
54
+ redis: ::Redis.new(
55
+ host: Configuration[:redis][:host],
56
+ port: Configuration[:redis][:port],
57
+ db: Configuration[:redis][:db],
58
+ )
59
+ )
60
+ end
61
+ end
62
+
63
+ def crawl resource
64
+ page = super
65
+ after_crawl(resource, page)
66
+ self.class.resource_status(resource, 'done')
67
+ page
68
+ end
69
+
70
+ def after_crawl resource, page
71
+ page.css('a').each do |a|
72
+ follow = self.class.notify(:follow_if, a['href']).any?
73
+ if follow
74
+ self << {
75
+ url: a['href'],
76
+ }
77
+ end
78
+ end
79
+ end
80
+ end
81
+
82
+ end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.0.10'
4
+ VERSION = '0.0.11'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-07 00:00:00.000000000 Z
11
+ date: 2014-07-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -150,20 +150,6 @@ dependencies:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
152
  version: '1.10'
153
- - !ruby/object:Gem::Dependency
154
- name: sequel
155
- requirement: !ruby/object:Gem::Requirement
156
- requirements:
157
- - - "~>"
158
- - !ruby/object:Gem::Version
159
- version: '4.11'
160
- type: :runtime
161
- prerelease: false
162
- version_requirements: !ruby/object:Gem::Requirement
163
- requirements:
164
- - - "~>"
165
- - !ruby/object:Gem::Version
166
- version: '4.11'
167
153
  - !ruby/object:Gem::Dependency
168
154
  name: bson_ext
169
155
  requirement: !ruby/object:Gem::Requirement
@@ -235,7 +221,6 @@ files:
235
221
  - lib/kabutops/adapters/elastic_search.rb
236
222
  - lib/kabutops/adapters/mongo.rb
237
223
  - lib/kabutops/adapters/redis.rb
238
- - lib/kabutops/adapters/sequel.rb
239
224
  - lib/kabutops/configuration.rb
240
225
  - lib/kabutops/crawler.rb
241
226
  - lib/kabutops/crawler_extensions/debugging.rb
@@ -243,13 +228,13 @@ files:
243
228
  - lib/kabutops/crawler_extensions/mongo.rb
244
229
  - lib/kabutops/crawler_extensions/pstore_storage.rb
245
230
  - lib/kabutops/crawler_extensions/redis.rb
246
- - lib/kabutops/crawler_extensions/sequel.rb
247
231
  - lib/kabutops/extensions/callback_support.rb
248
232
  - lib/kabutops/extensions/includable.rb
249
233
  - lib/kabutops/extensions/logging.rb
250
234
  - lib/kabutops/extensions/parameterable.rb
251
235
  - lib/kabutops/recipe.rb
252
236
  - lib/kabutops/recipe_item.rb
237
+ - lib/kabutops/spider.rb
253
238
  - lib/kabutops/version.rb
254
239
  - lib/kabutops/watchdog.rb
255
240
  homepage: https://github.com/reneklacan/kabutops
@@ -1,47 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module Kabutops
4
-
5
- module Adapters
6
-
7
- class Sequel < DatabaseAdapter
8
- include Extensions::Parameterable
9
-
10
- params :connect, :type, :host, :port,
11
- :db, :user, :password, :table
12
-
13
- def store result
14
- client_table.insert(result)
15
- end
16
-
17
- def nested?
18
- false
19
- end
20
-
21
- protected
22
-
23
- def client
24
- return @@client if defined?(@@client)
25
-
26
- if params[:connect]
27
- @@client ||= ::Sequel.connect(params[:connect])
28
- else
29
- @@client ||= ::Sequel.connect(
30
- adapter: params[:type] || 'mysql2',
31
- user: params[:user] || 'root',
32
- password: params[:password] || 'root',
33
- host: params[:host] || 'localhost',
34
- port: params[:port] || 3306,
35
- database: params[:db] || params[:database] || 'kabutops',
36
- )
37
- end
38
- end
39
-
40
- def client_table
41
- @@client_table ||= client[params[:table]]
42
- end
43
- end
44
-
45
- end
46
-
47
- end
@@ -1,19 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module Kabutops
4
-
5
- module CrawlerExtensions
6
-
7
- module Sequel
8
- extend Extensions::Includable
9
-
10
- module ClassMethods
11
- def sequel &block
12
- adapters << Adapters::Sequel.new(&block)
13
- end
14
- end
15
- end
16
-
17
- end
18
-
19
- end