kabutops 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1312dd77260f2a2f113ba6e757e9da4491caf282
4
- data.tar.gz: 031d23cf3833bda39fdabe6759d22acf377ac7c3
3
+ metadata.gz: 23b94bb24325c0b57ed6fdbc05a94d87484e319f
4
+ data.tar.gz: 7175b6f9e4c0dfb1c3fad129a2574d4234e31ce2
5
5
  SHA512:
6
- metadata.gz: 313a091b905a439d0b194f1b468fdccc188a99ff43b40b470bf135e58880c787bf723b70930285f05dac043d5c2a37247ec4a2f494ed098eb11b072fa24ce4c6
7
- data.tar.gz: f8a3e80823d52b7e234625327d4522fd700e1296d52c699da072f31a0943c3ccbfa0f8bdaefbf40460e2bd434f7ea307ce891d4213660a5f18067a2899cf0d33
6
+ metadata.gz: 13e0e68d4f333526c7a9cfa23f5555df1dfa8defac859dc40317d13e5bd5bbbb981a7041ea3f0fb0845ff27ed908be23481f2e20621332c6186aaab384ef3fc1
7
+ data.tar.gz: b4081eaee098485fd3a92bb37e7d4b737359ec6cbdfcd3bb9131fecfee671134b3c149dee7dbf4afde34dfc3ab685636254762f33bc0b6e3d653364c7d852994
data/README.md CHANGED
@@ -1,6 +1,18 @@
1
1
  Kabutops [![Code Climate](https://codeclimate.com/github/reneklacan/kabutops.png)](https://codeclimate.com/github/reneklacan/kabutops) [![Coverage](https://codeclimate.com/github/reneklacan/kabutops/coverage.png)](https://codeclimate.com/github/reneklacan/kabutops)
2
2
  ========
3
3
 
4
+ Kabutops is a Ruby library that aims to simplify creating website crawlers.
5
+ You can define what will be crawled and how it will be saved in a short class definition.
6
+
7
+ With Kabutops you can save data easily to:
8
+ * ElasticSearch
9
+ * MongoDB
10
+ * Redis
11
+ * SQL Databases (via Sequel)
12
+
13
+ Examples for every kind of database are located
14
+ in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
15
+
4
16
  Installation
5
17
  ------------
6
18
 
@@ -13,7 +25,7 @@ gem install kabutops
13
25
  Or you can put it in your Gemfile
14
26
 
15
27
  ```ruby
16
- gem 'kabutops', '~> 0.0.7'
28
+ gem 'kabutops', '~> 0.0.8'
17
29
  ```
18
30
 
19
31
  Basic example
@@ -136,3 +148,26 @@ FruitCrawler.debug_resource { id: '123', url: '...' }
136
148
 
137
149
  These methods will print out what would be otherwise saved to the
138
150
  database, but this time nothing is actually saved to the database.
151
+
152
+ Anonymity ala Tor
153
+ -----------------
154
+
155
+ Anonymity can be easily achieved with the [Peasant](https://github.com/reneklacan/peasant) gem.
156
+ By following [this guide](https://github.com/reneklacan/peasant/wiki/How-to-use-Peasant-with-Tor-and-Privoxy-for-scraping)
157
+ you can create a proxy instance that will forward requests to
158
+ multiple Tor instances.
159
+
160
+ Then use the Peasant proxy address in your Crawler class definition:
161
+
162
+ ```ruby
163
+ class MyCrawler < Kabutops::Crawler
164
+ ...
165
+ proxy 'localhost', 81818
166
+ ...
167
+ end
168
+ ```
169
+
170
+ License
171
+ -------
172
+
173
+ This library is distributed under the Beerware license.
data/lib/kabutops.rb CHANGED
@@ -10,9 +10,14 @@ require 'mechanize'
10
10
  require 'elasticsearch'
11
11
  require 'redis'
12
12
  require 'redis-namespace'
13
+ require 'mongo'
14
+ require 'sequel'
15
+ require 'mysql2'
13
16
 
14
17
  Cachy.cache_store = Moneta.new(:File, dir: 'cache') # temporary
15
18
 
19
+ require 'kabutops/extensions/includable'
20
+ require 'kabutops/extensions/logging'
16
21
  require 'kabutops/extensions/parameterable'
17
22
  require 'kabutops/extensions/callback_support'
18
23
  require 'kabutops/recipe'
@@ -21,8 +26,12 @@ require 'kabutops/adapters/base'
21
26
  require 'kabutops/adapters/database_adapter'
22
27
  require 'kabutops/adapters/elastic_search'
23
28
  require 'kabutops/adapters/redis'
29
+ require 'kabutops/adapters/mongo'
30
+ require 'kabutops/adapters/sequel'
24
31
  require 'kabutops/crawler_extensions/elastic_search'
25
32
  require 'kabutops/crawler_extensions/redis'
33
+ require 'kabutops/crawler_extensions/mongo'
34
+ require 'kabutops/crawler_extensions/sequel'
26
35
  require 'kabutops/crawler_extensions/pstore_storage'
27
36
  require 'kabutops/crawler_extensions/debugging'
28
37
  require 'kabutops/crawler'
@@ -5,6 +5,10 @@ module Kabutops
5
5
  module Adapters
6
6
 
7
7
  class Base
8
+ def initialize &block
9
+ instance_eval &block if block_given?
10
+ end
11
+
8
12
  def enable_debug
9
13
  @debug = true
10
14
  end
@@ -5,6 +5,7 @@ module Kabutops
5
5
  module Adapters
6
6
 
7
7
  class DatabaseAdapter < Base
8
+ include Extensions::Logging
8
9
  include Extensions::CallbackSupport
9
10
 
10
11
  attr_reader :recipe
@@ -17,10 +18,12 @@ module Kabutops
17
18
  end
18
19
 
19
20
  def process resource, page
21
+ raise 'data block not defined' unless @recipe
22
+
20
23
  result = @recipe.process(resource, page)
21
24
  if debug
22
- puts "#{self.class.to_s} outputs:"
23
- p result.to_hash
25
+ logger.info("#{self.class.to_s} outputs:")
26
+ logger.info(result.to_hash)
24
27
  else
25
28
  store(result)
26
29
  notify(:after_save, result)
@@ -0,0 +1,53 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module Kabutops
4
+
5
+ module Adapters
6
+
7
+ class Mongo < DatabaseAdapter
8
+ include Extensions::Parameterable
9
+
10
+ params :host, :port, :db, :collection, :user, :password
11
+
12
+ def store result
13
+ existing = collection.find('id' => result[:id])
14
+
15
+ if existing.count > 0
16
+ existing.each do |document|
17
+ collection.update({'_id' => document['_id']}, result.to_hash)
18
+ end
19
+ else
20
+ collection.insert(result.to_hash)
21
+ end
22
+ end
23
+
24
+ def nested?
25
+ true
26
+ end
27
+
28
+ protected
29
+
30
+ def client
31
+ @@client ||= ::Mongo::MongoClient.new(
32
+ params[:host] || 'localhost',
33
+ params[:port] || 27017,
34
+ )
35
+ end
36
+
37
+ def client_db
38
+ @@client_db ||= client.db(params[:db].to_s || 'kabutops')
39
+ if params[:user] && params[:password]
40
+ ok = @@client.authenticate(params[:user], params[:password])
41
+ raise 'mongo authentication failed' unless ok
42
+ end
43
+ @@client_db
44
+ end
45
+
46
+ def collection
47
+ @@collection ||= client_db.collection(params[:collection] || 'kabutops')
48
+ end
49
+ end
50
+
51
+ end
52
+
53
+ end
@@ -0,0 +1,47 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module Kabutops
4
+
5
+ module Adapters
6
+
7
+ class Sequel < DatabaseAdapter
8
+ include Extensions::Parameterable
9
+
10
+ params :connect, :type, :host, :port,
11
+ :db, :user, :password, :table
12
+
13
+ def store result
14
+ client_table.insert(result)
15
+ end
16
+
17
+ def nested?
18
+ false
19
+ end
20
+
21
+ protected
22
+
23
+ def client
24
+ return @@client if defined?(@@client)
25
+
26
+ if params[:connect]
27
+ @@client ||= ::Sequel.connect(params[:connect])
28
+ else
29
+ @@client ||= ::Sequel.connect(
30
+ adapter: params[:type] || 'mysql2',
31
+ user: params[:user] || 'root',
32
+ password: params[:password] || 'root',
33
+ host: params[:host] || 'localhost',
34
+ port: params[:port] || 3306,
35
+ database: params[:db] || params[:database] || 'kabutops',
36
+ )
37
+ end
38
+ end
39
+
40
+ def client_table
41
+ @@client_table ||= client[params[:table]]
42
+ end
43
+ end
44
+
45
+ end
46
+
47
+ end
@@ -3,10 +3,13 @@
3
3
  module Kabutops
4
4
 
5
5
  class Crawler
6
+ include Extensions::Logging
6
7
  include CrawlerExtensions::Debugging
7
8
  include CrawlerExtensions::PStoreStorage
8
9
  include CrawlerExtensions::ElasticSearch
9
10
  include CrawlerExtensions::Redis
11
+ include CrawlerExtensions::Mongo
12
+ include CrawlerExtensions::Sequel
10
13
  include Sidekiq::Worker
11
14
 
12
15
  class << self
@@ -17,7 +20,7 @@ module Kabutops
17
20
  callbacks :after_crawl
18
21
 
19
22
  def adapters
20
- @adapters || []
23
+ @adapters ||= []
21
24
  end
22
25
 
23
26
  def crawl! collection=nil
@@ -58,6 +61,11 @@ module Kabutops
58
61
  self.class.adapters.each do |adapter|
59
62
  adapter.process(resource, page)
60
63
  end
64
+ rescue Exception => e
65
+ logger.error(e.message)
66
+ logger.error(e.backtrace.join("\n"))
67
+ sleep self.params[:wait] || 0
68
+ raise e
61
69
  end
62
70
 
63
71
  def << resource
@@ -5,13 +5,9 @@ module Kabutops
5
5
  module CrawlerExtensions
6
6
 
7
7
  module Debugging
8
-
9
- def self.included base
10
- base.extend(ClassMethods)
11
- end
8
+ extend Extensions::Includable
12
9
 
13
10
  module ClassMethods
14
-
15
11
  def debug_first count=1
16
12
  params[:collection].take(count).map{ |r| debug_resource(r) }
17
13
  end
@@ -41,9 +37,7 @@ module Kabutops
41
37
  def debug
42
38
  @debug == true
43
39
  end
44
-
45
40
  end
46
-
47
41
  end
48
42
 
49
43
  end
@@ -5,21 +5,13 @@ module Kabutops
5
5
  module CrawlerExtensions
6
6
 
7
7
  module ElasticSearch
8
-
9
- def self.included base
10
- base.extend(ClassMethods)
11
- end
8
+ extend Extensions::Includable
12
9
 
13
10
  module ClassMethods
14
11
  def elasticsearch &block
15
- adapter = Adapters::ElasticSearch.new
16
- adapter.instance_eval &block
17
-
18
- @adapters ||= []
19
- @adapters << adapter
12
+ adapters << Adapters::ElasticSearch.new(&block)
20
13
  end
21
14
  end
22
-
23
15
  end
24
16
 
25
17
  end
@@ -0,0 +1,19 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module Kabutops
4
+
5
+ module CrawlerExtensions
6
+
7
+ module Mongo
8
+ extend Extensions::Includable
9
+
10
+ module ClassMethods
11
+ def mongo &block
12
+ adapters << Adapters::Mongo.new(&block)
13
+ end
14
+ end
15
+ end
16
+
17
+ end
18
+
19
+ end
@@ -5,6 +5,8 @@ module Kabutops
5
5
  module CrawlerExtensions
6
6
 
7
7
  module PStoreStorage
8
+ extend Extensions::Includable
9
+
8
10
  class Storage
9
11
  def initialize path='.kabutopus.config.pstore'
10
12
  @storage ||= PStore.new(path)
@@ -21,22 +23,15 @@ module Kabutops
21
23
  end
22
24
  end
23
25
 
24
- def self.included base
25
- base.extend(ClassMethods)
26
- end
27
-
28
26
  module ClassMethods
29
-
30
27
  def storage
31
28
  @storage ||= Storage.new
32
29
  end
33
-
34
30
  end
35
31
 
36
32
  def storage
37
33
  self.class.storage
38
34
  end
39
-
40
35
  end
41
36
 
42
37
  end
@@ -5,21 +5,13 @@ module Kabutops
5
5
  module CrawlerExtensions
6
6
 
7
7
  module Redis
8
-
9
- def self.included base
10
- base.extend(ClassMethods)
11
- end
8
+ extend Extensions::Includable
12
9
 
13
10
  module ClassMethods
14
11
  def redis &block
15
- adapter = Adapters::Redis.new
16
- adapter.instance_eval &block
17
-
18
- @adapters ||= []
19
- @adapters << adapter
12
+ adapters << Adapters::Redis.new(&block)
20
13
  end
21
14
  end
22
-
23
15
  end
24
16
 
25
17
  end
@@ -0,0 +1,19 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module Kabutops
4
+
5
+ module CrawlerExtensions
6
+
7
+ module Sequel
8
+ extend Extensions::Includable
9
+
10
+ module ClassMethods
11
+ def sequel &block
12
+ adapters << Adapters::Sequel.new(&block)
13
+ end
14
+ end
15
+ end
16
+
17
+ end
18
+
19
+ end
@@ -5,13 +5,7 @@ module Kabutops
5
5
  module Extensions
6
6
 
7
7
  module CallbackSupport
8
-
9
- def self.included base
10
- base.extend(ClassMethods)
11
- base.class_eval do
12
- attr_reader :allowed_callbacks
13
- end
14
- end
8
+ extend Includable
15
9
 
16
10
  class Manager
17
11
  attr_reader :map, :allowed
@@ -22,11 +16,7 @@ module Kabutops
22
16
  end
23
17
 
24
18
  def method_missing name, *args, &block
25
- return super unless block_given?
26
-
27
- unless @allowed.include?(name)
28
- raise "Invalid callback name: #{name}"
29
- end
19
+ return super unless block_given? && @allowed.include?(name)
30
20
 
31
21
  @map[name] ||= []
32
22
  @map[name] << block
@@ -52,15 +42,12 @@ module Kabutops
52
42
  end
53
43
 
54
44
  module ClassMethods
55
-
56
45
  def callbacks *args
57
46
  define_method :allowed_callbacks do
58
47
  args
59
48
  end
60
49
  end
61
-
62
50
  end
63
-
64
51
  end
65
52
 
66
53
  end
@@ -0,0 +1,25 @@
1
+ module Kabutops
2
+
3
+ module Extensions
4
+
5
+ # inspired by ActiveSupport::Concern
6
+
7
+ module Includable
8
+ def append_features(base)
9
+ super
10
+ base.extend const_get(:ClassMethods) if const_defined?(:ClassMethods)
11
+ base.class_eval(&@_included) if instance_variable_defined?(:@_included)
12
+ end
13
+
14
+ def included(base = nil, &block)
15
+ if base.nil?
16
+ @_included = block
17
+ else
18
+ super
19
+ end
20
+ end
21
+ end
22
+
23
+ end
24
+
25
+ end
@@ -0,0 +1,27 @@
1
+ module Kabutops
2
+
3
+ module Extensions
4
+
5
+ module Logging
6
+ extend Includable
7
+
8
+ def logger
9
+ self.class.logger
10
+ end
11
+
12
+ module ClassMethods
13
+
14
+ def logger
15
+ return @@logger if defined?(@@logger)
16
+
17
+ @@logger ||= Logger.new(STDOUT)
18
+ #@@logger.level = Logger::WARN
19
+ end
20
+
21
+ end
22
+
23
+ end
24
+
25
+ end
26
+
27
+ end
@@ -5,13 +5,9 @@ module Kabutops
5
5
  module Extensions
6
6
 
7
7
  module Parameterable
8
-
9
- def self.included base
10
- base.extend(ClassMethods)
11
- end
8
+ extend Extensions::Includable
12
9
 
13
10
  module ClassMethods
14
-
15
11
  def params *list
16
12
  list.each do |name|
17
13
  define_method name do |*args|
@@ -28,9 +24,7 @@ module Kabutops
28
24
  @params ||= Hashie::Mash.new
29
25
  end
30
26
  end
31
-
32
27
  end
33
-
34
28
  end
35
29
 
36
30
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.0.7'
4
+ VERSION = '0.0.8'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-20 00:00:00.000000000 Z
11
+ date: 2014-06-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -136,6 +136,62 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: '1.8'
139
+ - !ruby/object:Gem::Dependency
140
+ name: mongo
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '1.10'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '1.10'
153
+ - !ruby/object:Gem::Dependency
154
+ name: sequel
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '4.11'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '4.11'
167
+ - !ruby/object:Gem::Dependency
168
+ name: bson_ext
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - "~>"
172
+ - !ruby/object:Gem::Version
173
+ version: '1.10'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '1.10'
181
+ - !ruby/object:Gem::Dependency
182
+ name: mysql2
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - "~>"
186
+ - !ruby/object:Gem::Version
187
+ version: '0.3'
188
+ type: :runtime
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - "~>"
193
+ - !ruby/object:Gem::Version
194
+ version: '0.3'
139
195
  - !ruby/object:Gem::Dependency
140
196
  name: rspec
141
197
  requirement: !ruby/object:Gem::Requirement
@@ -177,13 +233,19 @@ files:
177
233
  - lib/kabutops/adapters/base.rb
178
234
  - lib/kabutops/adapters/database_adapter.rb
179
235
  - lib/kabutops/adapters/elastic_search.rb
236
+ - lib/kabutops/adapters/mongo.rb
180
237
  - lib/kabutops/adapters/redis.rb
238
+ - lib/kabutops/adapters/sequel.rb
181
239
  - lib/kabutops/crawler.rb
182
240
  - lib/kabutops/crawler_extensions/debugging.rb
183
241
  - lib/kabutops/crawler_extensions/elastic_search.rb
242
+ - lib/kabutops/crawler_extensions/mongo.rb
184
243
  - lib/kabutops/crawler_extensions/pstore_storage.rb
185
244
  - lib/kabutops/crawler_extensions/redis.rb
245
+ - lib/kabutops/crawler_extensions/sequel.rb
186
246
  - lib/kabutops/extensions/callback_support.rb
247
+ - lib/kabutops/extensions/includable.rb
248
+ - lib/kabutops/extensions/logging.rb
187
249
  - lib/kabutops/extensions/parameterable.rb
188
250
  - lib/kabutops/recipe.rb
189
251
  - lib/kabutops/recipe_item.rb