kabutops 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 41d09fa8638097298ed1c230a255086949651188
-  data.tar.gz: 3b9c0f19b9272332c940ac649b184c84e02740b2
+  metadata.gz: 491dade458c54b02dbcdd36da92010fcaa1a5463
+  data.tar.gz: 9d00eb9b6ad5e61ddb27dfa7137f0734e9eda03f
 SHA512:
-  metadata.gz: fbdb727ea93bf2048de9a0773a202d7dedcafab67f2f12e378bed53557d9b1b1855a0af45dbf877f68d69d618fc91fc12c9143e3ae8a339aa0a30636b64e1d73
-  data.tar.gz: 26ad39464ae24c5082d94b043ce4162e2de70aa025a315d3ba3d2d936325dfa200429dc5e4763f4f43b1659dbac9503d7e1c5a9f9b6e9e4cb1f4709aba1226d9
+  metadata.gz: 9d00a7d31dfa94e85c9f2d31c49d56b3df2da5fd1b0580c769724f64bad12b331bc8c87a3432071944bc427543458ffe0257743ba5f6dd1840e7e0a8674e7e88
+  data.tar.gz: aa6a67c2dcbaf8e14cf6d4a38e94b78979ef9f594ad492e5618bd3d1db36fbca305895e1c756cae2f355d21045fd82489ee315eb7150a2cac7f17d6fbc744fdb
data/README.md CHANGED
@@ -8,7 +8,6 @@ With Kabutops you can save data easily to:
 * ElasticSearch
 * MongoDB
 * Redis
-* SQL Databases (via Sequel)
 
 Examples for every kind of database are located
 in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
@@ -25,9 +24,11 @@ gem install kabutops
 Or you can put it in your Gemfile
 
 ```ruby
-gem 'kabutops', '~> 0.0.10'
+gem 'kabutops', '~> 0.0.11'
 ```
 
+You will also need a Redis database installed and running.
+
 Basic example
 -------------
 
@@ -199,12 +200,28 @@ class MyCrawler < Kabutops::Crawler
 end
 ```
 
+Javascript-heavy sites
+----------------------
+
+Crawling these kinds of sites can be achieved by using a non-default agent
+(the default is Mechanize.new).
+
+```ruby
+class MyCrawler < Kabutops::Crawler
+  ...
+  agent Bogeyman::Client.new
+  ...
+end
+```
+
+[Bogeyman](https://github.com/reneklacan/bogeyman-ruby-client)
+is a wrapper built upon PhantomJS.
+
 TODO
 ----
 
-* Watchdog for Mongo and Sequel
-* skip_existing for Mongo, Redis and Sequel
-* Spider
+* Watchdog for Mongo
+* skip_existing feature for Mongo and Redis
 
 License
 -------
lib/kabutops.rb CHANGED
@@ -11,7 +11,6 @@ require 'elasticsearch'
 require 'redis'
 require 'redis-namespace'
 require 'mongo'
-require 'sequel'
 require 'mysql2'
 require 'logger'
 
@@ -30,12 +29,11 @@ require 'kabutops/adapters/database_adapter'
 require 'kabutops/adapters/elastic_search'
 require 'kabutops/adapters/redis'
 require 'kabutops/adapters/mongo'
-require 'kabutops/adapters/sequel'
 require 'kabutops/crawler_extensions/elastic_search'
 require 'kabutops/crawler_extensions/redis'
 require 'kabutops/crawler_extensions/mongo'
-require 'kabutops/crawler_extensions/sequel'
 require 'kabutops/crawler_extensions/pstore_storage'
 require 'kabutops/crawler_extensions/debugging'
 require 'kabutops/crawler'
 require 'kabutops/watchdog'
+require 'kabutops/spider'
lib/kabutops/configuration.rb CHANGED
@@ -24,6 +24,11 @@ module Kabutops
         dev: STDOUT,
         level: Logger::DEBUG
       },
+      redis: {
+        host: 'localhost',
+        port: 6379,
+        db: 0
+      },
     )
   end
 end
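
These new defaults are what the Spider class (added later in this diff) reads when it opens its Redis connection. For reference, a standalone connection built from the same values would look like the sketch below — a rough equivalent using the `redis` gem, assuming a local server:

```ruby
require 'redis'

# Rough standalone equivalent of the connection Spider builds from
# Configuration[:redis]; host, port and db mirror the defaults above.
redis = Redis.new(host: 'localhost', port: 6379, db: 0)
redis.ping # => "PONG" when the server is reachable
```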
lib/kabutops/crawler.rb CHANGED
@@ -9,14 +9,14 @@ module Kabutops
     include CrawlerExtensions::ElasticSearch
     include CrawlerExtensions::Redis
     include CrawlerExtensions::Mongo
-    include CrawlerExtensions::Sequel
     include Sidekiq::Worker
 
     class << self
       include Extensions::Parameterable
       include Extensions::CallbackSupport
 
-      params :collection, :proxy, :cache, :wait, :skip_existing
+      params :collection, :proxy, :cache, :wait,
+             :skip_existing, :agent
       callbacks :after_crawl
 
       def adapters
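
The parameter list gains `:agent`, which the README section above documents. As a rough illustration of how the extended params read in a crawler definition — assuming `Extensions::Parameterable` exposes each param as a class-level DSL method, as the README examples suggest; all values here are hypothetical:

```ruby
class ShopCrawler < Kabutops::Crawler
  # Hypothetical configuration for illustration only.
  collection [{ url: 'http://example.com/products/1' }]
  wait 1               # seconds slept before each uncached request
  skip_existing true   # adapters responding to :find skip stored resources
  agent Mechanize.new  # overridable since 0.0.11, e.g. Bogeyman::Client.new
end
```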
@@ -62,29 +62,29 @@ module Kabutops
 
     def perform resource
       resource = Hashie::Mash.new(resource)
-      adapters = self.class.adapters
-
-      if self.class.params.skip_existing
-        adapters = self.class.adapters.select do |adapter|
-          if adapter.respond_to? :find
-            adapter.find(resource).nil?
-          else
-            true
-          end
-        end
 
-        return if adapters.nil?
+      adapters = self.class.adapters.select do |adapter|
+        if params.skip_existing && adapter.respond_to?(:find)
+          adapter.find(resource).nil?
+        else
+          true
+        end
       end
 
+      return if adapters.nil?
+
       page = crawl(resource)
 
       adapters.each do |adapter|
         adapter.process(resource, page)
       end
     rescue Exception => e
-      logger.error(e.message)
-      logger.error(e.backtrace.join("\n"))
-      sleep self.class.params[:wait] || 0
+      unless self.class.debug
+        logger.error(e.message)
+        logger.error(e.backtrace.join("\n"))
+      end
+
+      sleep params[:wait] || 0
       raise e
     end
 
@@ -94,10 +94,14 @@ module Kabutops
 
     protected
 
+    def params
+      self.class.params
+    end
+
     def crawl resource
       cache_key = (resource[:id] || resource[:url]).to_s
-      content = Cachy.cache_if(self.class.params.cache, cache_key) do
-        sleep self.class.params[:wait] || 0 # wait only if value is not from cache
+      content = Cachy.cache_if(params.cache, cache_key) do
+        sleep params[:wait] || 0 # wait only if value is not from cache
         agent.get(resource[:url]).body
       end
 
@@ -108,8 +112,8 @@ module Kabutops
 
     def agent
       unless @agent
-        @agent = Mechanize.new
-        @agent.set_proxy(*self.class.params[:proxy]) if self.class.params[:proxy]
+        @agent = params[:agent] || Mechanize.new
+        @agent.set_proxy(*params[:proxy]) if params[:proxy]
       end
 
       @agent
lib/kabutops/spider.rb ADDED
@@ -0,0 +1,82 @@
+# -*- encoding : utf-8 -*-
+
+module Kabutops
+
+  class Spider < Crawler
+    class << self
+      params :url
+      callbacks :after_crawl, :follow_if
+
+      def debug_spider
+        enable_debug
+        self.new.perform({
+          url: params[:url]
+        })
+      end
+
+      def << resource
+        if resource_status(resource).nil?
+          resource_status(resource, 'new')
+          super
+        end
+      end
+
+      def resource_status resource, status=nil
+        url_status(resource[:url], status)
+      end
+
+      def url_status url, status=nil
+        key = redis_key(url)
+
+        if status
+          redis.set(
+            key,
+            JSON.dump({
+              url: url,
+              status: status,
+            })
+          )
+        else
+          item = redis.get(key)
+          item ? JSON.parse(item)['status'] : nil
+        end
+      end
+
+      protected
+
+      def redis_key string
+        Digest::SHA256.hexdigest(string)
+      end
+
+      def redis
+        @redis ||= ::Redis::Namespace.new(
+          self.to_s,
+          redis: ::Redis.new(
+            host: Configuration[:redis][:host],
+            port: Configuration[:redis][:port],
+            db: Configuration[:redis][:db],
+          )
+        )
+      end
+    end
+
+    def crawl resource
+      page = super
+      after_crawl(resource, page)
+      self.class.resource_status(resource, 'done')
+      page
+    end
+
+    def after_crawl resource, page
+      page.css('a').each do |a|
+        follow = self.class.notify(:follow_if, a['href']).any?
+        if follow
+          self << {
+            url: a['href'],
+          }
+        end
+      end
+    end
+  end
+
+end
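
Spider extends Crawler with a `url` entry point, Redis-backed tracking of visited URLs, and a `follow_if` callback that decides which links to enqueue. A hedged usage sketch — `DocsSpider` and the URL are invented; only `url`, `follow_if`, and `debug_spider` come from the class above:

```ruby
class DocsSpider < Kabutops::Spider
  url 'http://example.com/docs/' # hypothetical entry point

  # Receives each href found on a crawled page; the link is followed
  # when any registered callback returns a truthy value.
  follow_if do |href|
    href.to_s.include?('/docs/')
  end
end

DocsSpider.debug_spider # crawl synchronously from the entry URL for debugging
```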
lib/kabutops/version.rb CHANGED
@@ -1,5 +1,5 @@
 # -*- encoding : utf-8 -*-
 
 module Kabutops
-  VERSION = '0.0.10'
+  VERSION = '0.0.11'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kabutops
 version: !ruby/object:Gem::Version
-  version: 0.0.10
+  version: 0.0.11
 platform: ruby
 authors:
 - Rene Klacan
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-07-07 00:00:00.000000000 Z
+date: 2014-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -150,20 +150,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.10'
-- !ruby/object:Gem::Dependency
-  name: sequel
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '4.11'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '4.11'
 - !ruby/object:Gem::Dependency
   name: bson_ext
   requirement: !ruby/object:Gem::Requirement
@@ -235,7 +221,6 @@ files:
 - lib/kabutops/adapters/elastic_search.rb
 - lib/kabutops/adapters/mongo.rb
 - lib/kabutops/adapters/redis.rb
-- lib/kabutops/adapters/sequel.rb
 - lib/kabutops/configuration.rb
 - lib/kabutops/crawler.rb
 - lib/kabutops/crawler_extensions/debugging.rb
@@ -243,13 +228,13 @@ files:
 - lib/kabutops/crawler_extensions/mongo.rb
 - lib/kabutops/crawler_extensions/pstore_storage.rb
 - lib/kabutops/crawler_extensions/redis.rb
-- lib/kabutops/crawler_extensions/sequel.rb
 - lib/kabutops/extensions/callback_support.rb
 - lib/kabutops/extensions/includable.rb
 - lib/kabutops/extensions/logging.rb
 - lib/kabutops/extensions/parameterable.rb
 - lib/kabutops/recipe.rb
 - lib/kabutops/recipe_item.rb
+- lib/kabutops/spider.rb
 - lib/kabutops/version.rb
 - lib/kabutops/watchdog.rb
 homepage: https://github.com/reneklacan/kabutops
lib/kabutops/adapters/sequel.rb DELETED
@@ -1,47 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module Kabutops
-
-  module Adapters
-
-    class Sequel < DatabaseAdapter
-      include Extensions::Parameterable
-
-      params :connect, :type, :host, :port,
-             :db, :user, :password, :table
-
-      def store result
-        client_table.insert(result)
-      end
-
-      def nested?
-        false
-      end
-
-      protected
-
-      def client
-        return @@client if defined?(@@client)
-
-        if params[:connect]
-          @@client ||= ::Sequel.connect(params[:connect])
-        else
-          @@client ||= ::Sequel.connect(
-            adapter: params[:type] || 'mysql2',
-            user: params[:user] || 'root',
-            password: params[:password] || 'root',
-            host: params[:host] || 'localhost',
-            port: params[:port] || 3306,
-            database: params[:db] || params[:database] || 'kabutops',
-          )
-        end
-      end
-
-      def client_table
-        @@client_table ||= client[params[:table]]
-      end
-    end
-
-  end
-
-end
lib/kabutops/crawler_extensions/sequel.rb DELETED
@@ -1,19 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module Kabutops
-
-  module CrawlerExtensions
-
-    module Sequel
-      extend Extensions::Includable
-
-      module ClassMethods
-        def sequel &block
-          adapters << Adapters::Sequel.new(&block)
-        end
-      end
-    end
-
-  end
-
-end