kabutops 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 23b94bb24325c0b57ed6fdbc05a94d87484e319f
4
- data.tar.gz: 7175b6f9e4c0dfb1c3fad129a2574d4234e31ce2
3
+ metadata.gz: 5d0738998c869017902c5ae5da1be3dfdf6c834d
4
+ data.tar.gz: 12a286d8d3adeffd0bc698c80d5a761b41e4b0ff
5
5
  SHA512:
6
- metadata.gz: 13e0e68d4f333526c7a9cfa23f5555df1dfa8defac859dc40317d13e5bd5bbbb981a7041ea3f0fb0845ff27ed908be23481f2e20621332c6186aaab384ef3fc1
7
- data.tar.gz: b4081eaee098485fd3a92bb37e7d4b737359ec6cbdfcd3bb9131fecfee671134b3c149dee7dbf4afde34dfc3ab685636254762f33bc0b6e3d653364c7d852994
6
+ metadata.gz: 779f59ca039b19f6c17a63f53a4adc34de617c4e1143ab47f35e13c0c6cc731101961ade348e0046f907a39977367f5d124689a2d4faeedc43aea7fcc02d531a
7
+ data.tar.gz: dfeecde60aec5677485123704ef8c1e64e943c944ae340bbb994bdce5a8cd2d6581fd49353f2dfc5427913fec692088631c3d9e76bbbe4bf8f47c890b20223df
data/README.md CHANGED
@@ -25,7 +25,7 @@ gem install kabutops
25
25
  Or you can put it in your Gemfile
26
26
 
27
27
  ```ruby
28
- gem 'kabutops', '~> 0.0.8'
28
+ gem 'kabutops', '~> 0.0.9'
29
29
  ```
30
30
 
31
31
  Basic example
@@ -149,6 +149,38 @@ FruitCrawler.debug_resource { id: '123', url: '...' }
149
149
  These methods will print out what would be otherwise saved to the
150
150
  database but for this time there is no save to the database.
151
151
 
152
+ Staying up to date
153
+ ------------------
154
+
155
+ Note: This feature currently works only with ElasticSearch
156
+
157
+ For this purpose there is a Watchdog. An updater has to inherit from
158
+ this class and this class can be run as a worker via sidekiq or as a
159
+ plain ruby script as you can see below.
160
+
161
+ ```ruby
162
+ class GemUpdater < Kabutops::Watchdog
163
+ crawler GemCrawler
164
+ freshness 1*24*60*60 # 1 day
165
+ wait 5
166
+
167
+ callbacks do
168
+ on_outdated do |resource|
169
+ puts "#{resource[:title]} outdated!"
170
+ GemCrawler << {
171
+ url: resource[:url],
172
+ }
173
+ end
174
+ end
175
+ end
176
+
177
+ GemUpdater.loop
178
+ ```
179
+
180
+ ```bash
181
+ ruby rubygems_updater.rb
182
+ ```
183
+
152
184
  Anonymity ala Tor
153
185
  -----------------
154
186
 
@@ -167,6 +199,13 @@ class MyCrawler < Kabutops::Crawler
167
199
  end
168
200
  ```
169
201
 
202
+ TODO
203
+ ----
204
+
205
+ * Watchdog for Mongo and Sequel
206
+ * skip_existing for Mongo, Redis and Sequel
207
+ * Spider
208
+
170
209
  License
171
210
  -------
172
211
 
data/lib/kabutops.rb CHANGED
@@ -13,6 +13,9 @@ require 'redis-namespace'
13
13
  require 'mongo'
14
14
  require 'sequel'
15
15
  require 'mysql2'
16
+ require 'logger'
17
+
18
+ require 'kabutops/configuration'
16
19
 
17
20
  Cachy.cache_store = Moneta.new(:File, dir: 'cache') # temporary
18
21
 
@@ -35,3 +38,4 @@ require 'kabutops/crawler_extensions/sequel'
35
38
  require 'kabutops/crawler_extensions/pstore_storage'
36
39
  require 'kabutops/crawler_extensions/debugging'
37
40
  require 'kabutops/crawler'
41
+ require 'kabutops/watchdog'
@@ -10,7 +10,7 @@ module Kabutops
10
10
 
11
11
  attr_reader :recipe
12
12
 
13
- callbacks :after_save
13
+ callbacks :after_save, :save_if
14
14
 
15
15
  def data &block
16
16
  @recipe = Recipe.new
@@ -21,10 +21,14 @@ module Kabutops
21
21
  raise 'data block not defined' unless @recipe
22
22
 
23
23
  result = @recipe.process(resource, page)
24
+ result.update(updated_at: Time.now.to_i)
25
+
26
+ save = (notify(:save_if, resource, page, result) || []).all?
27
+
24
28
  if debug
25
29
  logger.info("#{self.class.to_s} outputs:")
26
- logger.info(result.to_hash)
27
- else
30
+ logger.info(save ? result.to_hash : 'not valid for save')
31
+ elsif save
28
32
  store(result)
29
33
  notify(:after_save, result)
30
34
  end
@@ -18,6 +18,56 @@ module Kabutops
18
18
  )
19
19
  end
20
20
 
21
+ def find resource
22
+ result = client.search(
23
+ index: params[:index] || 'default',
24
+ body: {
25
+ query: {
26
+ filtered: {
27
+ filter: {
28
+ or: [
29
+ { term: { id: resource[:id] || resource[:url] } },
30
+ { term: { url: resource[:url] } },
31
+ ]
32
+ },
33
+ },
34
+ },
35
+ },
36
+ size: 5,
37
+ )
38
+ result['hits']['hits'].map{ |hit| hit['_source'] }.first
39
+ end
40
+
41
+ def find_outdated freshness
42
+ result = client.search(
43
+ index: params[:index] || 'default',
44
+ body: {
45
+ query: {
46
+ filtered: {
47
+ filter: {
48
+ and: [
49
+ {
50
+ or: [
51
+ { range: { updated_at: { lte: Time.now.to_i - freshness } } },
52
+ { missing: { field: 'updated_at' } },
53
+ ]
54
+ },
55
+ {
56
+ or: [
57
+ { range: { scheduled_update_at: { lte: Time.now.to_i - 3600 } } },
58
+ { missing: { field: 'scheduled_update_at' } },
59
+ ]
60
+ },
61
+ ]
62
+ },
63
+ },
64
+ },
65
+ },
66
+ size: 5,
67
+ )
68
+ result['hits']['hits'].map{ |hit| hit['_source'] }
69
+ end
70
+
21
71
  def nested?
22
72
  true
23
73
  end
@@ -0,0 +1,32 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module Kabutops
4
+
5
+ class Configuration
6
+ class << self
7
+ def config *args, &block
8
+ configuration.instance_eval &block
9
+ end
10
+
11
+ def [] key
12
+ configuration[key]
13
+ end
14
+
15
+ def []= key, value
16
+ configuration[key] = value
17
+ end
18
+
19
+ protected
20
+
21
+ def configuration
22
+ @configuration ||= Hashie::Mash.new(
23
+ logger: {
24
+ dev: STDOUT,
25
+ level: Logger::DEBUG
26
+ },
27
+ )
28
+ end
29
+ end
30
+ end
31
+
32
+ end
@@ -16,20 +16,32 @@ module Kabutops
16
16
  include Extensions::Parameterable
17
17
  include Extensions::CallbackSupport
18
18
 
19
- params :collection, :proxy, :cache, :wait
19
+ params :collection, :proxy, :cache, :wait, :skip_existing
20
20
  callbacks :after_crawl
21
21
 
22
22
  def adapters
23
23
  @adapters ||= []
24
24
  end
25
25
 
26
+ def reset!
27
+ storage[:status] = nil
28
+ end
29
+
26
30
  def crawl! collection=nil
31
+ reset!
32
+ crawl(collection)
33
+ end
34
+
35
+ def crawl collection=nil
27
36
  @map ||= Hashie::Mash.new
28
37
 
29
38
  if storage[:status].nil?
30
39
  (collection || params[:collection] || []).each do |resource|
31
40
  self << resource
32
41
  end
42
+ storage[:status] = :in_progress
43
+ elsif storage[:status] == :in_progress
44
+ # pass
33
45
  end
34
46
  end
35
47
 
@@ -56,15 +68,29 @@ module Kabutops
56
68
 
57
69
  def perform resource
58
70
  resource = Hashie::Mash.new(resource)
71
+ adapters = self.class.adapters
72
+
73
+ if self.class.params.skip_existing
74
+ adapters = self.class.adapters.select do |adapter|
75
+ if adapter.respond_to? :find
76
+ adapter.find(resource).nil?
77
+ else
78
+ true
79
+ end
80
+ end
81
+
82
+ return if adapters.nil?
83
+ end
84
+
59
85
  page = crawl(resource)
60
86
 
61
- self.class.adapters.each do |adapter|
87
+ adapters.each do |adapter|
62
88
  adapter.process(resource, page)
63
89
  end
64
90
  rescue Exception => e
65
91
  logger.error(e.message)
66
92
  logger.error(e.backtrace.join("\n"))
67
- sleep self.params[:wait] || 0
93
+ sleep self.class.params[:wait] || 0
68
94
  raise e
69
95
  end
70
96
 
@@ -75,7 +101,8 @@ module Kabutops
75
101
  protected
76
102
 
77
103
  def crawl resource
78
- content = Cachy.cache_if(self.class.params.cache, resource[:url]) do
104
+ cache_key = (resource[:id] || resource[:url]).to_s
105
+ content = Cachy.cache_if(self.class.params.cache, cache_key) do
79
106
  sleep self.class.params[:wait] || 0 # wait only if value is not from cache
80
107
  agent.get(resource[:url]).body
81
108
  end
@@ -14,8 +14,9 @@ module Kabutops
14
14
  def logger
15
15
  return @@logger if defined?(@@logger)
16
16
 
17
- @@logger ||= Logger.new(STDOUT)
18
- #@@logger.level = Logger::WARN
17
+ @@logger = Logger.new(Configuration[:logger][:dev])
18
+ @@logger.level = Configuration[:logger][:level]
19
+ @@logger
19
20
  end
20
21
 
21
22
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.0.8'
4
+ VERSION = '0.0.9'
5
5
  end
@@ -0,0 +1,69 @@
1
+ module Kabutops
2
+ class Watchdog
3
+ include Extensions::Logging
4
+ include Sidekiq::Worker
5
+
6
+ class << self
7
+ include Extensions::Parameterable
8
+ include Extensions::CallbackSupport
9
+
10
+ params :crawler, :freshness, :wait
11
+ callbacks :on_outdated
12
+
13
+ def check!
14
+ perform_async
15
+ end
16
+
17
+ def check
18
+ new.check
19
+ end
20
+
21
+ def loop
22
+ loop do
23
+ sleep self.params[:wait] || 5
24
+ check
25
+ end
26
+ end
27
+ end
28
+
29
+ def check
30
+ logger.info "#{self.class} check started"
31
+
32
+ outdated_resources.each do |resource|
33
+ resource.update(scheduled_update_at: Time.now.to_i)
34
+
35
+ adapters.each do |adapter|
36
+ adapter.store(resource)
37
+ end
38
+
39
+ self.class.notify(:on_outdated, resource)
40
+ end
41
+
42
+ logger.info "#{self.class} check finished"
43
+ end
44
+
45
+ def perform
46
+ check
47
+ sleep self.class.params[:wait] || 5
48
+ self.class.perform_async
49
+ end
50
+
51
+ protected
52
+
53
+ def outdated_resources
54
+ adapters.map{ |a| a.find_outdated(freshness) }
55
+ .flatten
56
+ .uniq
57
+ .reject{ |r| (r[:scheduled_update_at] || 0) > Time.now.to_i - 3600 }
58
+ .map{ |r| Hashie::Mash.new(r) }
59
+ end
60
+
61
+ def adapters
62
+ self.class.params.crawler.adapters
63
+ end
64
+
65
+ def freshness
66
+ self.class.params.freshness
67
+ end
68
+ end
69
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-22 00:00:00.000000000 Z
11
+ date: 2014-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -236,6 +236,7 @@ files:
236
236
  - lib/kabutops/adapters/mongo.rb
237
237
  - lib/kabutops/adapters/redis.rb
238
238
  - lib/kabutops/adapters/sequel.rb
239
+ - lib/kabutops/configuration.rb
239
240
  - lib/kabutops/crawler.rb
240
241
  - lib/kabutops/crawler_extensions/debugging.rb
241
242
  - lib/kabutops/crawler_extensions/elastic_search.rb
@@ -250,6 +251,7 @@ files:
250
251
  - lib/kabutops/recipe.rb
251
252
  - lib/kabutops/recipe_item.rb
252
253
  - lib/kabutops/version.rb
254
+ - lib/kabutops/watchdog.rb
253
255
  homepage: https://github.com/reneklacan/kabutops
254
256
  licenses:
255
257
  - Beerware