kabutops 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +40 -1
- data/lib/kabutops.rb +4 -0
- data/lib/kabutops/adapters/database_adapter.rb +7 -3
- data/lib/kabutops/adapters/elastic_search.rb +50 -0
- data/lib/kabutops/configuration.rb +32 -0
- data/lib/kabutops/crawler.rb +31 -4
- data/lib/kabutops/extensions/logging.rb +3 -2
- data/lib/kabutops/version.rb +1 -1
- data/lib/kabutops/watchdog.rb +69 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5d0738998c869017902c5ae5da1be3dfdf6c834d
+  data.tar.gz: 12a286d8d3adeffd0bc698c80d5a761b41e4b0ff
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 779f59ca039b19f6c17a63f53a4adc34de617c4e1143ab47f35e13c0c6cc731101961ade348e0046f907a39977367f5d124689a2d4faeedc43aea7fcc02d531a
+  data.tar.gz: dfeecde60aec5677485123704ef8c1e64e943c944ae340bbb994bdce5a8cd2d6581fd49353f2dfc5427913fec692088631c3d9e76bbbe4bf8f47c890b20223df
data/README.md
CHANGED
@@ -25,7 +25,7 @@ gem install kabutops
 Or you can put it in your Gemfile
 
 ```ruby
-gem 'kabutops', '~> 0.0.
+gem 'kabutops', '~> 0.0.9'
 ```
 
 Basic example
@@ -149,6 +149,38 @@ FruitCrawler.debug_resource { id: '123', url: '...' }
 These methods will print out what would be otherwise saved to the
 database but for this time there is no save to the database.
 
+Staying up to date
+------------------
+
+Note: This feature is currently working only with ElasticSearch
+
+For this purpore there is a Watchdog. Updater have to inherit from
+this class and this class can be run as a worker via sidekiq or as a
+plain ruby script as you can see below.
+
+```ruby
+class GemUpdater < Kabutops::Watchdog
+  crawler GemCrawler
+  freshness 1*24*60*60 # 1 day
+  wait 5
+
+  callbacks do
+    on_outdated do |resource|
+      puts "#{resource[:title]} outdated!"
+      GemCrawler << {
+        url: resource[:url],
+      }
+    end
+  end
+end
+
+GemUpdater.loop
+```
+
+```bash
+ruby rubygems_updater.rb
+```
+
 Anonymity ala Tor
 -----------------
 
@@ -167,6 +199,13 @@ class MyCrawler < Kabutops::Crawler
 end
 ```
 
+TODO
+----
+
+* Watchdog for Mongo and Sequel
+* skip_existing for Mongo, Redis and Sequel
+* Spider
+
 License
 -------
 
data/lib/kabutops.rb
CHANGED
@@ -13,6 +13,9 @@ require 'redis-namespace'
 require 'mongo'
 require 'sequel'
 require 'mysql2'
+require 'logger'
+
+require 'kabutops/configuration'
 
 Cachy.cache_store = Moneta.new(:File, dir: 'cache') # temporary
 
@@ -35,3 +38,4 @@ require 'kabutops/crawler_extensions/sequel'
 require 'kabutops/crawler_extensions/pstore_storage'
 require 'kabutops/crawler_extensions/debugging'
 require 'kabutops/crawler'
+require 'kabutops/watchdog'
data/lib/kabutops/adapters/database_adapter.rb
CHANGED
@@ -10,7 +10,7 @@ module Kabutops
 
   attr_reader :recipe
 
-  callbacks :after_save
+  callbacks :after_save, :save_if
 
   def data &block
     @recipe = Recipe.new
@@ -21,10 +21,14 @@ module Kabutops
   raise 'data block not defined' unless @recipe
 
   result = @recipe.process(resource, page)
+  result.update(updated_at: Time.now.to_i)
+
+  save = (notify(:save_if, resource, page, result) || []).all?
+
   if debug
     logger.info("#{self.class.to_s} outputs:")
-    logger.info(result.to_hash)
-
+    logger.info(save ? result.to_hash : 'not valid for save')
+  elsif save
     store(result)
     notify(:after_save, result)
   end
data/lib/kabutops/adapters/elastic_search.rb
CHANGED
@@ -18,6 +18,56 @@ module Kabutops
   )
 end
 
+def find resource
+  result = client.search(
+    index: params[:index] || 'default',
+    body: {
+      query: {
+        filtered: {
+          filter: {
+            or: [
+              { term: { id: resource[:id] || resource[:url] } },
+              { term: { url: resource[:url] } },
+            ]
+          },
+        },
+      },
+    },
+    size: 5,
+  )
+  result['hits']['hits'].map{ |hit| hit['_source'] }.first
+end
+
+def find_outdated freshness
+  result = client.search(
+    index: params[:index] || 'default',
+    body: {
+      query: {
+        filtered: {
+          filter: {
+            and: [
+              {
+                or: [
+                  { range: { updated_at: { lte: Time.now.to_i - freshness } } },
+                  { missing: { field: 'updated_at' } },
+                ]
+              },
+              {
+                or: [
+                  { range: { scheduled_update_at: { lte: Time.now.to_i - 3600 } } },
+                  { missing: { field: 'scheduled_update_at' } },
+                ]
+              },
+            ]
+          },
+        },
+      },
+    },
+    size: 5,
+  )
+  result['hits']['hits'].map{ |hit| hit['_source'] }
+end
+
 def nested?
   true
 end
data/lib/kabutops/configuration.rb
ADDED
@@ -0,0 +1,32 @@
+# -*- encoding : utf-8 -*-
+
+module Kabutops
+
+  class Configuration
+    class << self
+      def config *args, &block
+        configuration.instance_eval &block
+      end
+
+      def [] key
+        configuration[key]
+      end
+
+      def []= key, value
+        configuration[key] = value
+      end
+
+      protected
+
+      def configuration
+        @configuration ||= Hashie::Mash.new(
+          logger: {
+            dev: STDOUT,
+            level: Logger::DEBUG
+          },
+        )
+      end
+    end
+  end
+
+end
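
The defaults above log to STDOUT at DEBUG level. Settings can be adjusted either through the `config` block (which is `instance_eval`'d against the underlying Hashie::Mash) or through the `[]`/`[]=` accessors; the log file path below is just an example value:

```ruby
# Block form: the block runs against the settings Mash.
Kabutops::Configuration.config do
  self.logger = { dev: 'kabutops.log', level: Logger::INFO }
end

# Accessor form: tweak a single nested key.
Kabutops::Configuration[:logger][:level] = Logger::WARN
```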
data/lib/kabutops/crawler.rb
CHANGED
@@ -16,20 +16,32 @@ module Kabutops
 include Extensions::Parameterable
 include Extensions::CallbackSupport
 
-params :collection, :proxy, :cache, :wait
+params :collection, :proxy, :cache, :wait, :skip_existing
 callbacks :after_crawl
 
 def adapters
   @adapters ||= []
 end
 
+def reset!
+  storage[:status] = nil
+end
+
 def crawl! collection=nil
+  reset!
+  crawl(collection)
+end
+
+def crawl collection=nil
   @map ||= Hashie::Mash.new
 
   if storage[:status].nil?
     (collection || params[:collection] || []).each do |resource|
       self << resource
     end
+    storage[:status] = :in_progress
+  elsif storage[:status] == :in_progress
+    # pass
   end
 end
 
@@ -56,15 +68,29 @@ module Kabutops
 
 def perform resource
   resource = Hashie::Mash.new(resource)
+  adapters = self.class.adapters
+
+  if self.class.params.skip_existing
+    adapters = self.class.adapters.select do |adapter|
+      if adapter.respond_to? :find
+        adapter.find(resource).nil?
+      else
+        true
+      end
+    end
+
+    return if adapters.nil?
+  end
+
   page = crawl(resource)
 
-
+  adapters.each do |adapter|
     adapter.process(resource, page)
   end
 rescue Exception => e
   logger.error(e.message)
   logger.error(e.backtrace.join("\n"))
-  sleep self.params[:wait] || 0
+  sleep self.class.params[:wait] || 0
   raise e
 end
 
@@ -75,7 +101,8 @@ module Kabutops
 protected
 
 def crawl resource
-
+  cache_key = (resource[:id] || resource[:url]).to_s
+  content = Cachy.cache_if(self.class.params.cache, cache_key) do
     sleep self.class.params[:wait] || 0 # wait only if value is not from cache
     agent.get(resource[:url]).body
   end
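
With `skip_existing` enabled, `perform` asks each adapter that responds to `find` (currently only the ElasticSearch adapter) whether the resource is already stored and drops the adapters that report a hit, while `crawl!` now resets the stored crawl status before delegating to `crawl`. A sketch of how a crawler might opt in, assuming the usual class-level parameter DSL; `GemCrawler`, the URL, index, and selector are illustrative:

```ruby
# Hypothetical crawler opting into the new skip_existing behaviour (sketch only).
class GemCrawler < Kabutops::Crawler
  collection [{ url: 'https://rubygems.org/gems/kabutops' }]
  wait 2
  skip_existing true   # re-crawl only resources the adapter cannot find

  elasticsearch do
    index :gems

    data do
      title css: 'h1'
    end
  end
end

GemCrawler.crawl!   # reset! clears storage[:status], then the collection is enqueued
```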
data/lib/kabutops/extensions/logging.rb
CHANGED
@@ -14,8 +14,9 @@ module Kabutops
 def logger
   return @@logger if defined?(@@logger)
 
-  @@logger
-
+  @@logger = Logger.new(Configuration[:logger][:dev])
+  @@logger.level = Configuration[:logger][:level]
+  @@logger
 end
 
 end
data/lib/kabutops/version.rb
CHANGED
data/lib/kabutops/watchdog.rb
ADDED
@@ -0,0 +1,69 @@
+module Kabutops
+  class Watchdog
+    include Extensions::Logging
+    include Sidekiq::Worker
+
+    class << self
+      include Extensions::Parameterable
+      include Extensions::CallbackSupport
+
+      params :crawler, :freshness, :wait
+      callbacks :on_outdated
+
+      def check!
+        perform_async
+      end
+
+      def check
+        new.check
+      end
+
+      def loop
+        loop do
+          sleep self.params[:wait] || 5
+          check
+        end
+      end
+    end
+
+    def check
+      logger.info "#{self.class} check started"
+
+      outdated_resources.each do |resource|
+        resource.update(scheduled_update_at: Time.now.to_i)
+
+        adapters.each do |adapter|
+          adapter.store(resource)
+        end
+
+        self.class.notify(:on_outdated, resource)
+      end
+
+      logger.info "#{self.class} check finished"
+    end
+
+    def perform
+      check
+      sleep self.class.params[:wait] || 5
+      self.class.perform_async
+    end
+
+    protected
+
+    def outdated_resources
+      adapters.map{ |a| a.find_outdated(freshness) }
+        .flatten
+        .uniq
+        .reject{ |r| (r[:scheduled_update_at] || 0) > Time.now.to_i - 3600 }
+        .map{ |r| Hashie::Mash.new(r) }
+    end
+
+    def adapters
+      self.class.params.crawler.adapters
+    end
+
+    def freshness
+      self.class.params.freshness
+    end
+  end
+end
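
Besides the `loop` helper used in the README example, the class methods above give two other entry points. Here `GemUpdater` refers to the subclass defined in the README snippet earlier in this diff:

```ruby
GemUpdater.check    # run a single check synchronously in the current process
GemUpdater.check!   # enqueue a Sidekiq job; perform re-enqueues itself after
                    # sleeping `wait` seconds, so checks keep running
```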
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kabutops
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.9
 platform: ruby
 authors:
 - Rene Klacan
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -236,6 +236,7 @@ files:
 - lib/kabutops/adapters/mongo.rb
 - lib/kabutops/adapters/redis.rb
 - lib/kabutops/adapters/sequel.rb
+- lib/kabutops/configuration.rb
 - lib/kabutops/crawler.rb
 - lib/kabutops/crawler_extensions/debugging.rb
 - lib/kabutops/crawler_extensions/elastic_search.rb
@@ -250,6 +251,7 @@ files:
 - lib/kabutops/recipe.rb
 - lib/kabutops/recipe_item.rb
 - lib/kabutops/version.rb
+- lib/kabutops/watchdog.rb
 homepage: https://github.com/reneklacan/kabutops
 licenses:
 - Beerware
|