kabutops 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -28
- data/lib/kabutops.rb +0 -7
- data/lib/kabutops/adapters/base.rb +3 -3
- data/lib/kabutops/adapters/database_adapter.rb +17 -16
- data/lib/kabutops/adapters/elastic_search.rb +1 -3
- data/lib/kabutops/crawler.rb +1 -5
- data/lib/kabutops/crawler_extensions/debugging.rb +1 -1
- data/lib/kabutops/crawler_extensions/pstore_storage.rb +2 -0
- data/lib/kabutops/extensions/callback_support.rb +15 -11
- data/lib/kabutops/recipe.rb +9 -9
- data/lib/kabutops/recipe_item.rb +16 -13
- data/lib/kabutops/version.rb +1 -1
- metadata +12 -59
- data/lib/kabutops/adapters/mongo.rb +0 -49
- data/lib/kabutops/adapters/redis.rb +0 -37
- data/lib/kabutops/crawler_extensions/mongo.rb +0 -19
- data/lib/kabutops/crawler_extensions/redis.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e931d9a939854fae6497f910de768971907f29a
|
4
|
+
data.tar.gz: 126e34c2c1eeffa8af3dca362a62381f5889cbe0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a869f1388c583d8e2467f096bd6e4702d95832f1f2039253d8b13ec48037e133c9fbbaf68dd44512b69edb4ca4901177757b9ceeedb52efb9aac9587e1ce0a72
|
7
|
+
data.tar.gz: cccb0968bcf9ce251a7ec81c8a8cbb04b5449d19c96dcf2dd58fd69c259b915f17c9e37d7ed926c229066329018b568d0221824f5ec60578eb61238a8172d4b5
|
data/README.md
CHANGED
@@ -1,19 +1,14 @@
|
|
1
|
-
Kabutops [](https://codeclimate.com/github/reneklacan/kabutops) [](https://codeclimate.com/github/reneklacan/kabutops)
|
2
|
-
========
|
1
|
+
# Kabutops [](https://codeclimate.com/github/reneklacan/kabutops) [](https://codeclimate.com/github/reneklacan/kabutops)
|
3
2
|
|
4
3
|
Kabutops is a ruby library whichs aims to simplify creating website crawlers.
|
5
4
|
You can define what will be crawled and how it will be saved in the short class definition.
|
6
5
|
|
7
|
-
With Kabutops you can save data
|
8
|
-
* ElasticSearch
|
9
|
-
* MongoDB
|
10
|
-
* Redis
|
6
|
+
With Kabutops you can easily save data to **ElasticSearch**.
|
11
7
|
|
12
8
|
Example for every kind of database are located
|
13
9
|
in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
|
14
10
|
|
15
|
-
Installation
|
16
|
-
------------
|
11
|
+
## Installation
|
17
12
|
|
18
13
|
You can install it via gem
|
19
14
|
|
@@ -29,8 +24,7 @@ gem 'kabutops', '~> 0.1.1'
|
|
29
24
|
|
30
25
|
You will also need Redis database installed and running.
|
31
26
|
|
32
|
-
Basic example
|
33
|
-
-------------
|
27
|
+
## Basic example
|
34
28
|
|
35
29
|
Example that will crawl information about gems that start on letter Q or
|
36
30
|
X and save them to the ElasticSearch.
|
@@ -129,8 +123,7 @@ Documents saved in the ElasticSearch will look like this one
|
|
129
123
|
}
|
130
124
|
```
|
131
125
|
|
132
|
-
Advanced
|
133
|
-
--------
|
126
|
+
## Advanced
|
134
127
|
|
135
128
|
```ruby
|
136
129
|
class SomeCrawler < Kabutops::Crawler
|
@@ -195,8 +188,7 @@ class SomeCrawler < Kabutops::Crawler
|
|
195
188
|
end
|
196
189
|
```
|
197
190
|
|
198
|
-
Debugging
|
199
|
-
---------
|
191
|
+
## Debugging
|
200
192
|
|
201
193
|
As we all know, crawler can't be written on the first time.
|
202
194
|
|
@@ -216,8 +208,7 @@ FruitCrawler.debug_resource { id: '123', url: '...' }
|
|
216
208
|
These methods will print out what would be otherwise saved to the
|
217
209
|
database but for this time there is no save to the database.
|
218
210
|
|
219
|
-
Staying up to date
|
220
|
-
------------------
|
211
|
+
## Staying up to date
|
221
212
|
|
222
213
|
Note: This feature is currently working only with ElasticSearch
|
223
214
|
|
@@ -248,8 +239,7 @@ GemUpdater.loop
|
|
248
239
|
ruby rubygems_updater.rb
|
249
240
|
```
|
250
241
|
|
251
|
-
Anonymity ala Tor
|
252
|
-
-----------------
|
242
|
+
## Anonymity ala Tor
|
253
243
|
|
254
244
|
Anonymity can be easily achieved with [Peasant](https://github.com/reneklacan/peasant) gem.
|
255
245
|
By following [this guide](https://github.com/reneklacan/peasant/wiki/How-to-use-Peasant-with-Tor-and-Privoxy-for-scraping)
|
@@ -266,8 +256,7 @@ class MyCrawler < Kabutops::Crawler
|
|
266
256
|
end
|
267
257
|
```
|
268
258
|
|
269
|
-
Javascript heavy site
|
270
|
-
---------------------
|
259
|
+
## Javascript heavy site
|
271
260
|
|
272
261
|
Crawling this kind of sites can be achieved by using non-default agent
|
273
262
|
(default is Mechanize.new).
|
@@ -283,13 +272,6 @@ end
|
|
283
272
|
[Bogeyman](https://github.com/reneklacan/bogeyman-ruby-client)
|
284
273
|
is wrapper build upon Phantomjs.
|
285
274
|
|
286
|
-
|
287
|
-
----
|
288
|
-
|
289
|
-
* Watchdog for Mongo
|
290
|
-
* skip_existing feature for Mongo, Redis
|
291
|
-
|
292
|
-
License
|
293
|
-
-------
|
275
|
+
## License
|
294
276
|
|
295
277
|
This library is distributed under the Beerware license.
|
data/lib/kabutops.rb
CHANGED
@@ -8,9 +8,6 @@ require 'moneta'
|
|
8
8
|
require 'pstore'
|
9
9
|
require 'mechanize'
|
10
10
|
require 'elasticsearch'
|
11
|
-
require 'redis'
|
12
|
-
require 'redis-namespace'
|
13
|
-
require 'mongo'
|
14
11
|
require 'logger'
|
15
12
|
|
16
13
|
require 'kabutops/configuration'
|
@@ -26,11 +23,7 @@ require 'kabutops/recipe_item'
|
|
26
23
|
require 'kabutops/adapters/base'
|
27
24
|
require 'kabutops/adapters/database_adapter'
|
28
25
|
require 'kabutops/adapters/elastic_search'
|
29
|
-
require 'kabutops/adapters/redis'
|
30
|
-
require 'kabutops/adapters/mongo'
|
31
26
|
require 'kabutops/crawler_extensions/elastic_search'
|
32
|
-
require 'kabutops/crawler_extensions/redis'
|
33
|
-
require 'kabutops/crawler_extensions/mongo'
|
34
27
|
require 'kabutops/crawler_extensions/pstore_storage'
|
35
28
|
require 'kabutops/crawler_extensions/debugging'
|
36
29
|
require 'kabutops/crawler'
|
@@ -5,8 +5,8 @@ module Kabutops
|
|
5
5
|
module Adapters
|
6
6
|
|
7
7
|
class Base
|
8
|
-
def initialize
|
9
|
-
|
8
|
+
def initialize
|
9
|
+
yield if block_given?
|
10
10
|
end
|
11
11
|
|
12
12
|
def enable_debug
|
@@ -14,7 +14,7 @@ module Kabutops
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def debug
|
17
|
-
|
17
|
+
!!@debug
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
@@ -18,25 +18,26 @@ module Kabutops
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def process resource, page
|
21
|
-
raise 'data block not defined' unless
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
if debug
|
28
|
-
logger.info("#{self.class.to_s} outputs:")
|
29
|
-
notify(:before_save, result) if save
|
30
|
-
logger.info(save ? result.to_hash : 'not valid for save')
|
31
|
-
notify(:after_save, result) if save
|
32
|
-
elsif save
|
33
|
-
notify(:before_save, result)
|
34
|
-
store(result)
|
35
|
-
notify(:after_save, result)
|
36
|
-
end
|
21
|
+
raise 'data block not defined' unless recipe
|
22
|
+
|
23
|
+
previous = find(resource)
|
24
|
+
|
25
|
+
[recipe.process(resource, page, previous)].flatten.each do |result|
|
26
|
+
process_one(resource, page, result)
|
37
27
|
end
|
38
28
|
end
|
39
29
|
|
30
|
+
def process_one resource, page, result
|
31
|
+
result.update(updated_at: Time.now.to_i)
|
32
|
+
save = (notify(:save_if, resource, page, result) || []).all?
|
33
|
+
|
34
|
+
logger.info("#{self.class.to_s} outputs:") if debug
|
35
|
+
notify(:before_save, result) if save
|
36
|
+
logger.info(save ? result.to_hash : 'not valid for save') if debug
|
37
|
+
store(result) if save && !debug
|
38
|
+
notify(:after_save, result) if save
|
39
|
+
end
|
40
|
+
|
40
41
|
def store result
|
41
42
|
raise NotImplementedError
|
42
43
|
end
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -7,8 +7,6 @@ module Kabutops
|
|
7
7
|
include CrawlerExtensions::Debugging
|
8
8
|
include CrawlerExtensions::PStoreStorage
|
9
9
|
include CrawlerExtensions::ElasticSearch
|
10
|
-
include CrawlerExtensions::Redis
|
11
|
-
include CrawlerExtensions::Mongo
|
12
10
|
include Sidekiq::Worker
|
13
11
|
|
14
12
|
class << self
|
@@ -89,8 +87,6 @@ module Kabutops
|
|
89
87
|
self.class << resource
|
90
88
|
end
|
91
89
|
|
92
|
-
protected
|
93
|
-
|
94
90
|
def params
|
95
91
|
self.class.params
|
96
92
|
end
|
@@ -103,7 +99,7 @@ module Kabutops
|
|
103
99
|
if e.response_code.to_i == 404
|
104
100
|
nil
|
105
101
|
else
|
106
|
-
|
102
|
+
logger.error(e.response_code)
|
107
103
|
raise
|
108
104
|
end
|
109
105
|
end
|
@@ -10,41 +10,45 @@ module Kabutops
|
|
10
10
|
class Manager
|
11
11
|
attr_reader :map, :allowed
|
12
12
|
|
13
|
-
def initialize allowed=
|
14
|
-
@allowed = allowed
|
13
|
+
def initialize allowed=[]
|
14
|
+
@allowed = allowed
|
15
15
|
@map ||= Hashie::Mash.new
|
16
16
|
end
|
17
17
|
|
18
18
|
def method_missing name, *args, &block
|
19
|
-
return super unless block_given? &&
|
19
|
+
return super unless block_given? && allowed.include?(name)
|
20
20
|
|
21
|
-
|
22
|
-
|
21
|
+
map[name] ||= []
|
22
|
+
map[name] << block
|
23
23
|
end
|
24
24
|
|
25
25
|
def notify name, *args
|
26
|
-
|
26
|
+
raise "Not registered as valid callback: #{name}" unless allowed.include?(name)
|
27
|
+
return unless map
|
27
28
|
|
28
|
-
(
|
29
|
+
(map[name] || []).map do |block|
|
29
30
|
block.call(*args)
|
30
31
|
end
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
34
35
|
def callbacks &block
|
35
|
-
|
36
|
-
@manager.instance_eval &block
|
36
|
+
manager.instance_eval(&block)
|
37
37
|
end
|
38
38
|
|
39
39
|
def notify name, *args
|
40
|
+
manager.notify(name, *args)
|
41
|
+
end
|
42
|
+
|
43
|
+
def manager
|
44
|
+
raise 'No callbacks allowed' unless respond_to?(:allowed_callbacks)
|
40
45
|
@manager ||= Manager.new(allowed_callbacks)
|
41
|
-
@manager.notify(name, *args)
|
42
46
|
end
|
43
47
|
|
44
48
|
module ClassMethods
|
45
49
|
def callbacks *args
|
46
50
|
define_method :allowed_callbacks do
|
47
|
-
args
|
51
|
+
args.flatten
|
48
52
|
end
|
49
53
|
end
|
50
54
|
end
|
data/lib/kabutops/recipe.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
module Kabutops
|
4
4
|
|
5
5
|
class Recipe
|
6
|
-
attr_reader :items
|
6
|
+
attr_reader :items, :params, :nested
|
7
7
|
|
8
8
|
def initialize params={}
|
9
9
|
@params = Hashie::Mash.new(params)
|
@@ -14,7 +14,7 @@ module Kabutops
|
|
14
14
|
def method_missing name, *args, &block
|
15
15
|
if block_given?
|
16
16
|
recipe = Recipe.new
|
17
|
-
recipe.instance_eval
|
17
|
+
recipe.instance_eval(&block)
|
18
18
|
@items[name] = RecipeItem.new(:recipe, recipe)
|
19
19
|
@nested = true
|
20
20
|
else
|
@@ -23,21 +23,21 @@ module Kabutops
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
def process resource, page
|
26
|
+
def process resource, page, previous
|
27
27
|
if @params[:each]
|
28
|
-
page.xpath(@params[:each]).map{ |n| process_one(resource, n) }
|
28
|
+
page.xpath(@params[:each]).map{ |n| process_one(resource, n, previous) }
|
29
29
|
elsif @params[:each_css]
|
30
|
-
page.css(@params[:each_css]).map{ |n| process_one(resource, n) }
|
30
|
+
page.css(@params[:each_css]).map{ |n| process_one(resource, n, previous) }
|
31
31
|
else
|
32
|
-
process_one(resource, page)
|
32
|
+
process_one(resource, page, previous)
|
33
33
|
end
|
34
34
|
end
|
35
35
|
|
36
|
-
def process_one resource, node
|
36
|
+
def process_one resource, node, previous
|
37
37
|
result = Hashie::Mash.new
|
38
38
|
|
39
|
-
|
40
|
-
result[name] = item.process(resource, node)
|
39
|
+
items.each do |name, item|
|
40
|
+
result[name] = item.process(resource, node, previous)
|
41
41
|
end
|
42
42
|
|
43
43
|
result
|
data/lib/kabutops/recipe_item.rb
CHANGED
@@ -3,36 +3,38 @@
|
|
3
3
|
module Kabutops
|
4
4
|
|
5
5
|
class RecipeItem
|
6
|
+
TYPES = [:var, :recipe, :css, :xpath, :lambda, :proc, :const, :static]
|
7
|
+
|
6
8
|
attr_reader :type, :value
|
7
9
|
|
8
10
|
def initialize type, value, convert_to=nil
|
9
|
-
|
11
|
+
raise "Unknown recipe item type '#{type}'" unless TYPES.include?(type.to_sym)
|
12
|
+
|
13
|
+
@type = type.to_sym
|
10
14
|
@value = value
|
11
15
|
@convert_to = convert_to
|
12
16
|
end
|
13
17
|
|
14
|
-
def process resource, page
|
15
|
-
convert(get(resource, page))
|
18
|
+
def process resource, page, previous
|
19
|
+
convert(get(resource, page, previous))
|
16
20
|
end
|
17
21
|
|
18
22
|
protected
|
19
23
|
|
20
|
-
def get resource, page
|
21
|
-
case
|
24
|
+
def get resource, page, previous
|
25
|
+
case type
|
22
26
|
when :var
|
23
|
-
resource[
|
27
|
+
resource[value]
|
24
28
|
when :recipe
|
25
|
-
|
29
|
+
value.process(resource, page, previous)
|
26
30
|
when :css
|
27
|
-
page.css(
|
31
|
+
page.css(value).text.gsub(/\u00a0/, ' ').strip
|
28
32
|
when :xpath
|
29
|
-
page.xpath(
|
33
|
+
page.xpath(value).text.gsub(/\u00a0/, ' ').strip
|
30
34
|
when :lambda, :proc
|
31
|
-
|
35
|
+
value.call(resource, page, previous)
|
32
36
|
when :const, :static
|
33
|
-
|
34
|
-
else
|
35
|
-
raise "unknown recipe item type '#{item.type}'"
|
37
|
+
value
|
36
38
|
end
|
37
39
|
end
|
38
40
|
|
@@ -43,6 +45,7 @@ module Kabutops
|
|
43
45
|
when nil then v
|
44
46
|
when :int then v[/\d+/].to_i
|
45
47
|
when :float then v.gsub(',', '.')[/\d+(\.\d+)?/].to_f
|
48
|
+
else raise "Unknown conversion type '#{@convert_to}'"
|
46
49
|
end
|
47
50
|
end
|
48
51
|
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-02-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 2.
|
19
|
+
version: '2.7'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 2.
|
26
|
+
version: '2.7'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: cachy
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -94,34 +94,6 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '3.0'
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: redis
|
99
|
-
requirement: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - "~>"
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '3.0'
|
104
|
-
type: :runtime
|
105
|
-
prerelease: false
|
106
|
-
version_requirements: !ruby/object:Gem::Requirement
|
107
|
-
requirements:
|
108
|
-
- - "~>"
|
109
|
-
- !ruby/object:Gem::Version
|
110
|
-
version: '3.0'
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: redis-namespace
|
113
|
-
requirement: !ruby/object:Gem::Requirement
|
114
|
-
requirements:
|
115
|
-
- - "~>"
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
version: '1.4'
|
118
|
-
type: :runtime
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
requirements:
|
122
|
-
- - "~>"
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: '1.4'
|
125
97
|
- !ruby/object:Gem::Dependency
|
126
98
|
name: json
|
127
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -137,47 +109,33 @@ dependencies:
|
|
137
109
|
- !ruby/object:Gem::Version
|
138
110
|
version: '1.8'
|
139
111
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
141
|
-
requirement: !ruby/object:Gem::Requirement
|
142
|
-
requirements:
|
143
|
-
- - "~>"
|
144
|
-
- !ruby/object:Gem::Version
|
145
|
-
version: '1.10'
|
146
|
-
type: :runtime
|
147
|
-
prerelease: false
|
148
|
-
version_requirements: !ruby/object:Gem::Requirement
|
149
|
-
requirements:
|
150
|
-
- - "~>"
|
151
|
-
- !ruby/object:Gem::Version
|
152
|
-
version: '1.10'
|
153
|
-
- !ruby/object:Gem::Dependency
|
154
|
-
name: bson_ext
|
112
|
+
name: rspec
|
155
113
|
requirement: !ruby/object:Gem::Requirement
|
156
114
|
requirements:
|
157
115
|
- - "~>"
|
158
116
|
- !ruby/object:Gem::Version
|
159
|
-
version: '
|
160
|
-
type: :
|
117
|
+
version: '3.0'
|
118
|
+
type: :development
|
161
119
|
prerelease: false
|
162
120
|
version_requirements: !ruby/object:Gem::Requirement
|
163
121
|
requirements:
|
164
122
|
- - "~>"
|
165
123
|
- !ruby/object:Gem::Version
|
166
|
-
version: '
|
124
|
+
version: '3.0'
|
167
125
|
- !ruby/object:Gem::Dependency
|
168
|
-
name:
|
126
|
+
name: pry-byebug
|
169
127
|
requirement: !ruby/object:Gem::Requirement
|
170
128
|
requirements:
|
171
129
|
- - "~>"
|
172
130
|
- !ruby/object:Gem::Version
|
173
|
-
version: '
|
131
|
+
version: '2.0'
|
174
132
|
type: :development
|
175
133
|
prerelease: false
|
176
134
|
version_requirements: !ruby/object:Gem::Requirement
|
177
135
|
requirements:
|
178
136
|
- - "~>"
|
179
137
|
- !ruby/object:Gem::Version
|
180
|
-
version: '
|
138
|
+
version: '2.0'
|
181
139
|
- !ruby/object:Gem::Dependency
|
182
140
|
name: rspec-mocks
|
183
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -205,15 +163,11 @@ files:
|
|
205
163
|
- lib/kabutops/adapters/base.rb
|
206
164
|
- lib/kabutops/adapters/database_adapter.rb
|
207
165
|
- lib/kabutops/adapters/elastic_search.rb
|
208
|
-
- lib/kabutops/adapters/mongo.rb
|
209
|
-
- lib/kabutops/adapters/redis.rb
|
210
166
|
- lib/kabutops/configuration.rb
|
211
167
|
- lib/kabutops/crawler.rb
|
212
168
|
- lib/kabutops/crawler_extensions/debugging.rb
|
213
169
|
- lib/kabutops/crawler_extensions/elastic_search.rb
|
214
|
-
- lib/kabutops/crawler_extensions/mongo.rb
|
215
170
|
- lib/kabutops/crawler_extensions/pstore_storage.rb
|
216
|
-
- lib/kabutops/crawler_extensions/redis.rb
|
217
171
|
- lib/kabutops/extensions/callback_support.rb
|
218
172
|
- lib/kabutops/extensions/includable.rb
|
219
173
|
- lib/kabutops/extensions/logging.rb
|
@@ -243,10 +197,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
243
197
|
version: '0'
|
244
198
|
requirements: []
|
245
199
|
rubyforge_project:
|
246
|
-
rubygems_version: 2.
|
200
|
+
rubygems_version: 2.4.5
|
247
201
|
signing_key:
|
248
202
|
specification_version: 4
|
249
203
|
summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
|
250
204
|
for an anonymity.
|
251
205
|
test_files: []
|
252
|
-
has_rdoc:
|
@@ -1,49 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
module Adapters
|
6
|
-
|
7
|
-
class Mongo < DatabaseAdapter
|
8
|
-
include Extensions::Parameterable
|
9
|
-
|
10
|
-
params :host, :port, :db, :collection, :user, :password
|
11
|
-
|
12
|
-
def store result
|
13
|
-
existing = collection.find('id' => result[:id])
|
14
|
-
|
15
|
-
if existing.count > 0
|
16
|
-
existing.each do |document|
|
17
|
-
collection.update({'_id' => document['_id']}, result.to_hash)
|
18
|
-
end
|
19
|
-
else
|
20
|
-
collection.insert(result.to_hash)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
protected
|
25
|
-
|
26
|
-
def client
|
27
|
-
@@client ||= ::Mongo::MongoClient.new(
|
28
|
-
params[:host] || 'localhost',
|
29
|
-
params[:port] || 27017,
|
30
|
-
)
|
31
|
-
end
|
32
|
-
|
33
|
-
def client_db
|
34
|
-
@@client_db ||= client.db(params[:db].to_s || 'kabutops')
|
35
|
-
if params[:user] && params[:password]
|
36
|
-
ok = @@client.authenticate(params[:user], params[:password])
|
37
|
-
raise 'mongo authentication failed' unless ok
|
38
|
-
end
|
39
|
-
@@client_db
|
40
|
-
end
|
41
|
-
|
42
|
-
def collection
|
43
|
-
@@collection ||= client_db.collection(params[:collection] || 'kabutops')
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
end
|
48
|
-
|
49
|
-
end
|
@@ -1,37 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
module Adapters
|
6
|
-
|
7
|
-
class Redis < DatabaseAdapter
|
8
|
-
include Extensions::Parameterable
|
9
|
-
|
10
|
-
params :host, :port, :namespace, :db, :password
|
11
|
-
|
12
|
-
def store result
|
13
|
-
client[result[:id]] = JSON.dump(result.to_hash)
|
14
|
-
end
|
15
|
-
|
16
|
-
def find resource
|
17
|
-
client[resource[:id] || resource[:url]]
|
18
|
-
end
|
19
|
-
|
20
|
-
protected
|
21
|
-
|
22
|
-
def client
|
23
|
-
@@client ||= ::Redis::Namespace.new(
|
24
|
-
params[:namespace] || 'kabutops',
|
25
|
-
redis: ::Redis.new(
|
26
|
-
host: params[:host],
|
27
|
-
port: params[:port],
|
28
|
-
db: params[:db],
|
29
|
-
password: params[:password],
|
30
|
-
)
|
31
|
-
)
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
module CrawlerExtensions
|
6
|
-
|
7
|
-
module Mongo
|
8
|
-
extend Extensions::Includable
|
9
|
-
|
10
|
-
module ClassMethods
|
11
|
-
def mongo &block
|
12
|
-
adapters << Adapters::Mongo.new(&block)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module Kabutops
|
4
|
-
|
5
|
-
module CrawlerExtensions
|
6
|
-
|
7
|
-
module Redis
|
8
|
-
extend Extensions::Includable
|
9
|
-
|
10
|
-
module ClassMethods
|
11
|
-
def redis &block
|
12
|
-
adapters << Adapters::Redis.new(&block)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
end
|