kabutops 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +67 -37
- data/lib/kabutops.rb +3 -5
- data/lib/kabutops/adapters/database_adapter.rb +3 -0
- data/lib/kabutops/adapters/elastic_search.rb +1 -1
- data/lib/kabutops/crawler.rb +32 -12
- data/lib/kabutops/crawler_extensions/debugging.rb +4 -0
- data/lib/kabutops/extensions/callback_support.rb +38 -0
- data/lib/kabutops/extensions/parameterable.rb +37 -0
- data/lib/kabutops/version.rb +1 -1
- metadata +3 -6
- data/lib/kabutops/adapters/callback.rb +0 -15
- data/lib/kabutops/adapters/mysql.rb +0 -17
- data/lib/kabutops/crawler_extensions/callback.rb +0 -24
- data/lib/kabutops/crawler_extensions/mysql.rb +0 -21
- data/lib/kabutops/parameterable.rb +0 -33
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 29f7dfcb4dbed7228907cc636a82c97005144881
|
4
|
+
data.tar.gz: 9d18c9e2c070289695e1bf7f0c7f5ca773263c22
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9f1961c5a5b72e3f23309f995865281875a96111f05eaa57343f669e1511c89957914e22d6d58b4919610d00a987ac087b040c7e086d44f27abdb3bc8a5fe61e
|
7
|
+
data.tar.gz: c3626b5fbe52eac58b8b88b637bcce0181267718bc580ab17a8f7aaf1931df95fb729646f0539c0a2da044ed4a40708e87eb58307b6e34f5ba19d592f3d9faf7
|
data/README.md
CHANGED
@@ -13,76 +13,105 @@ gem install kabutops
|
|
13
13
|
Or you can put it in your Gemfile
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem 'kabutops', '~> 0.0.2'
|
16
|
+
gem 'kabutops', '~> 0.0.3'
|
17
17
|
```
|
18
18
|
|
19
19
|
Basic example
|
20
20
|
-------------
|
21
21
|
|
22
|
-
|
22
|
+
Example that will crawl information about gems that start on letter Q or
|
23
|
+
X and save them to the ElasticSearch.
|
23
24
|
|
24
25
|
```ruby
|
25
26
|
require 'kabutops'
|
26
27
|
|
27
|
-
class
|
28
|
-
|
29
|
-
|
30
|
-
collection (1..5).map { |id|
|
28
|
+
class GemListCrawler < Kabutops::Crawler
|
29
|
+
# just two letters with the smallest amount of gems
|
30
|
+
collection ['Q', 'X'].map{ |letter|
|
31
31
|
{
|
32
|
-
|
33
|
-
url: "https://
|
32
|
+
letter: letter,
|
33
|
+
url: "https://rubygems.org/gems?letter=#{letter}"
|
34
34
|
}
|
35
|
-
}
|
35
|
+
}
|
36
36
|
|
37
|
-
proxy '127.0.0.1', 81818
|
38
37
|
cache true
|
38
|
+
wait 2 # wait two seconds after each procession (we do not want to hurt rubygems)
|
39
|
+
|
40
|
+
callbacks do
|
41
|
+
after_crawl do |resource, page|
|
42
|
+
links = page.xpath("//a[contains(@href, '/gems?letter=#{resource[:letter]}')]")
|
43
|
+
links.each do |link|
|
44
|
+
self << {
|
45
|
+
letter: resource[:letter],
|
46
|
+
url: "https://rubygems.org#{link['href']}",
|
47
|
+
}
|
48
|
+
end
|
49
|
+
|
50
|
+
links = page.xpath("//a[contains(@href, '/gems/')]")
|
51
|
+
links.each do |link|
|
52
|
+
GemCrawler << {
|
53
|
+
letter: resource[:letter],
|
54
|
+
url: "https://rubygems.org#{link['href']}",
|
55
|
+
}
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
class GemCrawler < Kabutops::Crawler
|
62
|
+
cache true
|
63
|
+
wait 2 # wait two seconds after each procession (we do not want to hurt rubygems)
|
39
64
|
|
40
65
|
elasticsearch do
|
41
|
-
index :books
|
42
|
-
type :book
|
66
|
+
index :gems
|
67
|
+
type :gem
|
43
68
|
|
44
69
|
data do
|
45
|
-
id :
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
70
|
+
id :css, '.title > h2 > a'
|
71
|
+
title :css, '.title > h2 > a'
|
72
|
+
authors :css, '.authors > p'
|
73
|
+
description :css, '#markup > p'
|
74
|
+
|
75
|
+
downloads do
|
76
|
+
total :lambda, ->(resource, page) {
|
77
|
+
page.css('.downloads.counter > span > strong')[0].text.gsub(',', '').to_i
|
78
|
+
}
|
79
|
+
|
80
|
+
current_version :lambda, ->(resource, page) {
|
81
|
+
page.css('.downloads.counter > span > strong')[1].text.gsub(',', '').to_i
|
82
|
+
}
|
55
83
|
end
|
56
84
|
end
|
57
|
-
end
|
58
85
|
|
59
|
-
|
86
|
+
callbacks do
|
87
|
+
after_save do |hash|
|
88
|
+
puts "#{hash[:title]} saved!"
|
89
|
+
end
|
90
|
+
end
|
60
91
|
end
|
61
92
|
end
|
62
93
|
|
63
|
-
|
94
|
+
GemListCrawler.crawl!
|
95
|
+
GemCrawler.crawl!
|
64
96
|
```
|
65
97
|
|
66
98
|
Run it via sidekiq
|
67
99
|
|
68
100
|
```bash
|
69
|
-
bundle exec sidekiq -r ./
|
101
|
+
bundle exec sidekiq -r ./rubygems_crawler.rb -c 1
|
70
102
|
```
|
71
103
|
|
72
|
-
|
73
|
-
stored to the ElasticSearch index named books as a book document.
|
74
|
-
|
75
|
-
One document will look something like this
|
104
|
+
Documents saved in the ElasticSearch will look like this one
|
76
105
|
|
77
106
|
```json
|
78
107
|
{
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
108
|
+
"id": "qiita_mail",
|
109
|
+
"title": "qiita_mail",
|
110
|
+
"authors": "ongaeshi",
|
111
|
+
"description":" Write a gem description",
|
112
|
+
"downloads": {
|
113
|
+
"total": 2493,
|
114
|
+
"current_version": 580
|
86
115
|
}
|
87
116
|
}
|
88
117
|
```
|
@@ -101,6 +130,7 @@ FruitCrawler.debug_random # will take random one
|
|
101
130
|
FruitCrawler.debug_random 3 # will take 3 random resources
|
102
131
|
FruitCrawler.debug_last # will take last from collection
|
103
132
|
FruitCrawler.debug_last 5 # will take last 5 resources
|
133
|
+
FruitCrawler.debug_all # guess what it will do
|
104
134
|
FruitCrawler.debug_resource { id: '123', url: '...' }
|
105
135
|
```
|
106
136
|
|
data/lib/kabutops.rb
CHANGED
@@ -3,21 +3,19 @@ require 'sidekiq'
|
|
3
3
|
require 'cachy'
|
4
4
|
require 'moneta'
|
5
5
|
require 'pstore'
|
6
|
+
require 'mechanize'
|
6
7
|
require 'elasticsearch'
|
7
8
|
|
8
9
|
Cachy.cache_store = Moneta.new(:File, dir: 'cache') # temporary
|
9
10
|
|
10
|
-
require 'kabutops/parameterable'
|
11
|
+
require 'kabutops/extensions/parameterable'
|
12
|
+
require 'kabutops/extensions/callback_support'
|
11
13
|
require 'kabutops/recipe'
|
12
14
|
require 'kabutops/recipe_item'
|
13
15
|
require 'kabutops/adapters/base'
|
14
|
-
require 'kabutops/adapters/callback'
|
15
16
|
require 'kabutops/adapters/database_adapter'
|
16
17
|
require 'kabutops/adapters/elastic_search'
|
17
|
-
require 'kabutops/adapters/mysql'
|
18
|
-
require 'kabutops/crawler_extensions/callback'
|
19
18
|
require 'kabutops/crawler_extensions/elastic_search'
|
20
|
-
require 'kabutops/crawler_extensions/mysql'
|
21
19
|
require 'kabutops/crawler_extensions/pstore_storage'
|
22
20
|
require 'kabutops/crawler_extensions/debugging'
|
23
21
|
require 'kabutops/crawler'
|
@@ -3,6 +3,8 @@ module Kabutops
|
|
3
3
|
module Adapters
|
4
4
|
|
5
5
|
class DatabaseAdapter < Base
|
6
|
+
include Extensions::CallbackSupport
|
7
|
+
|
6
8
|
def data &block
|
7
9
|
@recipe = Recipe.new
|
8
10
|
@recipe.instance_eval &block
|
@@ -15,6 +17,7 @@ module Kabutops
|
|
15
17
|
p result.to_hash
|
16
18
|
else
|
17
19
|
store(result)
|
20
|
+
notify(:after_save, result)
|
18
21
|
end
|
19
22
|
end
|
20
23
|
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -3,30 +3,46 @@ module Kabutops
|
|
3
3
|
class Crawler
|
4
4
|
include CrawlerExtensions::Debugging
|
5
5
|
include CrawlerExtensions::PStoreStorage
|
6
|
-
include CrawlerExtensions::Callback
|
7
6
|
include CrawlerExtensions::ElasticSearch
|
7
|
+
include Sidekiq::Worker
|
8
8
|
|
9
9
|
class << self
|
10
|
-
include Parameterable
|
10
|
+
include Extensions::Parameterable
|
11
|
+
include Extensions::CallbackSupport
|
11
12
|
|
12
|
-
params :collection, :proxy, :cache
|
13
|
+
params :collection, :proxy, :cache, :wait
|
13
14
|
|
14
15
|
def adapters
|
15
|
-
@adapters
|
16
|
+
@adapters || []
|
16
17
|
end
|
17
18
|
|
18
19
|
def crawl! collection=nil
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
@map ||= {}
|
21
|
+
|
22
|
+
if storage[:status].nil?
|
23
|
+
(collection || params[:collection] || []).each do |resource|
|
24
|
+
self << resource
|
24
25
|
end
|
25
26
|
end
|
26
27
|
end
|
27
28
|
|
28
29
|
def << resource
|
29
|
-
|
30
|
+
if debug
|
31
|
+
params[:collection] ||= []
|
32
|
+
params[:collection] << resource
|
33
|
+
return
|
34
|
+
end
|
35
|
+
|
36
|
+
key = resource[:id] || resource[:url]
|
37
|
+
|
38
|
+
if key.nil?
|
39
|
+
raise "url must be specified for resource"
|
40
|
+
elsif @map[key]
|
41
|
+
# resource with an id already in map
|
42
|
+
else
|
43
|
+
perform_async(resource.to_hash)
|
44
|
+
@map[key] = resource
|
45
|
+
end
|
30
46
|
end
|
31
47
|
end
|
32
48
|
|
@@ -35,19 +51,23 @@ module Kabutops
|
|
35
51
|
|
36
52
|
content = Cachy.cache_if(self.class.params.cache, resource[:url]) do
|
37
53
|
agent = Mechanize.new
|
38
|
-
|
54
|
+
agent.set_proxy(*self.class.params[:proxy]) if self.class.params[:proxy]
|
39
55
|
agent.get(resource[:url]).body
|
40
56
|
end
|
41
57
|
|
42
58
|
page = Nokogiri::HTML(content)
|
43
59
|
|
60
|
+
self.class.notify(:after_crawl, resource, page)
|
61
|
+
|
44
62
|
self.class.adapters.each do |adapter|
|
45
63
|
adapter.process(resource, page)
|
46
64
|
end
|
65
|
+
|
66
|
+
sleep self.class.params[:wait] || 0
|
47
67
|
end
|
48
68
|
|
49
69
|
def << resource
|
50
|
-
self.class
|
70
|
+
self.class << resource
|
51
71
|
end
|
52
72
|
end
|
53
73
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Kabutops
|
2
|
+
|
3
|
+
module Extensions
|
4
|
+
|
5
|
+
module CallbackSupport
|
6
|
+
|
7
|
+
class Manager
|
8
|
+
def method_missing name, *args, &block
|
9
|
+
return unless block_given?
|
10
|
+
|
11
|
+
@map ||= Hashie::Mash.new
|
12
|
+
@map[name] ||= []
|
13
|
+
@map[name] << block
|
14
|
+
end
|
15
|
+
|
16
|
+
def notify name, *args
|
17
|
+
return unless @map
|
18
|
+
|
19
|
+
(@map[name] || []).map do |block|
|
20
|
+
block.call(*args)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def callbacks &block
|
26
|
+
@manager ||= Manager.new
|
27
|
+
@manager.instance_eval &block
|
28
|
+
end
|
29
|
+
|
30
|
+
def notify name, *args
|
31
|
+
@manager ||= Manager.new
|
32
|
+
@manager.notify(name, *args)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Kabutops
|
2
|
+
|
3
|
+
module Extensions
|
4
|
+
|
5
|
+
module Parameterable
|
6
|
+
|
7
|
+
def self.included base
|
8
|
+
base.extend(ClassMethods)
|
9
|
+
base.class_eval do
|
10
|
+
attr_reader :params
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
module ClassMethods
|
15
|
+
|
16
|
+
def params *list
|
17
|
+
return @params if list.empty?
|
18
|
+
|
19
|
+
list.each do |name|
|
20
|
+
define_method name do |*args|
|
21
|
+
@params ||= Hashie::Mash.new
|
22
|
+
if args.size == 1
|
23
|
+
@params[name] = args[0]
|
24
|
+
else
|
25
|
+
@params[name] = args
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
@@ -132,17 +132,14 @@ files:
|
|
132
132
|
- README.md
|
133
133
|
- lib/kabutops.rb
|
134
134
|
- lib/kabutops/adapters/base.rb
|
135
|
-
- lib/kabutops/adapters/callback.rb
|
136
135
|
- lib/kabutops/adapters/database_adapter.rb
|
137
136
|
- lib/kabutops/adapters/elastic_search.rb
|
138
|
-
- lib/kabutops/adapters/mysql.rb
|
139
137
|
- lib/kabutops/crawler.rb
|
140
|
-
- lib/kabutops/crawler_extensions/callback.rb
|
141
138
|
- lib/kabutops/crawler_extensions/debugging.rb
|
142
139
|
- lib/kabutops/crawler_extensions/elastic_search.rb
|
143
|
-
- lib/kabutops/crawler_extensions/mysql.rb
|
144
140
|
- lib/kabutops/crawler_extensions/pstore_storage.rb
|
145
|
-
- lib/kabutops/parameterable.rb
|
141
|
+
- lib/kabutops/extensions/callback_support.rb
|
142
|
+
- lib/kabutops/extensions/parameterable.rb
|
146
143
|
- lib/kabutops/recipe.rb
|
147
144
|
- lib/kabutops/recipe_item.rb
|
148
145
|
- lib/kabutops/version.rb
|
@@ -1,24 +0,0 @@
|
|
1
|
-
module Kabutops
|
2
|
-
|
3
|
-
module CrawlerExtensions
|
4
|
-
|
5
|
-
module Callback
|
6
|
-
|
7
|
-
def self.included base
|
8
|
-
base.extend(ClassMethods)
|
9
|
-
end
|
10
|
-
|
11
|
-
module ClassMethods
|
12
|
-
def callback &block
|
13
|
-
adapter = Adapters::Callback.new(block)
|
14
|
-
|
15
|
-
@adapters ||= []
|
16
|
-
@adapters << adapter
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
end
|
23
|
-
|
24
|
-
end
|
@@ -1,33 +0,0 @@
|
|
1
|
-
module Kabutops
|
2
|
-
|
3
|
-
module Parameterable
|
4
|
-
|
5
|
-
def self.included base
|
6
|
-
base.extend(ClassMethods)
|
7
|
-
base.class_eval do
|
8
|
-
attr_reader :params
|
9
|
-
end
|
10
|
-
end
|
11
|
-
|
12
|
-
module ClassMethods
|
13
|
-
|
14
|
-
def params *list
|
15
|
-
return @params if list.empty?
|
16
|
-
|
17
|
-
list.each do |name|
|
18
|
-
define_method name do |*args|
|
19
|
-
@params ||= Hashie::Mash.new
|
20
|
-
if args.size == 1
|
21
|
-
@params[name] = args[0]
|
22
|
-
else
|
23
|
-
@params[name] = args
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
end
|