kabutops 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +67 -37
- data/lib/kabutops.rb +3 -5
- data/lib/kabutops/adapters/database_adapter.rb +3 -0
- data/lib/kabutops/adapters/elastic_search.rb +1 -1
- data/lib/kabutops/crawler.rb +32 -12
- data/lib/kabutops/crawler_extensions/debugging.rb +4 -0
- data/lib/kabutops/extensions/callback_support.rb +38 -0
- data/lib/kabutops/extensions/parameterable.rb +37 -0
- data/lib/kabutops/version.rb +1 -1
- metadata +3 -6
- data/lib/kabutops/adapters/callback.rb +0 -15
- data/lib/kabutops/adapters/mysql.rb +0 -17
- data/lib/kabutops/crawler_extensions/callback.rb +0 -24
- data/lib/kabutops/crawler_extensions/mysql.rb +0 -21
- data/lib/kabutops/parameterable.rb +0 -33
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 29f7dfcb4dbed7228907cc636a82c97005144881
|
4
|
+
data.tar.gz: 9d18c9e2c070289695e1bf7f0c7f5ca773263c22
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9f1961c5a5b72e3f23309f995865281875a96111f05eaa57343f669e1511c89957914e22d6d58b4919610d00a987ac087b040c7e086d44f27abdb3bc8a5fe61e
|
7
|
+
data.tar.gz: c3626b5fbe52eac58b8b88b637bcce0181267718bc580ab17a8f7aaf1931df95fb729646f0539c0a2da044ed4a40708e87eb58307b6e34f5ba19d592f3d9faf7
|
data/README.md
CHANGED
@@ -13,76 +13,105 @@ gem install kabutops
|
|
13
13
|
Or you can put it in your Gemfile
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem 'kabutops', '~> 0.0.
|
16
|
+
gem 'kabutops', '~> 0.0.3'
|
17
17
|
```
|
18
18
|
|
19
19
|
Basic example
|
20
20
|
-------------
|
21
21
|
|
22
|
-
|
22
|
+
Example that will crawl information about gems whose names start with the letter Q or
|
23
|
+
X and save them to ElasticSearch.
|
23
24
|
|
24
25
|
```ruby
|
25
26
|
require 'kabutops'
|
26
27
|
|
27
|
-
class
|
28
|
-
|
29
|
-
|
30
|
-
collection (1..5).map { |id|
|
28
|
+
class GemListCrawler < Kabutops::Crawler
|
29
|
+
# just two letters with the smallest amount of gems
|
30
|
+
collection ['Q', 'X'].map{ |letter|
|
31
31
|
{
|
32
|
-
|
33
|
-
url: "https://
|
32
|
+
letter: letter,
|
33
|
+
url: "https://rubygems.org/gems?letter=#{letter}"
|
34
34
|
}
|
35
|
-
}
|
35
|
+
}
|
36
36
|
|
37
|
-
proxy '127.0.0.1', 81818
|
38
37
|
cache true
|
38
|
+
wait 2 # wait two seconds after each procession (we do not want to hurt rubygems)
|
39
|
+
|
40
|
+
callbacks do
|
41
|
+
after_crawl do |resource, page|
|
42
|
+
links = page.xpath("//a[contains(@href, '/gems?letter=#{resource[:letter]}')]")
|
43
|
+
links.each do |link|
|
44
|
+
self << {
|
45
|
+
letter: resource[:letter],
|
46
|
+
url: "https://rubygems.org#{link['href']}",
|
47
|
+
}
|
48
|
+
end
|
49
|
+
|
50
|
+
links = page.xpath("//a[contains(@href, '/gems/')]")
|
51
|
+
links.each do |link|
|
52
|
+
GemCrawler << {
|
53
|
+
letter: resource[:letter],
|
54
|
+
url: "https://rubygems.org#{link['href']}",
|
55
|
+
}
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
class GemCrawler < Kabutops::Crawler
|
62
|
+
cache true
|
63
|
+
wait 2 # wait two seconds after each procession (we do not want to hurt rubygems)
|
39
64
|
|
40
65
|
elasticsearch do
|
41
|
-
index :
|
42
|
-
type :
|
66
|
+
index :gems
|
67
|
+
type :gem
|
43
68
|
|
44
69
|
data do
|
45
|
-
id :
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
70
|
+
id :css, '.title > h2 > a'
|
71
|
+
title :css, '.title > h2 > a'
|
72
|
+
authors :css, '.authors > p'
|
73
|
+
description :css, '#markup > p'
|
74
|
+
|
75
|
+
downloads do
|
76
|
+
total :lambda, ->(resource, page) {
|
77
|
+
page.css('.downloads.counter > span > strong')[0].text.gsub(',', '').to_i
|
78
|
+
}
|
79
|
+
|
80
|
+
current_version :lambda, ->(resource, page) {
|
81
|
+
page.css('.downloads.counter > span > strong')[1].text.gsub(',', '').to_i
|
82
|
+
}
|
55
83
|
end
|
56
84
|
end
|
57
|
-
end
|
58
85
|
|
59
|
-
|
86
|
+
callbacks do
|
87
|
+
after_save do |hash|
|
88
|
+
puts "#{hash[:title]} saved!"
|
89
|
+
end
|
90
|
+
end
|
60
91
|
end
|
61
92
|
end
|
62
93
|
|
63
|
-
|
94
|
+
GemListCrawler.crawl!
|
95
|
+
GemCrawler.crawl!
|
64
96
|
```
|
65
97
|
|
66
98
|
Run it via sidekiq
|
67
99
|
|
68
100
|
```bash
|
69
|
-
bundle exec sidekiq -r ./
|
101
|
+
bundle exec sidekiq -r ./rubygems_crawler.rb -c 1
|
70
102
|
```
|
71
103
|
|
72
|
-
|
73
|
-
stored to the ElasticSearch index named books as a book document.
|
74
|
-
|
75
|
-
One document will look something like this
|
104
|
+
Documents saved in ElasticSearch will look like this:
|
76
105
|
|
77
106
|
```json
|
78
107
|
{
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
108
|
+
"id": "qiita_mail",
|
109
|
+
"title": "qiita_mail",
|
110
|
+
"authors": "ongaeshi",
|
111
|
+
"description":" Write a gem description",
|
112
|
+
"downloads": {
|
113
|
+
"total": 2493,
|
114
|
+
"current_version": 580
|
86
115
|
}
|
87
116
|
}
|
88
117
|
```
|
@@ -101,6 +130,7 @@ FruitCrawler.debug_random # will take random one
|
|
101
130
|
FruitCrawler.debug_random 3 # will take 3 random resources
|
102
131
|
FruitCrawler.debug_last # will take last from collection
|
103
132
|
FruitCrawler.debug_last 5 # will take last 5 resources
|
133
|
+
FruitCrawler.debug_all # guess what it will do
|
104
134
|
FruitCrawler.debug_resource { id: '123', url: '...' }
|
105
135
|
```
|
106
136
|
|
data/lib/kabutops.rb
CHANGED
@@ -3,21 +3,19 @@ require 'sidekiq'
|
|
3
3
|
require 'cachy'
|
4
4
|
require 'moneta'
|
5
5
|
require 'pstore'
|
6
|
+
require 'mechanize'
|
6
7
|
require 'elasticsearch'
|
7
8
|
|
8
9
|
Cachy.cache_store = Moneta.new(:File, dir: 'cache') # temporary
|
9
10
|
|
10
|
-
require 'kabutops/parameterable'
|
11
|
+
require 'kabutops/extensions/parameterable'
|
12
|
+
require 'kabutops/extensions/callback_support'
|
11
13
|
require 'kabutops/recipe'
|
12
14
|
require 'kabutops/recipe_item'
|
13
15
|
require 'kabutops/adapters/base'
|
14
|
-
require 'kabutops/adapters/callback'
|
15
16
|
require 'kabutops/adapters/database_adapter'
|
16
17
|
require 'kabutops/adapters/elastic_search'
|
17
|
-
require 'kabutops/adapters/mysql'
|
18
|
-
require 'kabutops/crawler_extensions/callback'
|
19
18
|
require 'kabutops/crawler_extensions/elastic_search'
|
20
|
-
require 'kabutops/crawler_extensions/mysql'
|
21
19
|
require 'kabutops/crawler_extensions/pstore_storage'
|
22
20
|
require 'kabutops/crawler_extensions/debugging'
|
23
21
|
require 'kabutops/crawler'
|
@@ -3,6 +3,8 @@ module Kabutops
|
|
3
3
|
module Adapters
|
4
4
|
|
5
5
|
class DatabaseAdapter < Base
|
6
|
+
include Extensions::CallbackSupport
|
7
|
+
|
6
8
|
def data &block
|
7
9
|
@recipe = Recipe.new
|
8
10
|
@recipe.instance_eval &block
|
@@ -15,6 +17,7 @@ module Kabutops
|
|
15
17
|
p result.to_hash
|
16
18
|
else
|
17
19
|
store(result)
|
20
|
+
notify(:after_save, result)
|
18
21
|
end
|
19
22
|
end
|
20
23
|
|
data/lib/kabutops/crawler.rb
CHANGED
@@ -3,30 +3,46 @@ module Kabutops
|
|
3
3
|
class Crawler
|
4
4
|
include CrawlerExtensions::Debugging
|
5
5
|
include CrawlerExtensions::PStoreStorage
|
6
|
-
include CrawlerExtensions::Callback
|
7
6
|
include CrawlerExtensions::ElasticSearch
|
7
|
+
include Sidekiq::Worker
|
8
8
|
|
9
9
|
class << self
|
10
|
-
include Parameterable
|
10
|
+
include Extensions::Parameterable
|
11
|
+
include Extensions::CallbackSupport
|
11
12
|
|
12
|
-
params :collection, :proxy, :cache
|
13
|
+
params :collection, :proxy, :cache, :wait
|
13
14
|
|
14
15
|
def adapters
|
15
|
-
@adapters
|
16
|
+
@adapters || []
|
16
17
|
end
|
17
18
|
|
18
19
|
def crawl! collection=nil
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
@map ||= {}
|
21
|
+
|
22
|
+
if storage[:status].nil?
|
23
|
+
(collection || params[:collection] || []).each do |resource|
|
24
|
+
self << resource
|
24
25
|
end
|
25
26
|
end
|
26
27
|
end
|
27
28
|
|
28
29
|
def << resource
|
29
|
-
|
30
|
+
if debug
|
31
|
+
params[:collection] ||= []
|
32
|
+
params[:collection] << resource
|
33
|
+
return
|
34
|
+
end
|
35
|
+
|
36
|
+
key = resource[:id] || resource[:url]
|
37
|
+
|
38
|
+
if key.nil?
|
39
|
+
raise "url must be specified for resource"
|
40
|
+
elsif @map[key]
|
41
|
+
# resource with an id already in map
|
42
|
+
else
|
43
|
+
perform_async(resource.to_hash)
|
44
|
+
@map[key] = resource
|
45
|
+
end
|
30
46
|
end
|
31
47
|
end
|
32
48
|
|
@@ -35,19 +51,23 @@ module Kabutops
|
|
35
51
|
|
36
52
|
content = Cachy.cache_if(self.class.params.cache, resource[:url]) do
|
37
53
|
agent = Mechanize.new
|
38
|
-
|
54
|
+
agent.set_proxy(*self.class.params[:proxy]) if self.class.params[:proxy]
|
39
55
|
agent.get(resource[:url]).body
|
40
56
|
end
|
41
57
|
|
42
58
|
page = Nokogiri::HTML(content)
|
43
59
|
|
60
|
+
self.class.notify(:after_crawl, resource, page)
|
61
|
+
|
44
62
|
self.class.adapters.each do |adapter|
|
45
63
|
adapter.process(resource, page)
|
46
64
|
end
|
65
|
+
|
66
|
+
sleep self.class.params[:wait] || 0
|
47
67
|
end
|
48
68
|
|
49
69
|
def << resource
|
50
|
-
self.class
|
70
|
+
self.class << resource
|
51
71
|
end
|
52
72
|
end
|
53
73
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Kabutops
|
2
|
+
|
3
|
+
module Extensions
|
4
|
+
|
5
|
+
module CallbackSupport
|
6
|
+
|
7
|
+
class Manager
|
8
|
+
def method_missing name, *args, &block
|
9
|
+
return unless block_given?
|
10
|
+
|
11
|
+
@map ||= Hashie::Mash.new
|
12
|
+
@map[name] ||= []
|
13
|
+
@map[name] << block
|
14
|
+
end
|
15
|
+
|
16
|
+
def notify name, *args
|
17
|
+
return unless @map
|
18
|
+
|
19
|
+
(@map[name] || []).map do |block|
|
20
|
+
block.call(*args)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def callbacks &block
|
26
|
+
@manager ||= Manager.new
|
27
|
+
@manager.instance_eval &block
|
28
|
+
end
|
29
|
+
|
30
|
+
def notify name, *args
|
31
|
+
@manager ||= Manager.new
|
32
|
+
@manager.notify(name, *args)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Kabutops
|
2
|
+
|
3
|
+
module Extensions
|
4
|
+
|
5
|
+
module Parameterable
|
6
|
+
|
7
|
+
def self.included base
|
8
|
+
base.extend(ClassMethods)
|
9
|
+
base.class_eval do
|
10
|
+
attr_reader :params
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
module ClassMethods
|
15
|
+
|
16
|
+
def params *list
|
17
|
+
return @params if list.empty?
|
18
|
+
|
19
|
+
list.each do |name|
|
20
|
+
define_method name do |*args|
|
21
|
+
@params ||= Hashie::Mash.new
|
22
|
+
if args.size == 1
|
23
|
+
@params[name] = args[0]
|
24
|
+
else
|
25
|
+
@params[name] = args
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
@@ -132,17 +132,14 @@ files:
|
|
132
132
|
- README.md
|
133
133
|
- lib/kabutops.rb
|
134
134
|
- lib/kabutops/adapters/base.rb
|
135
|
-
- lib/kabutops/adapters/callback.rb
|
136
135
|
- lib/kabutops/adapters/database_adapter.rb
|
137
136
|
- lib/kabutops/adapters/elastic_search.rb
|
138
|
-
- lib/kabutops/adapters/mysql.rb
|
139
137
|
- lib/kabutops/crawler.rb
|
140
|
-
- lib/kabutops/crawler_extensions/callback.rb
|
141
138
|
- lib/kabutops/crawler_extensions/debugging.rb
|
142
139
|
- lib/kabutops/crawler_extensions/elastic_search.rb
|
143
|
-
- lib/kabutops/crawler_extensions/mysql.rb
|
144
140
|
- lib/kabutops/crawler_extensions/pstore_storage.rb
|
145
|
-
- lib/kabutops/
|
141
|
+
- lib/kabutops/extensions/callback_support.rb
|
142
|
+
- lib/kabutops/extensions/parameterable.rb
|
146
143
|
- lib/kabutops/recipe.rb
|
147
144
|
- lib/kabutops/recipe_item.rb
|
148
145
|
- lib/kabutops/version.rb
|
@@ -1,24 +0,0 @@
|
|
1
|
-
module Kabutops
|
2
|
-
|
3
|
-
module CrawlerExtensions
|
4
|
-
|
5
|
-
module Callback
|
6
|
-
|
7
|
-
def self.included base
|
8
|
-
base.extend(ClassMethods)
|
9
|
-
end
|
10
|
-
|
11
|
-
module ClassMethods
|
12
|
-
def callback &block
|
13
|
-
adapter = Adapters::Callback.new(block)
|
14
|
-
|
15
|
-
@adapters ||= []
|
16
|
-
@adapters << adapter
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
end
|
23
|
-
|
24
|
-
end
|
@@ -1,33 +0,0 @@
|
|
1
|
-
module Kabutops
|
2
|
-
|
3
|
-
module Parameterable
|
4
|
-
|
5
|
-
def self.included base
|
6
|
-
base.extend(ClassMethods)
|
7
|
-
base.class_eval do
|
8
|
-
attr_reader :params
|
9
|
-
end
|
10
|
-
end
|
11
|
-
|
12
|
-
module ClassMethods
|
13
|
-
|
14
|
-
def params *list
|
15
|
-
return @params if list.empty?
|
16
|
-
|
17
|
-
list.each do |name|
|
18
|
-
define_method name do |*args|
|
19
|
-
@params ||= Hashie::Mash.new
|
20
|
-
if args.size == 1
|
21
|
-
@params[name] = args[0]
|
22
|
-
else
|
23
|
-
@params[name] = args
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
end
|