kabutops 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3558c828ca14af27797b234bab95569992e00f1e
4
- data.tar.gz: 317fec36ba533f78861203c4142e4a7c7d3add84
3
+ metadata.gz: 29f7dfcb4dbed7228907cc636a82c97005144881
4
+ data.tar.gz: 9d18c9e2c070289695e1bf7f0c7f5ca773263c22
5
5
  SHA512:
6
- metadata.gz: 8bf4d1916494080e06c5cda2a830f78857851c1d07228aa271b2c106896b712a54a2ef5747b881cd828d506a03a1f2d8e4db672864f888585cc3780640d317a1
7
- data.tar.gz: efbef6ae896963703a2c39963b55cf57b6729d58ed1b9749d2052a5821e0366420c65b16f0a1256ef267453e2bc57732530b89a777e5ecc9669885b232cb9b9e
6
+ metadata.gz: 9f1961c5a5b72e3f23309f995865281875a96111f05eaa57343f669e1511c89957914e22d6d58b4919610d00a987ac087b040c7e086d44f27abdb3bc8a5fe61e
7
+ data.tar.gz: c3626b5fbe52eac58b8b88b637bcce0181267718bc580ab17a8f7aaf1931df95fb729646f0539c0a2da044ed4a40708e87eb58307b6e34f5ba19d592f3d9faf7
data/README.md CHANGED
@@ -13,76 +13,105 @@ gem install kabutops
13
13
  Or you can put it in your Gemfile
14
14
 
15
15
  ```ruby
16
- gem 'kabutops', '~> 0.0.2'
16
+ gem 'kabutops', '~> 0.0.3'
17
17
  ```
18
18
 
19
19
  Basic example
20
20
  -------------
21
21
 
22
- Create **fruit_crawler.rb**.
22
+ Example that crawls information about gems whose names start with the letter Q or
23
+ X and saves it to ElasticSearch.
23
24
 
24
25
  ```ruby
25
26
  require 'kabutops'
26
27
 
27
- class FruitCrawler < Kabutops::Crawler
28
- include Sidekiq::Worker
29
-
30
- collection (1..5).map { |id|
28
+ class GemListCrawler < Kabutops::Crawler
29
+ # just two letters with the smallest amount of gems
30
+ collection ['Q', 'X'].map{ |letter|
31
31
  {
32
- id: id,
33
- url: "https://www.example.com/fruits/#{id}",
32
+ letter: letter,
33
+ url: "https://rubygems.org/gems?letter=#{letter}"
34
34
  }
35
- }.shuffle
35
+ }
36
36
 
37
- proxy '127.0.0.1', 81818
38
37
  cache true
38
+ wait 2 # wait two seconds after processing each resource (we do not want to hurt rubygems)
39
+
40
+ callbacks do
41
+ after_crawl do |resource, page|
42
+ links = page.xpath("//a[contains(@href, '/gems?letter=#{resource[:letter]}')]")
43
+ links.each do |link|
44
+ self << {
45
+ letter: resource[:letter],
46
+ url: "https://rubygems.org#{link['href']}",
47
+ }
48
+ end
49
+
50
+ links = page.xpath("//a[contains(@href, '/gems/')]")
51
+ links.each do |link|
52
+ GemCrawler << {
53
+ letter: resource[:letter],
54
+ url: "https://rubygems.org#{link['href']}",
55
+ }
56
+ end
57
+ end
58
+ end
59
+ end
60
+
61
+ class GemCrawler < Kabutops::Crawler
62
+ cache true
63
+ wait 2 # wait two seconds after processing each resource (we do not want to hurt rubygems)
39
64
 
40
65
  elasticsearch do
41
- index :books
42
- type :book
66
+ index :gems
67
+ type :gem
43
68
 
44
69
  data do
45
- id :var, :id
46
- url :var, :url
47
- some_attr :css, 'h1.bookTitle'
48
- grape :lambda, ->(resource, page) {
49
- page.css('h3.fruit').split(',').first
50
- }
51
-
52
- nested_attr do
53
- apple :css, 'h1.bookTitle'
54
- banana :xpath, '//table/tr/td[0]'
70
+ id :css, '.title > h2 > a'
71
+ title :css, '.title > h2 > a'
72
+ authors :css, '.authors > p'
73
+ description :css, '#markup > p'
74
+
75
+ downloads do
76
+ total :lambda, ->(resource, page) {
77
+ page.css('.downloads.counter > span > strong')[0].text.gsub(',', '').to_i
78
+ }
79
+
80
+ current_version :lambda, ->(resource, page) {
81
+ page.css('.downloads.counter > span > strong')[1].text.gsub(',', '').to_i
82
+ }
55
83
  end
56
84
  end
57
- end
58
85
 
59
- callback do |resource, page|
86
+ callbacks do
87
+ after_save do |hash|
88
+ puts "#{hash[:title]} saved!"
89
+ end
90
+ end
60
91
  end
61
92
  end
62
93
 
63
- FruitCrawler.crawl!
94
+ GemListCrawler.crawl!
95
+ GemCrawler.crawl!
64
96
  ```
65
97
 
66
98
  Run it via sidekiq
67
99
 
68
100
  ```bash
69
- bundle exec sidekiq -r ./fruit_crawler.rb -c 10
101
+ bundle exec sidekiq -r ./rubygems_crawler.rb -c 1
70
102
  ```
71
103
 
72
- This example will parallely crawl specified urls and result will be
73
- stored to the ElasticSearch index named books as a book document.
74
-
75
- One document will look something like this
104
+ Documents saved in ElasticSearch will look like this one:
76
105
 
77
106
  ```json
78
107
  {
79
- 'id': '...',
80
- 'url': '...',
81
- 'some_attr': '...',
82
- 'grape': '...',
83
- 'nested_attr': {
84
- 'apple': '...',
85
- 'banana': '...'
108
+ "id": "qiita_mail",
109
+ "title": "qiita_mail",
110
+ "authors": "ongaeshi",
111
+ "description":" Write a gem description",
112
+ "downloads": {
113
+ "total": 2493,
114
+ "current_version": 580
86
115
  }
87
116
  }
88
117
  ```
@@ -101,6 +130,7 @@ FruitCrawler.debug_random # will take random one
101
130
  FruitCrawler.debug_random 3 # will take 3 random resources
102
131
  FruitCrawler.debug_last # will take last from collection
103
132
  FruitCrawler.debug_last 5 # will take last 5 resources
133
+ FruitCrawler.debug_all # will take every resource in the collection
104
134
  FruitCrawler.debug_resource { id: '123', url: '...' }
105
135
  ```
106
136
 
@@ -3,21 +3,19 @@ require 'sidekiq'
3
3
  require 'cachy'
4
4
  require 'moneta'
5
5
  require 'pstore'
6
+ require 'mechanize'
6
7
  require 'elasticsearch'
7
8
 
8
9
  Cachy.cache_store = Moneta.new(:File, dir: 'cache') # temporary
9
10
 
10
- require 'kabutops/parameterable'
11
+ require 'kabutops/extensions/parameterable'
12
+ require 'kabutops/extensions/callback_support'
11
13
  require 'kabutops/recipe'
12
14
  require 'kabutops/recipe_item'
13
15
  require 'kabutops/adapters/base'
14
- require 'kabutops/adapters/callback'
15
16
  require 'kabutops/adapters/database_adapter'
16
17
  require 'kabutops/adapters/elastic_search'
17
- require 'kabutops/adapters/mysql'
18
- require 'kabutops/crawler_extensions/callback'
19
18
  require 'kabutops/crawler_extensions/elastic_search'
20
- require 'kabutops/crawler_extensions/mysql'
21
19
  require 'kabutops/crawler_extensions/pstore_storage'
22
20
  require 'kabutops/crawler_extensions/debugging'
23
21
  require 'kabutops/crawler'
@@ -3,6 +3,8 @@ module Kabutops
3
3
  module Adapters
4
4
 
5
5
  class DatabaseAdapter < Base
6
+ include Extensions::CallbackSupport
7
+
6
8
  def data &block
7
9
  @recipe = Recipe.new
8
10
  @recipe.instance_eval &block
@@ -15,6 +17,7 @@ module Kabutops
15
17
  p result.to_hash
16
18
  else
17
19
  store(result)
20
+ notify(:after_save, result)
18
21
  end
19
22
  end
20
23
 
@@ -3,7 +3,7 @@ module Kabutops
3
3
  module Adapters
4
4
 
5
5
  class ElasticSearch < DatabaseAdapter
6
- include Parameterable
6
+ include Extensions::Parameterable
7
7
 
8
8
  params :host, :port, :index, :type
9
9
 
@@ -3,30 +3,46 @@ module Kabutops
3
3
  class Crawler
4
4
  include CrawlerExtensions::Debugging
5
5
  include CrawlerExtensions::PStoreStorage
6
- include CrawlerExtensions::Callback
7
6
  include CrawlerExtensions::ElasticSearch
7
+ include Sidekiq::Worker
8
8
 
9
9
  class << self
10
- include Parameterable
10
+ include Extensions::Parameterable
11
+ include Extensions::CallbackSupport
11
12
 
12
- params :collection, :proxy, :cache
13
+ params :collection, :proxy, :cache, :wait
13
14
 
14
15
  def adapters
15
- @adapters
16
+ @adapters || []
16
17
  end
17
18
 
18
19
  def crawl! collection=nil
19
- if storage[:status] == :none
20
- @collection = collection || params[:collection] || []
21
- @collection.each do |resource|
22
- raise "url must be specified" if resource[:id].nil?
23
- perform_async(resource)
20
+ @map ||= {}
21
+
22
+ if storage[:status].nil?
23
+ (collection || params[:collection] || []).each do |resource|
24
+ self << resource
24
25
  end
25
26
  end
26
27
  end
27
28
 
28
29
  def << resource
29
- perform_async(resource)
30
+ if debug
31
+ params[:collection] ||= []
32
+ params[:collection] << resource
33
+ return
34
+ end
35
+
36
+ key = resource[:id] || resource[:url]
37
+
38
+ if key.nil?
39
+ raise "url must be specified for resource"
40
+ elsif @map[key]
41
+ # a resource with this key (id or url) is already in the map; skip it
42
+ else
43
+ perform_async(resource.to_hash)
44
+ @map[key] = resource
45
+ end
30
46
  end
31
47
  end
32
48
 
@@ -35,19 +51,23 @@ module Kabutops
35
51
 
36
52
  content = Cachy.cache_if(self.class.params.cache, resource[:url]) do
37
53
  agent = Mechanize.new
38
- #agent.set_proxy(*self.class.params[:proxy])
54
+ agent.set_proxy(*self.class.params[:proxy]) if self.class.params[:proxy]
39
55
  agent.get(resource[:url]).body
40
56
  end
41
57
 
42
58
  page = Nokogiri::HTML(content)
43
59
 
60
+ self.class.notify(:after_crawl, resource, page)
61
+
44
62
  self.class.adapters.each do |adapter|
45
63
  adapter.process(resource, page)
46
64
  end
65
+
66
+ sleep self.class.params[:wait] || 0
47
67
  end
48
68
 
49
69
  def << resource
50
- self.class.perform_async(resource.to_hash)
70
+ self.class << resource
51
71
  end
52
72
  end
53
73
 
@@ -22,6 +22,10 @@ module Kabutops
22
22
  params[:collection][(0 - count)..-1].map{ |r| debug_resource(r) }
23
23
  end
24
24
 
25
+ def debug_all
26
+ params[:collection].map{ |r| debug_resource(r) }
27
+ end
28
+
25
29
  def debug_resource resource
26
30
  enable_debug
27
31
  self.new.perform(resource)
@@ -0,0 +1,38 @@
1
+ module Kabutops
2
+
3
+ module Extensions
4
+
5
+ module CallbackSupport
6
+
7
+ class Manager
8
+ def method_missing name, *args, &block
9
+ return unless block_given?
10
+
11
+ @map ||= Hashie::Mash.new
12
+ @map[name] ||= []
13
+ @map[name] << block
14
+ end
15
+
16
+ def notify name, *args
17
+ return unless @map
18
+
19
+ (@map[name] || []).map do |block|
20
+ block.call(*args)
21
+ end
22
+ end
23
+ end
24
+
25
+ def callbacks &block
26
+ @manager ||= Manager.new
27
+ @manager.instance_eval &block
28
+ end
29
+
30
+ def notify name, *args
31
+ @manager ||= Manager.new
32
+ @manager.notify(name, *args)
33
+ end
34
+ end
35
+
36
+ end
37
+
38
+ end
@@ -0,0 +1,37 @@
1
+ module Kabutops
2
+
3
+ module Extensions
4
+
5
+ module Parameterable
6
+
7
+ def self.included base
8
+ base.extend(ClassMethods)
9
+ base.class_eval do
10
+ attr_reader :params
11
+ end
12
+ end
13
+
14
+ module ClassMethods
15
+
16
+ def params *list
17
+ return @params if list.empty?
18
+
19
+ list.each do |name|
20
+ define_method name do |*args|
21
+ @params ||= Hashie::Mash.new
22
+ if args.size == 1
23
+ @params[name] = args[0]
24
+ else
25
+ @params[name] = args
26
+ end
27
+ end
28
+ end
29
+ end
30
+
31
+ end
32
+
33
+ end
34
+
35
+ end
36
+
37
+ end
@@ -1,3 +1,3 @@
1
1
  module Kabutops
2
- VERSION = '0.0.2'
2
+ VERSION = '0.0.3'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
@@ -132,17 +132,14 @@ files:
132
132
  - README.md
133
133
  - lib/kabutops.rb
134
134
  - lib/kabutops/adapters/base.rb
135
- - lib/kabutops/adapters/callback.rb
136
135
  - lib/kabutops/adapters/database_adapter.rb
137
136
  - lib/kabutops/adapters/elastic_search.rb
138
- - lib/kabutops/adapters/mysql.rb
139
137
  - lib/kabutops/crawler.rb
140
- - lib/kabutops/crawler_extensions/callback.rb
141
138
  - lib/kabutops/crawler_extensions/debugging.rb
142
139
  - lib/kabutops/crawler_extensions/elastic_search.rb
143
- - lib/kabutops/crawler_extensions/mysql.rb
144
140
  - lib/kabutops/crawler_extensions/pstore_storage.rb
145
- - lib/kabutops/parameterable.rb
141
+ - lib/kabutops/extensions/callback_support.rb
142
+ - lib/kabutops/extensions/parameterable.rb
146
143
  - lib/kabutops/recipe.rb
147
144
  - lib/kabutops/recipe_item.rb
148
145
  - lib/kabutops/version.rb
@@ -1,15 +0,0 @@
1
- module Kabutops
2
- module Adapters
3
- class Callback < Base
4
- attr_accessor :block
5
-
6
- def initialize block
7
- @block = block
8
- end
9
-
10
- def process resource, page
11
- block.call(resource, page)
12
- end
13
- end
14
- end
15
- end
@@ -1,17 +0,0 @@
1
- module Kabutops
2
-
3
- module Adapters
4
-
5
- class MySQL < DatabaseAdapter
6
- include Parameterable
7
-
8
- params :host, :port, :database, :user, :password, :table
9
-
10
- def nested?
11
- false
12
- end
13
- end
14
-
15
- end
16
-
17
- end
@@ -1,24 +0,0 @@
1
- module Kabutops
2
-
3
- module CrawlerExtensions
4
-
5
- module Callback
6
-
7
- def self.included base
8
- base.extend(ClassMethods)
9
- end
10
-
11
- module ClassMethods
12
- def callback &block
13
- adapter = Adapters::Callback.new(block)
14
-
15
- @adapters ||= []
16
- @adapters << adapter
17
- end
18
- end
19
-
20
- end
21
-
22
- end
23
-
24
- end
@@ -1,21 +0,0 @@
1
- module Kabutops
2
-
3
- module CrawlerExtensions
4
-
5
- module Mysql
6
-
7
- def self.included base
8
- base.extend(ClassMethods)
9
- end
10
-
11
- module ClassMethods
12
- def mysql
13
- raise NotImplementedError
14
- end
15
- end
16
-
17
- end
18
-
19
- end
20
-
21
- end
@@ -1,33 +0,0 @@
1
- module Kabutops
2
-
3
- module Parameterable
4
-
5
- def self.included base
6
- base.extend(ClassMethods)
7
- base.class_eval do
8
- attr_reader :params
9
- end
10
- end
11
-
12
- module ClassMethods
13
-
14
- def params *list
15
- return @params if list.empty?
16
-
17
- list.each do |name|
18
- define_method name do |*args|
19
- @params ||= Hashie::Mash.new
20
- if args.size == 1
21
- @params[name] = args[0]
22
- else
23
- @params[name] = args
24
- end
25
- end
26
- end
27
- end
28
-
29
- end
30
-
31
- end
32
-
33
- end