kabutops 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3558c828ca14af27797b234bab95569992e00f1e
4
- data.tar.gz: 317fec36ba533f78861203c4142e4a7c7d3add84
3
+ metadata.gz: 29f7dfcb4dbed7228907cc636a82c97005144881
4
+ data.tar.gz: 9d18c9e2c070289695e1bf7f0c7f5ca773263c22
5
5
  SHA512:
6
- metadata.gz: 8bf4d1916494080e06c5cda2a830f78857851c1d07228aa271b2c106896b712a54a2ef5747b881cd828d506a03a1f2d8e4db672864f888585cc3780640d317a1
7
- data.tar.gz: efbef6ae896963703a2c39963b55cf57b6729d58ed1b9749d2052a5821e0366420c65b16f0a1256ef267453e2bc57732530b89a777e5ecc9669885b232cb9b9e
6
+ metadata.gz: 9f1961c5a5b72e3f23309f995865281875a96111f05eaa57343f669e1511c89957914e22d6d58b4919610d00a987ac087b040c7e086d44f27abdb3bc8a5fe61e
7
+ data.tar.gz: c3626b5fbe52eac58b8b88b637bcce0181267718bc580ab17a8f7aaf1931df95fb729646f0539c0a2da044ed4a40708e87eb58307b6e34f5ba19d592f3d9faf7
data/README.md CHANGED
@@ -13,76 +13,105 @@ gem install kabutops
13
13
  Or you can put it in your Gemfile
14
14
 
15
15
  ```ruby
16
- gem 'kabutops', '~> 0.0.2'
16
+ gem 'kabutops', '~> 0.0.3'
17
17
  ```
18
18
 
19
19
  Basic example
20
20
  -------------
21
21
 
22
- Create **fruit_crawler.rb**.
22
+ Example that will crawl information about gems whose names start with the letter Q or
23
+ X and save it to ElasticSearch.
23
24
 
24
25
  ```ruby
25
26
  require 'kabutops'
26
27
 
27
- class FruitCrawler < Kabutops::Crawler
28
- include Sidekiq::Worker
29
-
30
- collection (1..5).map { |id|
28
+ class GemListCrawler < Kabutops::Crawler
29
+ # just two letters with the smallest amount of gems
30
+ collection ['Q', 'X'].map{ |letter|
31
31
  {
32
- id: id,
33
- url: "https://www.example.com/fruits/#{id}",
32
+ letter: letter,
33
+ url: "https://rubygems.org/gems?letter=#{letter}"
34
34
  }
35
- }.shuffle
35
+ }
36
36
 
37
- proxy '127.0.0.1', 81818
38
37
  cache true
38
+ wait 2 # wait two seconds after processing each resource (we do not want to hurt rubygems)
39
+
40
+ callbacks do
41
+ after_crawl do |resource, page|
42
+ links = page.xpath("//a[contains(@href, '/gems?letter=#{resource[:letter]}')]")
43
+ links.each do |link|
44
+ self << {
45
+ letter: resource[:letter],
46
+ url: "https://rubygems.org#{link['href']}",
47
+ }
48
+ end
49
+
50
+ links = page.xpath("//a[contains(@href, '/gems/')]")
51
+ links.each do |link|
52
+ GemCrawler << {
53
+ letter: resource[:letter],
54
+ url: "https://rubygems.org#{link['href']}",
55
+ }
56
+ end
57
+ end
58
+ end
59
+ end
60
+
61
+ class GemCrawler < Kabutops::Crawler
62
+ cache true
63
+ wait 2 # wait two seconds after processing each resource (we do not want to hurt rubygems)
39
64
 
40
65
  elasticsearch do
41
- index :books
42
- type :book
66
+ index :gems
67
+ type :gem
43
68
 
44
69
  data do
45
- id :var, :id
46
- url :var, :url
47
- some_attr :css, 'h1.bookTitle'
48
- grape :lambda, ->(resource, page) {
49
- page.css('h3.fruit').split(',').first
50
- }
51
-
52
- nested_attr do
53
- apple :css, 'h1.bookTitle'
54
- banana :xpath, '//table/tr/td[0]'
70
+ id :css, '.title > h2 > a'
71
+ title :css, '.title > h2 > a'
72
+ authors :css, '.authors > p'
73
+ description :css, '#markup > p'
74
+
75
+ downloads do
76
+ total :lambda, ->(resource, page) {
77
+ page.css('.downloads.counter > span > strong')[0].text.gsub(',', '').to_i
78
+ }
79
+
80
+ current_version :lambda, ->(resource, page) {
81
+ page.css('.downloads.counter > span > strong')[1].text.gsub(',', '').to_i
82
+ }
55
83
  end
56
84
  end
57
- end
58
85
 
59
- callback do |resource, page|
86
+ callbacks do
87
+ after_save do |hash|
88
+ puts "#{hash[:title]} saved!"
89
+ end
90
+ end
60
91
  end
61
92
  end
62
93
 
63
- FruitCrawler.crawl!
94
+ GemListCrawler.crawl!
95
+ GemCrawler.crawl!
64
96
  ```
65
97
 
66
98
  Run it via sidekiq
67
99
 
68
100
  ```bash
69
- bundle exec sidekiq -r ./fruit_crawler.rb -c 10
101
+ bundle exec sidekiq -r ./rubygems_crawler.rb -c 1
70
102
  ```
71
103
 
72
- This example will parallely crawl specified urls and result will be
73
- stored to the ElasticSearch index named books as a book document.
74
-
75
- One document will look something like this
104
+ Documents saved in ElasticSearch will look like this one
76
105
 
77
106
  ```json
78
107
  {
79
- 'id': '...',
80
- 'url': '...',
81
- 'some_attr': '...',
82
- 'grape': '...',
83
- 'nested_attr': {
84
- 'apple': '...',
85
- 'banana': '...'
108
+ "id": "qiita_mail",
109
+ "title": "qiita_mail",
110
+ "authors": "ongaeshi",
111
+ "description":" Write a gem description",
112
+ "downloads": {
113
+ "total": 2493,
114
+ "current_version": 580
86
115
  }
87
116
  }
88
117
  ```
@@ -101,6 +130,7 @@ FruitCrawler.debug_random # will take random one
101
130
  FruitCrawler.debug_random 3 # will take 3 random resources
102
131
  FruitCrawler.debug_last # will take last from collection
103
132
  FruitCrawler.debug_last 5 # will take last 5 resources
133
+ FruitCrawler.debug_all # will take every resource in the collection
104
134
  FruitCrawler.debug_resource { id: '123', url: '...' }
105
135
  ```
106
136
 
@@ -3,21 +3,19 @@ require 'sidekiq'
3
3
  require 'cachy'
4
4
  require 'moneta'
5
5
  require 'pstore'
6
+ require 'mechanize'
6
7
  require 'elasticsearch'
7
8
 
8
9
  Cachy.cache_store = Moneta.new(:File, dir: 'cache') # temporary
9
10
 
10
- require 'kabutops/parameterable'
11
+ require 'kabutops/extensions/parameterable'
12
+ require 'kabutops/extensions/callback_support'
11
13
  require 'kabutops/recipe'
12
14
  require 'kabutops/recipe_item'
13
15
  require 'kabutops/adapters/base'
14
- require 'kabutops/adapters/callback'
15
16
  require 'kabutops/adapters/database_adapter'
16
17
  require 'kabutops/adapters/elastic_search'
17
- require 'kabutops/adapters/mysql'
18
- require 'kabutops/crawler_extensions/callback'
19
18
  require 'kabutops/crawler_extensions/elastic_search'
20
- require 'kabutops/crawler_extensions/mysql'
21
19
  require 'kabutops/crawler_extensions/pstore_storage'
22
20
  require 'kabutops/crawler_extensions/debugging'
23
21
  require 'kabutops/crawler'
@@ -3,6 +3,8 @@ module Kabutops
3
3
  module Adapters
4
4
 
5
5
  class DatabaseAdapter < Base
6
+ include Extensions::CallbackSupport
7
+
6
8
  def data &block
7
9
  @recipe = Recipe.new
8
10
  @recipe.instance_eval &block
@@ -15,6 +17,7 @@ module Kabutops
15
17
  p result.to_hash
16
18
  else
17
19
  store(result)
20
+ notify(:after_save, result)
18
21
  end
19
22
  end
20
23
 
@@ -3,7 +3,7 @@ module Kabutops
3
3
  module Adapters
4
4
 
5
5
  class ElasticSearch < DatabaseAdapter
6
- include Parameterable
6
+ include Extensions::Parameterable
7
7
 
8
8
  params :host, :port, :index, :type
9
9
 
@@ -3,30 +3,46 @@ module Kabutops
3
3
  class Crawler
4
4
  include CrawlerExtensions::Debugging
5
5
  include CrawlerExtensions::PStoreStorage
6
- include CrawlerExtensions::Callback
7
6
  include CrawlerExtensions::ElasticSearch
7
+ include Sidekiq::Worker
8
8
 
9
9
  class << self
10
- include Parameterable
10
+ include Extensions::Parameterable
11
+ include Extensions::CallbackSupport
11
12
 
12
- params :collection, :proxy, :cache
13
+ params :collection, :proxy, :cache, :wait
13
14
 
14
15
  def adapters
15
- @adapters
16
+ @adapters || []
16
17
  end
17
18
 
18
19
  def crawl! collection=nil
19
- if storage[:status] == :none
20
- @collection = collection || params[:collection] || []
21
- @collection.each do |resource|
22
- raise "url must be specified" if resource[:id].nil?
23
- perform_async(resource)
20
+ @map ||= {}
21
+
22
+ if storage[:status].nil?
23
+ (collection || params[:collection] || []).each do |resource|
24
+ self << resource
24
25
  end
25
26
  end
26
27
  end
27
28
 
28
29
  def << resource
29
- perform_async(resource)
30
+ if debug
31
+ params[:collection] ||= []
32
+ params[:collection] << resource
33
+ return
34
+ end
35
+
36
+ key = resource[:id] || resource[:url]
37
+
38
+ if key.nil?
39
+ raise "url must be specified for resource"
40
+ elsif @map[key]
41
+ # resource with an id already in map
42
+ else
43
+ perform_async(resource.to_hash)
44
+ @map[key] = resource
45
+ end
30
46
  end
31
47
  end
32
48
 
@@ -35,19 +51,23 @@ module Kabutops
35
51
 
36
52
  content = Cachy.cache_if(self.class.params.cache, resource[:url]) do
37
53
  agent = Mechanize.new
38
- #agent.set_proxy(*self.class.params[:proxy])
54
+ agent.set_proxy(*self.class.params[:proxy]) if self.class.params[:proxy]
39
55
  agent.get(resource[:url]).body
40
56
  end
41
57
 
42
58
  page = Nokogiri::HTML(content)
43
59
 
60
+ self.class.notify(:after_crawl, resource, page)
61
+
44
62
  self.class.adapters.each do |adapter|
45
63
  adapter.process(resource, page)
46
64
  end
65
+
66
+ sleep self.class.params[:wait] || 0
47
67
  end
48
68
 
49
69
  def << resource
50
- self.class.perform_async(resource.to_hash)
70
+ self.class << resource
51
71
  end
52
72
  end
53
73
 
@@ -22,6 +22,10 @@ module Kabutops
22
22
  params[:collection][(0 - count)..-1].map{ |r| debug_resource(r) }
23
23
  end
24
24
 
25
+ def debug_all
26
+ params[:collection].map{ |r| debug_resource(r) }
27
+ end
28
+
25
29
  def debug_resource resource
26
30
  enable_debug
27
31
  self.new.perform(resource)
@@ -0,0 +1,38 @@
1
+ module Kabutops
2
+
3
+ module Extensions
4
+
5
+ module CallbackSupport
6
+
7
+ class Manager
8
+ def method_missing name, *args, &block
9
+ return unless block_given?
10
+
11
+ @map ||= Hashie::Mash.new
12
+ @map[name] ||= []
13
+ @map[name] << block
14
+ end
15
+
16
+ def notify name, *args
17
+ return unless @map
18
+
19
+ (@map[name] || []).map do |block|
20
+ block.call(*args)
21
+ end
22
+ end
23
+ end
24
+
25
+ def callbacks &block
26
+ @manager ||= Manager.new
27
+ @manager.instance_eval &block
28
+ end
29
+
30
+ def notify name, *args
31
+ @manager ||= Manager.new
32
+ @manager.notify(name, *args)
33
+ end
34
+ end
35
+
36
+ end
37
+
38
+ end
@@ -0,0 +1,37 @@
1
+ module Kabutops
2
+
3
+ module Extensions
4
+
5
+ module Parameterable
6
+
7
+ def self.included base
8
+ base.extend(ClassMethods)
9
+ base.class_eval do
10
+ attr_reader :params
11
+ end
12
+ end
13
+
14
+ module ClassMethods
15
+
16
+ def params *list
17
+ return @params if list.empty?
18
+
19
+ list.each do |name|
20
+ define_method name do |*args|
21
+ @params ||= Hashie::Mash.new
22
+ if args.size == 1
23
+ @params[name] = args[0]
24
+ else
25
+ @params[name] = args
26
+ end
27
+ end
28
+ end
29
+ end
30
+
31
+ end
32
+
33
+ end
34
+
35
+ end
36
+
37
+ end
@@ -1,3 +1,3 @@
1
1
  module Kabutops
2
- VERSION = '0.0.2'
2
+ VERSION = '0.0.3'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
@@ -132,17 +132,14 @@ files:
132
132
  - README.md
133
133
  - lib/kabutops.rb
134
134
  - lib/kabutops/adapters/base.rb
135
- - lib/kabutops/adapters/callback.rb
136
135
  - lib/kabutops/adapters/database_adapter.rb
137
136
  - lib/kabutops/adapters/elastic_search.rb
138
- - lib/kabutops/adapters/mysql.rb
139
137
  - lib/kabutops/crawler.rb
140
- - lib/kabutops/crawler_extensions/callback.rb
141
138
  - lib/kabutops/crawler_extensions/debugging.rb
142
139
  - lib/kabutops/crawler_extensions/elastic_search.rb
143
- - lib/kabutops/crawler_extensions/mysql.rb
144
140
  - lib/kabutops/crawler_extensions/pstore_storage.rb
145
- - lib/kabutops/parameterable.rb
141
+ - lib/kabutops/extensions/callback_support.rb
142
+ - lib/kabutops/extensions/parameterable.rb
146
143
  - lib/kabutops/recipe.rb
147
144
  - lib/kabutops/recipe_item.rb
148
145
  - lib/kabutops/version.rb
@@ -1,15 +0,0 @@
1
- module Kabutops
2
- module Adapters
3
- class Callback < Base
4
- attr_accessor :block
5
-
6
- def initialize block
7
- @block = block
8
- end
9
-
10
- def process resource, page
11
- block.call(resource, page)
12
- end
13
- end
14
- end
15
- end
@@ -1,17 +0,0 @@
1
- module Kabutops
2
-
3
- module Adapters
4
-
5
- class MySQL < DatabaseAdapter
6
- include Parameterable
7
-
8
- params :host, :port, :database, :user, :password, :table
9
-
10
- def nested?
11
- false
12
- end
13
- end
14
-
15
- end
16
-
17
- end
@@ -1,24 +0,0 @@
1
- module Kabutops
2
-
3
- module CrawlerExtensions
4
-
5
- module Callback
6
-
7
- def self.included base
8
- base.extend(ClassMethods)
9
- end
10
-
11
- module ClassMethods
12
- def callback &block
13
- adapter = Adapters::Callback.new(block)
14
-
15
- @adapters ||= []
16
- @adapters << adapter
17
- end
18
- end
19
-
20
- end
21
-
22
- end
23
-
24
- end
@@ -1,21 +0,0 @@
1
- module Kabutops
2
-
3
- module CrawlerExtensions
4
-
5
- module Mysql
6
-
7
- def self.included base
8
- base.extend(ClassMethods)
9
- end
10
-
11
- module ClassMethods
12
- def mysql
13
- raise NotImplementedError
14
- end
15
- end
16
-
17
- end
18
-
19
- end
20
-
21
- end
@@ -1,33 +0,0 @@
1
- module Kabutops
2
-
3
- module Parameterable
4
-
5
- def self.included base
6
- base.extend(ClassMethods)
7
- base.class_eval do
8
- attr_reader :params
9
- end
10
- end
11
-
12
- module ClassMethods
13
-
14
- def params *list
15
- return @params if list.empty?
16
-
17
- list.each do |name|
18
- define_method name do |*args|
19
- @params ||= Hashie::Mash.new
20
- if args.size == 1
21
- @params[name] = args[0]
22
- else
23
- @params[name] = args
24
- end
25
- end
26
- end
27
- end
28
-
29
- end
30
-
31
- end
32
-
33
- end