kabutops 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6e931d9a939854fae6497f910de768971907f29a
4
- data.tar.gz: 126e34c2c1eeffa8af3dca362a62381f5889cbe0
3
+ metadata.gz: 87c6590089f52a068980373fdb2b060618b34940
4
+ data.tar.gz: 7505ade7b0104e76c62e73594de557cc75c07944
5
5
  SHA512:
6
- metadata.gz: a869f1388c583d8e2467f096bd6e4702d95832f1f2039253d8b13ec48037e133c9fbbaf68dd44512b69edb4ca4901177757b9ceeedb52efb9aac9587e1ce0a72
7
- data.tar.gz: cccb0968bcf9ce251a7ec81c8a8cbb04b5449d19c96dcf2dd58fd69c259b915f17c9e37d7ed926c229066329018b568d0221824f5ec60578eb61238a8172d4b5
6
+ metadata.gz: a48556126558daa3e7592fcd4eaa7767b8cf3157a4de28e862800783fed992e37dc43995506511063aa15eccf1d6f4fe9b7605fc546c07627f393f57c6c61284
7
+ data.tar.gz: ae9e2711c691fef7135d1e45ea132ff0bd075df7e02c2ee6371251f0388c9b7f73751c539e384d531e9a0fc05be4852fccfe36226477a3b5993ed2dbda9a6861
data/README.md CHANGED
@@ -1,9 +1,9 @@
1
1
  # Kabutops [![Code Climate](https://codeclimate.com/github/reneklacan/kabutops.png)](https://codeclimate.com/github/reneklacan/kabutops) [![Coverage](https://codeclimate.com/github/reneklacan/kabutops/coverage.png)](https://codeclimate.com/github/reneklacan/kabutops)
2
2
 
3
- Kabutops is a ruby library whichs aims to simplify creating website crawlers.
3
+ Kabutops is a ruby library which aims to simplify creating website crawlers.
4
4
  You can define what will be crawled and how it will be saved in the short class definition.
5
5
 
6
- With Kabutops you can easily save data to **ElasticSearch**.
6
+ With Kabutops you can easily save data to **ElasticSearch 2.x**.
7
7
 
8
8
  Examples for every kind of database are located
9
9
  in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
@@ -19,7 +19,7 @@ gem install kabutops
19
19
  Or you can put it in your Gemfile
20
20
 
21
21
  ```ruby
22
- gem 'kabutops', '~> 0.1.1'
22
+ gem 'kabutops'
23
23
  ```
24
24
 
25
25
  You will also need Redis database installed and running.
@@ -28,4 +28,3 @@ require 'kabutops/crawler_extensions/pstore_storage'
28
28
  require 'kabutops/crawler_extensions/debugging'
29
29
  require 'kabutops/crawler'
30
30
  require 'kabutops/watchdog'
31
- require 'kabutops/spider'
@@ -5,8 +5,8 @@ module Kabutops
5
5
  module Adapters
6
6
 
7
7
  class Base
8
- def initialize
9
- yield if block_given?
8
+ def initialize(&block)
9
+ instance_eval(&block) if block_given?
10
10
  end
11
11
 
12
12
  def enable_debug
@@ -36,6 +36,7 @@ module Kabutops
36
36
  logger.info(save ? result.to_hash : 'not valid for save') if debug
37
37
  store(result) if save && !debug
38
38
  notify(:after_save, result) if save
39
+ result
39
40
  end
40
41
 
41
42
  def store result
@@ -70,7 +70,7 @@ module Kabutops
70
70
  return if page.nil?
71
71
  return unless (self.class.notify(:store_if, resource, page) || []).all?
72
72
 
73
- adapters.each do |adapter|
73
+ adapters.map do |adapter|
74
74
  adapter.process(resource, page)
75
75
  end
76
76
  rescue Exception => e
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.2.0'
4
+ VERSION = '0.3.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-01 00:00:00.000000000 Z
11
+ date: 2015-11-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -58,42 +58,42 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '3.1'
61
+ version: '4.0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '3.1'
68
+ version: '4.0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: elasticsearch
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '1.0'
75
+ version: 1.0.14
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '1.0'
82
+ version: 1.0.14
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: hashie
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '3.0'
89
+ version: '3.4'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '3.0'
96
+ version: '3.4'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: json
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -174,7 +174,6 @@ files:
174
174
  - lib/kabutops/extensions/parameterable.rb
175
175
  - lib/kabutops/recipe.rb
176
176
  - lib/kabutops/recipe_item.rb
177
- - lib/kabutops/spider.rb
178
177
  - lib/kabutops/version.rb
179
178
  - lib/kabutops/watchdog.rb
180
179
  homepage: https://github.com/reneklacan/kabutops
@@ -197,7 +196,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
197
196
  version: '0'
198
197
  requirements: []
199
198
  rubyforge_project:
200
- rubygems_version: 2.4.5
199
+ rubygems_version: 2.4.5.1
201
200
  signing_key:
202
201
  specification_version: 4
203
202
  summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
@@ -1,95 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module Kabutops
4
-
5
- class Spider < Crawler
6
- class << self
7
- params :url
8
- callbacks :after_crawl, :before_cache, :follow_if
9
-
10
- def debug_spider
11
- enable_debug
12
- self.new.perform({
13
- url: params[:url]
14
- })
15
- end
16
-
17
- def crawl collection=nil
18
- super(collection || [{ url: params.url, }])
19
- end
20
-
21
- def reset!
22
- super
23
- redis.keys.each{ |k| redis.del(k) }
24
- end
25
-
26
- def << resource
27
- if resource_status(resource).nil?
28
- resource_status(resource, 'new')
29
- super
30
- end
31
- end
32
-
33
- def follow link
34
- self << {
35
- url: URI.join(params.url, URI.escape(link)).to_s
36
- }
37
- end
38
-
39
- def resource_status resource, status=nil
40
- url_status(resource[:url], status)
41
- end
42
-
43
- def url_status url, status=nil
44
- key = redis_key(url)
45
-
46
- if status
47
- redis.set(
48
- key,
49
- JSON.dump({
50
- url: url,
51
- status: status,
52
- })
53
- )
54
- else
55
- item = redis.get(key)
56
- item ? JSON.parse(item)['status'] : nil
57
- end
58
- end
59
-
60
- protected
61
-
62
- def redis_key string
63
- Digest::SHA256.hexdigest(string)
64
- end
65
-
66
- def redis
67
- @redis ||= ::Redis::Namespace.new(
68
- self.to_s,
69
- redis: ::Redis.new(
70
- host: Configuration[:redis][:host],
71
- port: Configuration[:redis][:port],
72
- db: Configuration[:redis][:db],
73
- )
74
- )
75
- end
76
- end
77
-
78
- def crawl resource
79
- page = super
80
- after_crawl(resource, page)
81
- self.class.resource_status(resource, 'done')
82
- page
83
- end
84
-
85
- def after_crawl resource, page
86
- page.css('a').each do |a|
87
- next if a['href'].nil?
88
-
89
- follow = self.class.notify(:follow_if, a['href']).any?
90
- self.class.follow(a['href']) if follow
91
- end
92
- end
93
- end
94
-
95
- end