kabutops 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6e931d9a939854fae6497f910de768971907f29a
4
- data.tar.gz: 126e34c2c1eeffa8af3dca362a62381f5889cbe0
3
+ metadata.gz: 87c6590089f52a068980373fdb2b060618b34940
4
+ data.tar.gz: 7505ade7b0104e76c62e73594de557cc75c07944
5
5
  SHA512:
6
- metadata.gz: a869f1388c583d8e2467f096bd6e4702d95832f1f2039253d8b13ec48037e133c9fbbaf68dd44512b69edb4ca4901177757b9ceeedb52efb9aac9587e1ce0a72
7
- data.tar.gz: cccb0968bcf9ce251a7ec81c8a8cbb04b5449d19c96dcf2dd58fd69c259b915f17c9e37d7ed926c229066329018b568d0221824f5ec60578eb61238a8172d4b5
6
+ metadata.gz: a48556126558daa3e7592fcd4eaa7767b8cf3157a4de28e862800783fed992e37dc43995506511063aa15eccf1d6f4fe9b7605fc546c07627f393f57c6c61284
7
+ data.tar.gz: ae9e2711c691fef7135d1e45ea132ff0bd075df7e02c2ee6371251f0388c9b7f73751c539e384d531e9a0fc05be4852fccfe36226477a3b5993ed2dbda9a6861
data/README.md CHANGED
@@ -1,9 +1,9 @@
1
1
  # Kabutops [![Code Climate](https://codeclimate.com/github/reneklacan/kabutops.png)](https://codeclimate.com/github/reneklacan/kabutops) [![Coverage](https://codeclimate.com/github/reneklacan/kabutops/coverage.png)](https://codeclimate.com/github/reneklacan/kabutops)
2
2
 
3
- Kabutops is a ruby library whichs aims to simplify creating website crawlers.
3
+ Kabutops is a ruby library which aims to simplify creating website crawlers.
4
4
  You can define what will be crawled and how it will be saved in the short class definition.
5
5
 
6
- With Kabutops you can easily save data to **ElasticSearch**.
6
+ With Kabutops you can easily save data to **ElasticSearch 2.x**.
7
7
 
8
8
  Examples for every kind of database are located
9
9
  in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
@@ -19,7 +19,7 @@ gem install kabutops
19
19
  Or you can put it in your Gemfile
20
20
 
21
21
  ```ruby
22
- gem 'kabutops', '~> 0.1.1'
22
+ gem 'kabutops'
23
23
  ```
24
24
 
25
25
  You will also need Redis database installed and running.
@@ -28,4 +28,3 @@ require 'kabutops/crawler_extensions/pstore_storage'
28
28
  require 'kabutops/crawler_extensions/debugging'
29
29
  require 'kabutops/crawler'
30
30
  require 'kabutops/watchdog'
31
- require 'kabutops/spider'
@@ -5,8 +5,8 @@ module Kabutops
5
5
  module Adapters
6
6
 
7
7
  class Base
8
- def initialize
9
- yield if block_given?
8
+ def initialize(&block)
9
+ instance_eval(&block) if block_given?
10
10
  end
11
11
 
12
12
  def enable_debug
@@ -36,6 +36,7 @@ module Kabutops
36
36
  logger.info(save ? result.to_hash : 'not valid for save') if debug
37
37
  store(result) if save && !debug
38
38
  notify(:after_save, result) if save
39
+ result
39
40
  end
40
41
 
41
42
  def store result
@@ -70,7 +70,7 @@ module Kabutops
70
70
  return if page.nil?
71
71
  return unless (self.class.notify(:store_if, resource, page) || []).all?
72
72
 
73
- adapters.each do |adapter|
73
+ adapters.map do |adapter|
74
74
  adapter.process(resource, page)
75
75
  end
76
76
  rescue Exception => e
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.2.0'
4
+ VERSION = '0.3.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-01 00:00:00.000000000 Z
11
+ date: 2015-11-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -58,42 +58,42 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '3.1'
61
+ version: '4.0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '3.1'
68
+ version: '4.0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: elasticsearch
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '1.0'
75
+ version: 1.0.14
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '1.0'
82
+ version: 1.0.14
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: hashie
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '3.0'
89
+ version: '3.4'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '3.0'
96
+ version: '3.4'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: json
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -174,7 +174,6 @@ files:
174
174
  - lib/kabutops/extensions/parameterable.rb
175
175
  - lib/kabutops/recipe.rb
176
176
  - lib/kabutops/recipe_item.rb
177
- - lib/kabutops/spider.rb
178
177
  - lib/kabutops/version.rb
179
178
  - lib/kabutops/watchdog.rb
180
179
  homepage: https://github.com/reneklacan/kabutops
@@ -197,7 +196,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
197
196
  version: '0'
198
197
  requirements: []
199
198
  rubyforge_project:
200
- rubygems_version: 2.4.5
199
+ rubygems_version: 2.4.5.1
201
200
  signing_key:
202
201
  specification_version: 4
203
202
  summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
@@ -1,95 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module Kabutops
4
-
5
- class Spider < Crawler
6
- class << self
7
- params :url
8
- callbacks :after_crawl, :before_cache, :follow_if
9
-
10
- def debug_spider
11
- enable_debug
12
- self.new.perform({
13
- url: params[:url]
14
- })
15
- end
16
-
17
- def crawl collection=nil
18
- super(collection || [{ url: params.url, }])
19
- end
20
-
21
- def reset!
22
- super
23
- redis.keys.each{ |k| redis.del(k) }
24
- end
25
-
26
- def << resource
27
- if resource_status(resource).nil?
28
- resource_status(resource, 'new')
29
- super
30
- end
31
- end
32
-
33
- def follow link
34
- self << {
35
- url: URI.join(params.url, URI.escape(link)).to_s
36
- }
37
- end
38
-
39
- def resource_status resource, status=nil
40
- url_status(resource[:url], status)
41
- end
42
-
43
- def url_status url, status=nil
44
- key = redis_key(url)
45
-
46
- if status
47
- redis.set(
48
- key,
49
- JSON.dump({
50
- url: url,
51
- status: status,
52
- })
53
- )
54
- else
55
- item = redis.get(key)
56
- item ? JSON.parse(item)['status'] : nil
57
- end
58
- end
59
-
60
- protected
61
-
62
- def redis_key string
63
- Digest::SHA256.hexdigest(string)
64
- end
65
-
66
- def redis
67
- @redis ||= ::Redis::Namespace.new(
68
- self.to_s,
69
- redis: ::Redis.new(
70
- host: Configuration[:redis][:host],
71
- port: Configuration[:redis][:port],
72
- db: Configuration[:redis][:db],
73
- )
74
- )
75
- end
76
- end
77
-
78
- def crawl resource
79
- page = super
80
- after_crawl(resource, page)
81
- self.class.resource_status(resource, 'done')
82
- page
83
- end
84
-
85
- def after_crawl resource, page
86
- page.css('a').each do |a|
87
- next if a['href'].nil?
88
-
89
- follow = self.class.notify(:follow_if, a['href']).any?
90
- self.class.follow(a['href']) if follow
91
- end
92
- end
93
- end
94
-
95
- end