kabutops 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c9d30b351bb730c7b4b6d44de1dfb4b34630c702
4
- data.tar.gz: b39802e0b31ad087cd96259957d16624307fe011
3
+ metadata.gz: 9f6f70b8bec88cff6caf4d04cb1e24e058e411c0
4
+ data.tar.gz: c7c77ccc1785e79e2b18b7c00aeae9d8d80954d3
5
5
  SHA512:
6
- metadata.gz: 6ff995f573eeafac0895a0f8095161bda425f970a6095c637e0cf71028e122348a9b7ea2c65cd62e20d013efacb5178a0aa7c26fa6ed9b8e753b5dd61d9a744e
7
- data.tar.gz: f9470dee147282accc9dd9a31debd11706f634e14679f085242febdc874aded7ae9e09439ae3e2a77f55892f6a53e6b887f9832bbeebf5a51bdff694d5fd57f8
6
+ metadata.gz: 1b85f06cbc5310797e30d7cbf4397a4ae08843e4377f478fffecb4a28db4f80884c3a9bf77542e257480fdb2c6a9ca940d35274d215b6eb0fbbf7004fad4ed39
7
+ data.tar.gz: f77ef30437460ca717e9e87dcdae84ff5509ff25c3983783bec803d624f7721ca796e3ec222934549b82a68c07821cbb8f6af0f74ee977fe8a91a78557bca4a4
@@ -41,7 +41,7 @@ module Kabutops
41
41
  raise NotImplementedError
42
42
  end
43
43
 
44
- def nested?
44
+ def find resource
45
45
  raise NotImplementedError
46
46
  end
47
47
  end
@@ -68,10 +68,6 @@ module Kabutops
68
68
  result['hits']['hits'].map{ |hit| hit['_source'] }
69
69
  end
70
70
 
71
- def nested?
72
- true
73
- end
74
-
75
71
  protected
76
72
 
77
73
  def client
@@ -21,10 +21,6 @@ module Kabutops
21
21
  end
22
22
  end
23
23
 
24
- def nested?
25
- true
26
- end
27
-
28
24
  protected
29
25
 
30
26
  def client
@@ -13,8 +13,8 @@ module Kabutops
13
13
  client[result[:id]] = JSON.dump(result.to_hash)
14
14
  end
15
15
 
16
- def nested?
17
- true
16
+ def find resource
17
+ client[resource[:id] || resource[:url]]
18
18
  end
19
19
 
20
20
  protected
@@ -64,22 +64,13 @@ module Kabutops
64
64
  resource = Hashie::Mash.new(resource)
65
65
 
66
66
  adapters = self.class.adapters.select do |adapter|
67
- if params.skip_existing && adapter.respond_to?(:find)
68
- adapter.find(resource).nil?
69
- else
70
- true
71
- end
67
+ params.skip_existing ? adapter.find(resource).nil? : true
72
68
  end
73
69
 
74
70
  return if adapters.nil?
75
-
76
71
  page = crawl(resource)
77
-
78
72
  return if page.nil?
79
-
80
- save = (self.class.notify(:store_if, resource, page) || []).all?
81
-
82
- return unless save
73
+ return unless (self.class.notify(:store_if, resource, page) || []).all?
83
74
 
84
75
  adapters.each do |adapter|
85
76
  adapter.process(resource, page)
@@ -105,19 +96,7 @@ module Kabutops
105
96
  end
106
97
 
107
98
  def crawl resource
108
- page = nil
109
- cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
110
-
111
- content = Cachy.cache_if(params.cache, cache_key) do
112
- sleep params[:wait] || 0 # wait only if value is not from cache
113
- body = agent.get(resource[:url]).body
114
- body.encode!('utf-8', params[:encoding]) if params[:encoding]
115
- page = Nokogiri::HTML(body)
116
- self.class.notify(:before_cache, resource, page)
117
- body
118
- end
119
-
120
- page = Nokogiri::HTML(content) if page.nil?
99
+ page = get_cache_or_hit(resource)
121
100
  self.class.notify(:after_crawl, resource, page)
122
101
  page
123
102
  rescue Mechanize::ResponseCodeError => e
@@ -129,6 +108,26 @@ module Kabutops
129
108
  end
130
109
  end
131
110
 
111
+ def get_cache_or_hit resource
112
+ cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
113
+ page = nil
114
+
115
+ content = Cachy.cache_if(params.cache, cache_key) do
116
+ sleep params[:wait] || 0 # wait only if value is not from cache
117
+ page = get_page(resource[:url])
118
+ self.class.notify(:before_cache, resource, page)
119
+ page.to_s
120
+ end
121
+
122
+ page ? page : Nokogiri::HTML(content)
123
+ end
124
+
125
+ def get_page url
126
+ body = agent.get(url).body
127
+ body.encode!('utf-8', params[:encoding]) if params[:encoding]
128
+ Nokogiri::HTML(body)
129
+ end
130
+
132
131
  def agent
133
132
  if params[:agent].is_a?(Proc)
134
133
  @agent = params[:agent].call
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.1.6'
4
+ VERSION = '0.1.7'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-11 00:00:00.000000000 Z
11
+ date: 2014-09-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.7'
19
+ version: 2.6.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.7'
26
+ version: 2.6.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: cachy
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -249,3 +249,4 @@ specification_version: 4
249
249
  summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
250
250
  for an anonymity.
251
251
  test_files: []
252
+ has_rdoc: