kabutops 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c9d30b351bb730c7b4b6d44de1dfb4b34630c702
4
- data.tar.gz: b39802e0b31ad087cd96259957d16624307fe011
3
+ metadata.gz: 9f6f70b8bec88cff6caf4d04cb1e24e058e411c0
4
+ data.tar.gz: c7c77ccc1785e79e2b18b7c00aeae9d8d80954d3
5
5
  SHA512:
6
- metadata.gz: 6ff995f573eeafac0895a0f8095161bda425f970a6095c637e0cf71028e122348a9b7ea2c65cd62e20d013efacb5178a0aa7c26fa6ed9b8e753b5dd61d9a744e
7
- data.tar.gz: f9470dee147282accc9dd9a31debd11706f634e14679f085242febdc874aded7ae9e09439ae3e2a77f55892f6a53e6b887f9832bbeebf5a51bdff694d5fd57f8
6
+ metadata.gz: 1b85f06cbc5310797e30d7cbf4397a4ae08843e4377f478fffecb4a28db4f80884c3a9bf77542e257480fdb2c6a9ca940d35274d215b6eb0fbbf7004fad4ed39
7
+ data.tar.gz: f77ef30437460ca717e9e87dcdae84ff5509ff25c3983783bec803d624f7721ca796e3ec222934549b82a68c07821cbb8f6af0f74ee977fe8a91a78557bca4a4
@@ -41,7 +41,7 @@ module Kabutops
41
41
  raise NotImplementedError
42
42
  end
43
43
 
44
- def nested?
44
+ def find resource
45
45
  raise NotImplementedError
46
46
  end
47
47
  end
@@ -68,10 +68,6 @@ module Kabutops
68
68
  result['hits']['hits'].map{ |hit| hit['_source'] }
69
69
  end
70
70
 
71
- def nested?
72
- true
73
- end
74
-
75
71
  protected
76
72
 
77
73
  def client
@@ -21,10 +21,6 @@ module Kabutops
21
21
  end
22
22
  end
23
23
 
24
- def nested?
25
- true
26
- end
27
-
28
24
  protected
29
25
 
30
26
  def client
@@ -13,8 +13,8 @@ module Kabutops
13
13
  client[result[:id]] = JSON.dump(result.to_hash)
14
14
  end
15
15
 
16
- def nested?
17
- true
16
+ def find resource
17
+ client[resource[:id] || resource[:url]]
18
18
  end
19
19
 
20
20
  protected
@@ -64,22 +64,13 @@ module Kabutops
64
64
  resource = Hashie::Mash.new(resource)
65
65
 
66
66
  adapters = self.class.adapters.select do |adapter|
67
- if params.skip_existing && adapter.respond_to?(:find)
68
- adapter.find(resource).nil?
69
- else
70
- true
71
- end
67
+ params.skip_existing ? adapter.find(resource).nil? : true
72
68
  end
73
69
 
74
70
  return if adapters.nil?
75
-
76
71
  page = crawl(resource)
77
-
78
72
  return if page.nil?
79
-
80
- save = (self.class.notify(:store_if, resource, page) || []).all?
81
-
82
- return unless save
73
+ return unless (self.class.notify(:store_if, resource, page) || []).all?
83
74
 
84
75
  adapters.each do |adapter|
85
76
  adapter.process(resource, page)
@@ -105,19 +96,7 @@ module Kabutops
105
96
  end
106
97
 
107
98
  def crawl resource
108
- page = nil
109
- cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
110
-
111
- content = Cachy.cache_if(params.cache, cache_key) do
112
- sleep params[:wait] || 0 # wait only if value is not from cache
113
- body = agent.get(resource[:url]).body
114
- body.encode!('utf-8', params[:encoding]) if params[:encoding]
115
- page = Nokogiri::HTML(body)
116
- self.class.notify(:before_cache, resource, page)
117
- body
118
- end
119
-
120
- page = Nokogiri::HTML(content) if page.nil?
99
+ page = get_cache_or_hit(resource)
121
100
  self.class.notify(:after_crawl, resource, page)
122
101
  page
123
102
  rescue Mechanize::ResponseCodeError => e
@@ -129,6 +108,26 @@ module Kabutops
129
108
  end
130
109
  end
131
110
 
111
+ def get_cache_or_hit resource
112
+ cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
113
+ page = nil
114
+
115
+ content = Cachy.cache_if(params.cache, cache_key) do
116
+ sleep params[:wait] || 0 # wait only if value is not from cache
117
+ page = get_page(resource[:url])
118
+ self.class.notify(:before_cache, resource, page)
119
+ page.to_s
120
+ end
121
+
122
+ page ? page : Nokogiri::HTML(content)
123
+ end
124
+
125
+ def get_page url
126
+ body = agent.get(url).body
127
+ body.encode!('utf-8', params[:encoding]) if params[:encoding]
128
+ Nokogiri::HTML(body)
129
+ end
130
+
132
131
  def agent
133
132
  if params[:agent].is_a?(Proc)
134
133
  @agent = params[:agent].call
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.1.6'
4
+ VERSION = '0.1.7'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-11 00:00:00.000000000 Z
11
+ date: 2014-09-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.7'
19
+ version: 2.6.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.7'
26
+ version: 2.6.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: cachy
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -249,3 +249,4 @@ specification_version: 4
249
249
  summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
250
250
  for an anonymity.
251
251
  test_files: []
252
+ has_rdoc: