kabutops 0.0.11 → 0.0.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 491dade458c54b02dbcdd36da92010fcaa1a5463
4
- data.tar.gz: 9d00eb9b6ad5e61ddb27dfa7137f0734e9eda03f
3
+ metadata.gz: 0b4f48640fb6de941353f09dee76176c4ce9ff07
4
+ data.tar.gz: 41173f3f0826601540cdcd783da88c4c4dae9169
5
5
  SHA512:
6
- metadata.gz: 9d00a7d31dfa94e85c9f2d31c49d56b3df2da5fd1b0580c769724f64bad12b331bc8c87a3432071944bc427543458ffe0257743ba5f6dd1840e7e0a8674e7e88
7
- data.tar.gz: aa6a67c2dcbaf8e14cf6d4a38e94b78979ef9f594ad492e5618bd3d1db36fbca305895e1c756cae2f355d21045fd82489ee315eb7150a2cac7f17d6fbc744fdb
6
+ metadata.gz: 72097b1fa2b063c628fe40c1f02557c40ee4eba0c25e020140511e6154b1e3bcf5b05d89d07c1cd10aaeec6b1b54f03fccd9b25415bf7bfa11661c2403d1a995
7
+ data.tar.gz: c1d42e0efdcc0ba81dfd3fe7c885786c7e639d7a022c00d9999bfa48cd1c75d7d0cc7b8be13c143ce9e84ee017ce3571f24ecc024967c3fd9859af628d296abd
data/README.md CHANGED
@@ -24,7 +24,7 @@ gem install kabutops
24
24
  Or you can put it in your Gemfile
25
25
 
26
26
  ```ruby
27
- gem 'kabutops', '~> 0.0.11'
27
+ gem 'kabutops', '~> 0.0.12'
28
28
  ```
29
29
 
30
30
  You will also need Redis database installed and running.
@@ -17,7 +17,7 @@ module Kabutops
17
17
 
18
18
  params :collection, :proxy, :cache, :wait,
19
19
  :skip_existing, :agent
20
- callbacks :after_crawl
20
+ callbacks :after_crawl, :before_cache
21
21
 
22
22
  def adapters
23
23
  @adapters ||= []
@@ -99,13 +99,18 @@ module Kabutops
99
99
  end
100
100
 
101
101
  def crawl resource
102
- cache_key = (resource[:id] || resource[:url]).to_s
102
+ page = nil
103
+ cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
104
+
103
105
  content = Cachy.cache_if(params.cache, cache_key) do
104
106
  sleep params[:wait] || 0 # wait only if value is not from cache
105
- agent.get(resource[:url]).body
107
+ body = agent.get(resource[:url]).body
108
+ page = Nokogiri::HTML(body)
109
+ self.class.notify(:before_cache, resource, page)
110
+ body
106
111
  end
107
112
 
108
- page = Nokogiri::HTML(content)
113
+ page = Nokogiri::HTML(content) if page.nil?
109
114
  self.class.notify(:after_crawl, resource, page)
110
115
  page
111
116
  end
@@ -5,7 +5,7 @@ module Kabutops
5
5
  class Spider < Crawler
6
6
  class << self
7
7
  params :url
8
- callbacks :after_crawl, :follow_if
8
+ callbacks :after_crawl, :before_cache, :follow_if
9
9
 
10
10
  def debug_spider
11
11
  enable_debug
@@ -14,6 +14,10 @@ module Kabutops
14
14
  })
15
15
  end
16
16
 
17
+ def crawl collection=nil
18
+ super(collection || [{ url: params.url, }])
19
+ end
20
+
17
21
  def << resource
18
22
  if resource_status(resource).nil?
19
23
  resource_status(resource, 'new')
@@ -72,7 +76,7 @@ module Kabutops
72
76
  follow = self.class.notify(:follow_if, a['href']).any?
73
77
  if follow
74
78
  self << {
75
- url: a['href'],
79
+ url: URI.join(params.url, URI.escape(a['href'])).to_s
76
80
  }
77
81
  end
78
82
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.0.11'
4
+ VERSION = '0.0.12'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.11
4
+ version: 0.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-15 00:00:00.000000000 Z
11
+ date: 2014-07-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize