kabutops 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 9f6f70b8bec88cff6caf4d04cb1e24e058e411c0
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: c7c77ccc1785e79e2b18b7c00aeae9d8d80954d3
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 1b85f06cbc5310797e30d7cbf4397a4ae08843e4377f478fffecb4a28db4f80884c3a9bf77542e257480fdb2c6a9ca940d35274d215b6eb0fbbf7004fad4ed39
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: f77ef30437460ca717e9e87dcdae84ff5509ff25c3983783bec803d624f7721ca796e3ec222934549b82a68c07821cbb8f6af0f74ee977fe8a91a78557bca4a4
         
     | 
    
        data/lib/kabutops/crawler.rb
    CHANGED
    
    | 
         @@ -64,22 +64,13 @@ module Kabutops 
     | 
|
| 
       64 
64 
     | 
    
         
             
                  resource = Hashie::Mash.new(resource)
         
     | 
| 
       65 
65 
     | 
    
         | 
| 
       66 
66 
     | 
    
         
             
                  adapters = self.class.adapters.select do |adapter|
         
     | 
| 
       67 
     | 
    
         
            -
                    if params.skip_existing
     | 
| 
       68 
     | 
    
         
            -
                      adapter.find(resource).nil?
         
     | 
| 
       69 
     | 
    
         
            -
                    else
         
     | 
| 
       70 
     | 
    
         
            -
                      true
         
     | 
| 
       71 
     | 
    
         
            -
                    end
         
     | 
| 
      
 67 
     | 
    
         
            +
                    params.skip_existing ? adapter.find(resource).nil? : true
         
     | 
| 
       72 
68 
     | 
    
         
             
                  end
         
     | 
| 
       73 
69 
     | 
    
         | 
| 
       74 
70 
     | 
    
         
             
                  return if adapters.nil?
         
     | 
| 
       75 
     | 
    
         
            -
             
     | 
| 
       76 
71 
     | 
    
         
             
                  page = crawl(resource)
         
     | 
| 
       77 
     | 
    
         
            -
             
     | 
| 
       78 
72 
     | 
    
         
             
                  return if page.nil?
         
     | 
| 
       79 
     | 
    
         
            -
             
     | 
| 
       80 
     | 
    
         
            -
                  save = (self.class.notify(:store_if, resource, page) || []).all?
         
     | 
| 
       81 
     | 
    
         
            -
             
     | 
| 
       82 
     | 
    
         
            -
                  return unless save
         
     | 
| 
      
 73 
     | 
    
         
            +
                  return unless (self.class.notify(:store_if, resource, page) || []).all?
         
     | 
| 
       83 
74 
     | 
    
         | 
| 
       84 
75 
     | 
    
         
             
                  adapters.each do |adapter|
         
     | 
| 
       85 
76 
     | 
    
         
             
                    adapter.process(resource, page)
         
     | 
| 
         @@ -105,19 +96,7 @@ module Kabutops 
     | 
|
| 
       105 
96 
     | 
    
         
             
                end
         
     | 
| 
       106 
97 
     | 
    
         | 
| 
       107 
98 
     | 
    
         
             
                def crawl resource
         
     | 
| 
       108 
     | 
    
         
            -
                  page = nil
     | 
| 
       109 
     | 
    
         
            -
                  cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
         
     | 
| 
       110 
     | 
    
         
            -
             
     | 
| 
       111 
     | 
    
         
            -
                  content = Cachy.cache_if(params.cache, cache_key) do
         
     | 
| 
       112 
     | 
    
         
            -
                    sleep params[:wait] || 0 # wait only if value is not from cache
         
     | 
| 
       113 
     | 
    
         
            -
                    body = agent.get(resource[:url]).body
         
     | 
| 
       114 
     | 
    
         
            -
                    body.encode!('utf-8', params[:encoding]) if params[:encoding]
         
     | 
| 
       115 
     | 
    
         
            -
                    page = Nokogiri::HTML(body)
         
     | 
| 
       116 
     | 
    
         
            -
                    self.class.notify(:before_cache, resource, page)
         
     | 
| 
       117 
     | 
    
         
            -
                    body
         
     | 
| 
       118 
     | 
    
         
            -
                  end
         
     | 
| 
       119 
     | 
    
         
            -
             
     | 
| 
       120 
     | 
    
         
            -
                  page = Nokogiri::HTML(content) if page.nil?
         
     | 
| 
      
 99 
     | 
    
         
            +
                  page = get_cache_or_hit(resource)
         
     | 
| 
       121 
100 
     | 
    
         
             
                  self.class.notify(:after_crawl, resource, page)
         
     | 
| 
       122 
101 
     | 
    
         
             
                  page
         
     | 
| 
       123 
102 
     | 
    
         
             
                rescue Mechanize::ResponseCodeError => e
         
     | 
| 
         @@ -129,6 +108,26 @@ module Kabutops 
     | 
|
| 
       129 
108 
     | 
    
         
             
                  end
         
     | 
| 
       130 
109 
     | 
    
         
             
                end
         
     | 
| 
       131 
110 
     | 
    
         | 
| 
      
 111 
     | 
    
         
            +
                def get_cache_or_hit resource
         
     | 
| 
      
 112 
     | 
    
         
            +
                  cache_key = (resource[:id] || Digest::SHA256.hexdigest(resource[:url])).to_s
         
     | 
| 
      
 113 
     | 
    
         
            +
                  page = nil
         
     | 
| 
      
 114 
     | 
    
         
            +
             
     | 
| 
      
 115 
     | 
    
         
            +
                  content = Cachy.cache_if(params.cache, cache_key) do
         
     | 
| 
      
 116 
     | 
    
         
            +
                    sleep params[:wait] || 0 # wait only if value is not from cache
         
     | 
| 
      
 117 
     | 
    
         
            +
                    page = get_page(resource[:url])
         
     | 
| 
      
 118 
     | 
    
         
            +
                    self.class.notify(:before_cache, resource, page)
         
     | 
| 
      
 119 
     | 
    
         
            +
                    page.to_s
         
     | 
| 
      
 120 
     | 
    
         
            +
                  end
         
     | 
| 
      
 121 
     | 
    
         
            +
             
     | 
| 
      
 122 
     | 
    
         
            +
                  page ? page : Nokogiri::HTML(content)
         
     | 
| 
      
 123 
     | 
    
         
            +
                end
         
     | 
| 
      
 124 
     | 
    
         
            +
             
     | 
| 
      
 125 
     | 
    
         
            +
                def get_page url
         
     | 
| 
      
 126 
     | 
    
         
            +
                  body = agent.get(url).body
         
     | 
| 
      
 127 
     | 
    
         
            +
                  body.encode!('utf-8', params[:encoding]) if params[:encoding]
         
     | 
| 
      
 128 
     | 
    
         
            +
                  Nokogiri::HTML(body)
         
     | 
| 
      
 129 
     | 
    
         
            +
                end
         
     | 
| 
      
 130 
     | 
    
         
            +
             
     | 
| 
       132 
131 
     | 
    
         
             
                def agent
         
     | 
| 
       133 
132 
     | 
    
         
             
                  if params[:agent].is_a?(Proc)
         
     | 
| 
       134 
133 
     | 
    
         
             
                    @agent = params[:agent].call
         
     | 
    
        data/lib/kabutops/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: kabutops
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1.6
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.7
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Rene Klacan
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2014- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2014-09-19 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: mechanize
         
     | 
| 
         @@ -16,14 +16,14 @@ dependencies: 
     | 
|
| 
       16 
16 
     | 
    
         
             
                requirements:
         
     | 
| 
       17 
17 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       18 
18 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       19 
     | 
    
         
            -
                    version:  
     | 
| 
      
 19 
     | 
    
         
            +
                    version: 2.6.0
         
     | 
| 
       20 
20 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       21 
21 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       22 
22 
     | 
    
         
             
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       23 
23 
     | 
    
         
             
                requirements:
         
     | 
| 
       24 
24 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       25 
25 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       26 
     | 
    
         
            -
                    version:  
     | 
| 
      
 26 
     | 
    
         
            +
                    version: 2.6.0
         
     | 
| 
       27 
27 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       28 
28 
     | 
    
         
             
              name: cachy
         
     | 
| 
       29 
29 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
         @@ -249,3 +249,4 @@ specification_version: 4 
     | 
|
| 
       249 
249 
     | 
    
         
             
            summary: Dead simple yet powerful Ruby crawler for easy parallel crawling with support
         
     | 
| 
       250 
250 
     | 
    
         
             
              for an anonymity.
         
     | 
| 
       251 
251 
     | 
    
         
             
            test_files: []
         
     | 
| 
      
 252 
     | 
    
         
            +
            has_rdoc: 
         
     |