kabutops 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -5
- data/lib/kabutops.rb +1 -3
- data/lib/kabutops/configuration.rb +5 -0
- data/lib/kabutops/crawler.rb +24 -20
- data/lib/kabutops/spider.rb +82 -0
- data/lib/kabutops/version.rb +1 -1
- metadata +3 -18
- data/lib/kabutops/adapters/sequel.rb +0 -47
- data/lib/kabutops/crawler_extensions/sequel.rb +0 -19
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 491dade458c54b02dbcdd36da92010fcaa1a5463
         | 
| 4 | 
            +
              data.tar.gz: 9d00eb9b6ad5e61ddb27dfa7137f0734e9eda03f
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 9d00a7d31dfa94e85c9f2d31c49d56b3df2da5fd1b0580c769724f64bad12b331bc8c87a3432071944bc427543458ffe0257743ba5f6dd1840e7e0a8674e7e88
         | 
| 7 | 
            +
              data.tar.gz: aa6a67c2dcbaf8e14cf6d4a38e94b78979ef9f594ad492e5618bd3d1db36fbca305895e1c756cae2f355d21045fd82489ee315eb7150a2cac7f17d6fbc744fdb
         | 
    
        data/README.md
    CHANGED
    
    | @@ -8,7 +8,6 @@ With Kabutops you can save data easily to: | |
| 8 8 | 
             
            * ElasticSearch
         | 
| 9 9 | 
             
            * MongoDB
         | 
| 10 10 | 
             
            * Redis
         | 
| 11 | 
            -
            * SQL Databases (via Sequel)
         | 
| 12 11 |  | 
| 13 12 | 
             
            Examples for every kind of database are located
         | 
| 14 13 | 
             
            in the [examples directory](https://github.com/reneklacan/kabutops/tree/master/examples)
         | 
| @@ -25,9 +24,11 @@ gem install kabutops | |
| 25 24 | 
             
            Or you can put it in your Gemfile
         | 
| 26 25 |  | 
| 27 26 | 
             
            ```ruby
         | 
| 28 | 
            -
            gem 'kabutops', '~> 0.0. | 
| 27 | 
            +
            gem 'kabutops', '~> 0.0.11'
         | 
| 29 28 | 
             
            ```
         | 
| 30 29 |  | 
| 30 | 
            +
            You will also need a Redis database installed and running.
         | 
| 31 | 
            +
             | 
| 31 32 | 
             
            Basic example
         | 
| 32 33 | 
             
            -------------
         | 
| 33 34 |  | 
| @@ -199,12 +200,28 @@ class MyCrawler < Kabutops::Crawler | |
| 199 200 | 
             
            end
         | 
| 200 201 | 
             
            ```
         | 
| 201 202 |  | 
| 203 | 
            +
            Javascript heavy site
         | 
| 204 | 
            +
            ---------------------
         | 
| 205 | 
            +
             | 
| 206 | 
            +
            Crawling this kind of site can be achieved by using a non-default agent
         | 
| 207 | 
            +
            (default is Mechanize.new).
         | 
| 208 | 
            +
             | 
| 209 | 
            +
            ```ruby
         | 
| 210 | 
            +
            class MyCrawler < Kabutops::Crawler
         | 
| 211 | 
            +
              ...
         | 
| 212 | 
            +
              agent Bogeyman::Client.new
         | 
| 213 | 
            +
              ...
         | 
| 214 | 
            +
            end
         | 
| 215 | 
            +
            ```
         | 
| 216 | 
            +
             | 
| 217 | 
            +
            [Bogeyman](https://github.com/reneklacan/bogeyman-ruby-client)
         | 
| 218 | 
            +
            is a wrapper built upon PhantomJS.
         | 
| 219 | 
            +
             | 
| 202 220 | 
             
            TODO
         | 
| 203 221 | 
             
            ----
         | 
| 204 222 |  | 
| 205 | 
            -
            * Watchdog for Mongo | 
| 206 | 
            -
            * skip_existing for Mongo, Redis | 
| 207 | 
            -
            * Spider
         | 
| 223 | 
            +
            * Watchdog for Mongo
         | 
| 224 | 
            +
            * skip_existing feature for Mongo, Redis
         | 
| 208 225 |  | 
| 209 226 | 
             
            License
         | 
| 210 227 | 
             
            -------
         | 
    
        data/lib/kabutops.rb
    CHANGED
    
    | @@ -11,7 +11,6 @@ require 'elasticsearch' | |
| 11 11 | 
             
            require 'redis'
         | 
| 12 12 | 
             
            require 'redis-namespace'
         | 
| 13 13 | 
             
            require 'mongo'
         | 
| 14 | 
            -
            require 'sequel'
         | 
| 15 14 | 
             
            require 'mysql2'
         | 
| 16 15 | 
             
            require 'logger'
         | 
| 17 16 |  | 
| @@ -30,12 +29,11 @@ require 'kabutops/adapters/database_adapter' | |
| 30 29 | 
             
            require 'kabutops/adapters/elastic_search'
         | 
| 31 30 | 
             
            require 'kabutops/adapters/redis'
         | 
| 32 31 | 
             
            require 'kabutops/adapters/mongo'
         | 
| 33 | 
            -
            require 'kabutops/adapters/sequel'
         | 
| 34 32 | 
             
            require 'kabutops/crawler_extensions/elastic_search'
         | 
| 35 33 | 
             
            require 'kabutops/crawler_extensions/redis'
         | 
| 36 34 | 
             
            require 'kabutops/crawler_extensions/mongo'
         | 
| 37 | 
            -
            require 'kabutops/crawler_extensions/sequel'
         | 
| 38 35 | 
             
            require 'kabutops/crawler_extensions/pstore_storage'
         | 
| 39 36 | 
             
            require 'kabutops/crawler_extensions/debugging'
         | 
| 40 37 | 
             
            require 'kabutops/crawler'
         | 
| 41 38 | 
             
            require 'kabutops/watchdog'
         | 
| 39 | 
            +
            require 'kabutops/spider'
         | 
    
        data/lib/kabutops/crawler.rb
    CHANGED
    
    | @@ -9,14 +9,14 @@ module Kabutops | |
| 9 9 | 
             
                include CrawlerExtensions::ElasticSearch
         | 
| 10 10 | 
             
                include CrawlerExtensions::Redis
         | 
| 11 11 | 
             
                include CrawlerExtensions::Mongo
         | 
| 12 | 
            -
                include CrawlerExtensions::Sequel
         | 
| 13 12 | 
             
                include Sidekiq::Worker
         | 
| 14 13 |  | 
| 15 14 | 
             
                class << self
         | 
| 16 15 | 
             
                  include Extensions::Parameterable
         | 
| 17 16 | 
             
                  include Extensions::CallbackSupport
         | 
| 18 17 |  | 
| 19 | 
            -
                  params :collection, :proxy, :cache, :wait, | 
| 18 | 
            +
                  params :collection, :proxy, :cache, :wait,
         | 
| 19 | 
            +
                         :skip_existing, :agent
         | 
| 20 20 | 
             
                  callbacks :after_crawl
         | 
| 21 21 |  | 
| 22 22 | 
             
                  def adapters
         | 
| @@ -62,29 +62,29 @@ module Kabutops | |
| 62 62 |  | 
| 63 63 | 
             
                def perform resource
         | 
| 64 64 | 
             
                  resource = Hashie::Mash.new(resource)
         | 
| 65 | 
            -
                  adapters = self.class.adapters
         | 
| 66 | 
            -
             | 
| 67 | 
            -
                  if self.class.params.skip_existing
         | 
| 68 | 
            -
                    adapters = self.class.adapters.select do |adapter|
         | 
| 69 | 
            -
                      if adapter.respond_to? :find
         | 
| 70 | 
            -
                        adapter.find(resource).nil?
         | 
| 71 | 
            -
                      else
         | 
| 72 | 
            -
                        true
         | 
| 73 | 
            -
                      end
         | 
| 74 | 
            -
                    end
         | 
| 75 65 |  | 
| 76 | 
            -
             | 
| 66 | 
            +
                  adapters = self.class.adapters.select do |adapter|
         | 
| 67 | 
            +
                    if params.skip_existing && adapter.respond_to?(:find)
         | 
| 68 | 
            +
                      adapter.find(resource).nil?
         | 
| 69 | 
            +
                    else
         | 
| 70 | 
            +
                      true
         | 
| 71 | 
            +
                    end
         | 
| 77 72 | 
             
                  end
         | 
| 78 73 |  | 
| 74 | 
            +
                  return if adapters.nil?
         | 
| 75 | 
            +
             | 
| 79 76 | 
             
                  page = crawl(resource)
         | 
| 80 77 |  | 
| 81 78 | 
             
                  adapters.each do |adapter|
         | 
| 82 79 | 
             
                    adapter.process(resource, page)
         | 
| 83 80 | 
             
                  end
         | 
| 84 81 | 
             
                rescue Exception => e
         | 
| 85 | 
            -
                   | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 82 | 
            +
                  unless self.class.debug
         | 
| 83 | 
            +
                    logger.error(e.message)
         | 
| 84 | 
            +
                    logger.error(e.backtrace.join("\n"))
         | 
| 85 | 
            +
                  end
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                  sleep params[:wait] || 0
         | 
| 88 88 | 
             
                  raise e
         | 
| 89 89 | 
             
                end
         | 
| 90 90 |  | 
| @@ -94,10 +94,14 @@ module Kabutops | |
| 94 94 |  | 
| 95 95 | 
             
                protected
         | 
| 96 96 |  | 
| 97 | 
            +
                def params
         | 
| 98 | 
            +
                  self.class.params
         | 
| 99 | 
            +
                end
         | 
| 100 | 
            +
             | 
| 97 101 | 
             
                def crawl resource
         | 
| 98 102 | 
             
                  cache_key = (resource[:id] || resource[:url]).to_s
         | 
| 99 | 
            -
                  content = Cachy.cache_if( | 
| 100 | 
            -
                    sleep  | 
| 103 | 
            +
                  content = Cachy.cache_if(params.cache, cache_key) do
         | 
| 104 | 
            +
                    sleep params[:wait] || 0 # wait only if value is not from cache
         | 
| 101 105 | 
             
                    agent.get(resource[:url]).body
         | 
| 102 106 | 
             
                  end
         | 
| 103 107 |  | 
| @@ -108,8 +112,8 @@ module Kabutops | |
| 108 112 |  | 
| 109 113 | 
             
                def agent
         | 
| 110 114 | 
             
                  unless @agent
         | 
| 111 | 
            -
                    @agent = Mechanize.new
         | 
| 112 | 
            -
                    @agent.set_proxy(* | 
| 115 | 
            +
                    @agent = params[:agent] || Mechanize.new
         | 
| 116 | 
            +
                    @agent.set_proxy(*params[:proxy]) if params[:proxy]
         | 
| 113 117 | 
             
                  end
         | 
| 114 118 |  | 
| 115 119 | 
             
                  @agent
         | 
| @@ -0,0 +1,82 @@ | |
| 1 | 
            +
            # -*- encoding : utf-8 -*-
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Kabutops
         | 
| 4 | 
            +
             | 
| 5 | 
            +
              class Spider < Crawler
         | 
| 6 | 
            +
                class << self
         | 
| 7 | 
            +
                  params :url
         | 
| 8 | 
            +
                  callbacks :after_crawl, :follow_if
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                  def debug_spider
         | 
| 11 | 
            +
                    enable_debug
         | 
| 12 | 
            +
                    self.new.perform({
         | 
| 13 | 
            +
                      url: params[:url]
         | 
| 14 | 
            +
                    })
         | 
| 15 | 
            +
                  end
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                  def << resource
         | 
| 18 | 
            +
                    if resource_status(resource).nil?
         | 
| 19 | 
            +
                      resource_status(resource, 'new')
         | 
| 20 | 
            +
                      super
         | 
| 21 | 
            +
                    end
         | 
| 22 | 
            +
                  end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                  def resource_status resource, status=nil
         | 
| 25 | 
            +
                    url_status(resource[:url], status)
         | 
| 26 | 
            +
                  end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                  def url_status url, status=nil
         | 
| 29 | 
            +
                    key = redis_key(url)
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                    if status
         | 
| 32 | 
            +
                      redis.set(
         | 
| 33 | 
            +
                        key,
         | 
| 34 | 
            +
                        JSON.dump({
         | 
| 35 | 
            +
                          url: url,
         | 
| 36 | 
            +
                          status: status,
         | 
| 37 | 
            +
                        })
         | 
| 38 | 
            +
                      )
         | 
| 39 | 
            +
                    else
         | 
| 40 | 
            +
                      item = redis.get(key)
         | 
| 41 | 
            +
                      item ? JSON.parse(item)['status'] : nil
         | 
| 42 | 
            +
                    end
         | 
| 43 | 
            +
                  end
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                  protected
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                  def redis_key string
         | 
| 48 | 
            +
                    Digest::SHA256.hexdigest(string)
         | 
| 49 | 
            +
                  end
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                  def redis
         | 
| 52 | 
            +
                    @redis ||= ::Redis::Namespace.new(
         | 
| 53 | 
            +
                      self.to_s,
         | 
| 54 | 
            +
                      redis: ::Redis.new(
         | 
| 55 | 
            +
                        host: Configuration[:redis][:host],
         | 
| 56 | 
            +
                        port: Configuration[:redis][:port],
         | 
| 57 | 
            +
                        db: Configuration[:redis][:db],
         | 
| 58 | 
            +
                      )
         | 
| 59 | 
            +
                    )
         | 
| 60 | 
            +
                  end
         | 
| 61 | 
            +
                end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                def crawl resource
         | 
| 64 | 
            +
                  page = super
         | 
| 65 | 
            +
                  after_crawl(resource, page)
         | 
| 66 | 
            +
                  self.class.resource_status(resource, 'done')
         | 
| 67 | 
            +
                  page
         | 
| 68 | 
            +
                end
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                def after_crawl resource, page
         | 
| 71 | 
            +
                  page.css('a').each do |a|
         | 
| 72 | 
            +
                    follow = self.class.notify(:follow_if, a['href']).any?
         | 
| 73 | 
            +
                    if follow
         | 
| 74 | 
            +
                      self << {
         | 
| 75 | 
            +
                        url: a['href'],
         | 
| 76 | 
            +
                      }
         | 
| 77 | 
            +
                    end
         | 
| 78 | 
            +
                  end
         | 
| 79 | 
            +
                end
         | 
| 80 | 
            +
              end
         | 
| 81 | 
            +
             | 
| 82 | 
            +
            end
         | 
    
        data/lib/kabutops/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: kabutops
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.0. | 
| 4 | 
            +
              version: 0.0.11
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Rene Klacan
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2014-07- | 
| 11 | 
            +
            date: 2014-07-15 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: mechanize
         | 
| @@ -150,20 +150,6 @@ dependencies: | |
| 150 150 | 
             
                - - "~>"
         | 
| 151 151 | 
             
                  - !ruby/object:Gem::Version
         | 
| 152 152 | 
             
                    version: '1.10'
         | 
| 153 | 
            -
            - !ruby/object:Gem::Dependency
         | 
| 154 | 
            -
              name: sequel
         | 
| 155 | 
            -
              requirement: !ruby/object:Gem::Requirement
         | 
| 156 | 
            -
                requirements:
         | 
| 157 | 
            -
                - - "~>"
         | 
| 158 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 159 | 
            -
                    version: '4.11'
         | 
| 160 | 
            -
              type: :runtime
         | 
| 161 | 
            -
              prerelease: false
         | 
| 162 | 
            -
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 163 | 
            -
                requirements:
         | 
| 164 | 
            -
                - - "~>"
         | 
| 165 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 166 | 
            -
                    version: '4.11'
         | 
| 167 153 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 168 154 | 
             
              name: bson_ext
         | 
| 169 155 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -235,7 +221,6 @@ files: | |
| 235 221 | 
             
            - lib/kabutops/adapters/elastic_search.rb
         | 
| 236 222 | 
             
            - lib/kabutops/adapters/mongo.rb
         | 
| 237 223 | 
             
            - lib/kabutops/adapters/redis.rb
         | 
| 238 | 
            -
            - lib/kabutops/adapters/sequel.rb
         | 
| 239 224 | 
             
            - lib/kabutops/configuration.rb
         | 
| 240 225 | 
             
            - lib/kabutops/crawler.rb
         | 
| 241 226 | 
             
            - lib/kabutops/crawler_extensions/debugging.rb
         | 
| @@ -243,13 +228,13 @@ files: | |
| 243 228 | 
             
            - lib/kabutops/crawler_extensions/mongo.rb
         | 
| 244 229 | 
             
            - lib/kabutops/crawler_extensions/pstore_storage.rb
         | 
| 245 230 | 
             
            - lib/kabutops/crawler_extensions/redis.rb
         | 
| 246 | 
            -
            - lib/kabutops/crawler_extensions/sequel.rb
         | 
| 247 231 | 
             
            - lib/kabutops/extensions/callback_support.rb
         | 
| 248 232 | 
             
            - lib/kabutops/extensions/includable.rb
         | 
| 249 233 | 
             
            - lib/kabutops/extensions/logging.rb
         | 
| 250 234 | 
             
            - lib/kabutops/extensions/parameterable.rb
         | 
| 251 235 | 
             
            - lib/kabutops/recipe.rb
         | 
| 252 236 | 
             
            - lib/kabutops/recipe_item.rb
         | 
| 237 | 
            +
            - lib/kabutops/spider.rb
         | 
| 253 238 | 
             
            - lib/kabutops/version.rb
         | 
| 254 239 | 
             
            - lib/kabutops/watchdog.rb
         | 
| 255 240 | 
             
            homepage: https://github.com/reneklacan/kabutops
         | 
| @@ -1,47 +0,0 @@ | |
| 1 | 
            -
            # -*- encoding : utf-8 -*-
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            module Kabutops
         | 
| 4 | 
            -
             | 
| 5 | 
            -
              module Adapters
         | 
| 6 | 
            -
             | 
| 7 | 
            -
                class Sequel < DatabaseAdapter
         | 
| 8 | 
            -
                  include Extensions::Parameterable
         | 
| 9 | 
            -
             | 
| 10 | 
            -
                  params :connect, :type, :host, :port,
         | 
| 11 | 
            -
                         :db, :user, :password, :table
         | 
| 12 | 
            -
             | 
| 13 | 
            -
                  def store result
         | 
| 14 | 
            -
                    client_table.insert(result)
         | 
| 15 | 
            -
                  end
         | 
| 16 | 
            -
             | 
| 17 | 
            -
                  def nested?
         | 
| 18 | 
            -
                    false
         | 
| 19 | 
            -
                  end
         | 
| 20 | 
            -
             | 
| 21 | 
            -
                  protected
         | 
| 22 | 
            -
             | 
| 23 | 
            -
                  def client
         | 
| 24 | 
            -
                    return @@client if defined?(@@client)
         | 
| 25 | 
            -
             | 
| 26 | 
            -
                    if params[:connect]
         | 
| 27 | 
            -
                      @@client ||= ::Sequel.connect(params[:connect])
         | 
| 28 | 
            -
                    else
         | 
| 29 | 
            -
                      @@client ||= ::Sequel.connect(
         | 
| 30 | 
            -
                        adapter: params[:type] || 'mysql2',
         | 
| 31 | 
            -
                        user: params[:user] || 'root',
         | 
| 32 | 
            -
                        password: params[:password] || 'root',
         | 
| 33 | 
            -
                        host: params[:host] || 'localhost',
         | 
| 34 | 
            -
                        port: params[:port] || 3306,
         | 
| 35 | 
            -
                        database: params[:db] || params[:database] || 'kabutops',
         | 
| 36 | 
            -
                      )
         | 
| 37 | 
            -
                    end
         | 
| 38 | 
            -
                  end
         | 
| 39 | 
            -
             | 
| 40 | 
            -
                  def client_table
         | 
| 41 | 
            -
                    @@client_table ||= client[params[:table]]
         | 
| 42 | 
            -
                  end
         | 
| 43 | 
            -
                end
         | 
| 44 | 
            -
             | 
| 45 | 
            -
              end
         | 
| 46 | 
            -
             | 
| 47 | 
            -
            end
         | 
| @@ -1,19 +0,0 @@ | |
| 1 | 
            -
            # -*- encoding : utf-8 -*-
         | 
| 2 | 
            -
             | 
| 3 | 
            -
            module Kabutops
         | 
| 4 | 
            -
             | 
| 5 | 
            -
              module CrawlerExtensions
         | 
| 6 | 
            -
             | 
| 7 | 
            -
                module Sequel
         | 
| 8 | 
            -
                  extend Extensions::Includable
         | 
| 9 | 
            -
             | 
| 10 | 
            -
                  module ClassMethods
         | 
| 11 | 
            -
                    def sequel &block
         | 
| 12 | 
            -
                      adapters << Adapters::Sequel.new(&block)
         | 
| 13 | 
            -
                    end
         | 
| 14 | 
            -
                  end
         | 
| 15 | 
            -
                end
         | 
| 16 | 
            -
             | 
| 17 | 
            -
              end
         | 
| 18 | 
            -
             | 
| 19 | 
            -
            end
         |