web_crawler 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/README +22 -1
- data/lib/web_crawler.rb +2 -0
- data/lib/web_crawler/application.rb +33 -2
- data/lib/web_crawler/base.rb +113 -0
- data/lib/web_crawler/batch_request.rb +10 -4
- data/lib/web_crawler/cached_request.rb +16 -7
- data/lib/web_crawler/configuration.rb +5 -5
- data/lib/web_crawler/factory_url.rb +27 -7
- data/lib/web_crawler/follower.rb +11 -9
- data/lib/web_crawler/parsers.rb +1 -0
- data/lib/web_crawler/parsers/mapper.rb +114 -0
- data/lib/web_crawler/parsers/url.rb +3 -5
- data/lib/web_crawler/request.rb +14 -2
- data/lib/web_crawler/response.rb +2 -2
- data/lib/web_crawler/version.rb +2 -2
- data/lib/web_crawler/view.rb +1 -1
- data/lib/web_crawler/view/csv.rb +1 -1
- data/lib/web_crawler/view/json.rb +1 -1
- data/lib/web_crawler/view/yaml.rb +1 -1
- data/spec/fixtures/example.xml +171 -0
- data/spec/fixtures/my_crawler.rb +82 -0
- data/spec/fixtures/test_crawler.rb +108 -0
- data/spec/fixtures/test_crawler2.rb +77 -0
- data/spec/spec_helper.rb +8 -3
- data/spec/web_crawler/batch_request_spec.rb +0 -11
- data/spec/web_crawler/cached_request_spec.rb +17 -11
- data/spec/web_crawler/factory_url_spec.rb +19 -6
- data/spec/web_crawler/follow_spec.rb +11 -4
- data/spec/web_crawler/view_spec.rb +10 -10
- data/spec/web_crawler/web_crawler_api_base_class_spec.rb +143 -0
- data/web_crawler.gemspec +2 -0
- metadata +43 -8
    
        data/Gemfile
    CHANGED
    
    
    
        data/README
    CHANGED
    
    | @@ -1 +1,22 @@ | |
| 1 | 
            -
            Web crawler help you with parse and collect data from the web
         | 
| 1 | 
            +
            Web crawler helps you parse and collect data from the web
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            #TODO
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Base web crawler class for API present
         | 
| 6 | 
            +
            It should work like this:
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            class MyCrawler < WebCrawler::Base
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                target "www.example.com"
         | 
| 11 | 
            +
                target "www.example.com/page2"
         | 
| 12 | 
            +
                target %[www.example.com/contacts www.example.com/about]
         | 
| 13 | 
            +
                target "www.example.com/category_:category/page:page/", :categories => [1,2,3,4], :page => 1..100
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                target { call_advanced_logic_for_url_generating }
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                logger "path/to/log/file" # or Logger.new(...)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
                
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            end
         | 
    
        data/lib/web_crawler.rb
    CHANGED
    
    | @@ -6,6 +6,7 @@ require 'forwardable' | |
| 6 6 | 
             
            require "ext/hash"
         | 
| 7 7 | 
             
            require "ext/array"
         | 
| 8 8 | 
             
            require "ext/http_response"
         | 
| 9 | 
            +
            require "active_support/core_ext"
         | 
| 9 10 |  | 
| 10 11 | 
             
            module WebCrawler
         | 
| 11 12 | 
             
              autoload :Request, 'web_crawler/request'
         | 
| @@ -26,6 +27,7 @@ module WebCrawler | |
| 26 27 | 
             
              autoload :View, 'web_crawler/view'
         | 
| 27 28 | 
             
              autoload :CLI, 'web_crawler/cli'
         | 
| 28 29 | 
             
              autoload :Application, 'web_crawler/application'
         | 
| 30 | 
            +
              autoload :Base, 'web_crawler/base'
         | 
| 29 31 |  | 
| 30 32 | 
             
              include Configurable
         | 
| 31 33 | 
             
              extend Utility
         | 
| @@ -2,7 +2,35 @@ module WebCrawler | |
| 2 2 | 
             
              class Application < CLI
         | 
| 3 3 |  | 
| 4 4 | 
             
                desc "test", "Test task"
         | 
| 5 | 
            +
             | 
| 5 6 | 
             
                def test
         | 
| 7 | 
            +
                  urls = FactoryUrl.new('http://www.superjob.ru/rabota/554/veb-programmist/?from=$1', [[140]]).factory
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                  logger.info "start requests with #{urls.join(' ')} in 4 processes"
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                  targets = BatchRequest.new(urls).process
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                  logger.info "#{targets.size} targets collected"
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                  urls = Follower.new(targets, same_host: false).collect { |url| url =~ /vacancy\/\?id=\d+/ }
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                  logger.info "#{urls.size} urls collected"
         | 
| 18 | 
            +
                  logger.info "start requests with in 4 processes"
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                  puts BatchRequest.new(urls).process.inspect
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                  ""
         | 
| 23 | 
            +
                end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                desc "runner CLASS", "Run crawler class"
         | 
| 26 | 
            +
                method_option :lib, type: :array, desc: "lib directories"
         | 
| 27 | 
            +
                def runner(name)
         | 
| 28 | 
            +
                  $:.unshift './'
         | 
| 29 | 
            +
                  Array.wrap(@options[:lib]).each { |l| $:.unshift l }
         | 
| 30 | 
            +
                  require name.underscore
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  klass = name.classify.constantize
         | 
| 33 | 
            +
                  klass.run allow_format(:json, :yaml)
         | 
| 6 34 | 
             
                end
         | 
| 7 35 |  | 
| 8 36 | 
             
                desc "get <URL...>", "Get pages from passed urls"
         | 
| @@ -36,8 +64,7 @@ module WebCrawler | |
| 36 64 | 
             
                def factory(pattern, *params)
         | 
| 37 65 | 
             
                  params.map! { |param| eval(param) }
         | 
| 38 66 | 
             
                  urls = FactoryUrl.new(pattern, params)
         | 
| 39 | 
            -
                   | 
| 40 | 
            -
                  sep = options[:list] ? "\n" : ' '
         | 
| 67 | 
            +
                  sep  = options[:list] ? "\n" : ' '
         | 
| 41 68 | 
             
                  if options[:output] || options[:list]
         | 
| 42 69 | 
             
                    puts urls.factory.map { |u| u.inspect }.join(sep).gsub('"', "'")
         | 
| 43 70 | 
             
                  else
         | 
| @@ -45,5 +72,9 @@ module WebCrawler | |
| 45 72 | 
             
                  end
         | 
| 46 73 | 
             
                end
         | 
| 47 74 |  | 
| 75 | 
            +
                protected
         | 
| 76 | 
            +
                def allow_format(*allow)
         | 
| 77 | 
            +
                  allow.flatten.select { |f| f == @options[:format] }.first
         | 
| 78 | 
            +
                end
         | 
| 48 79 | 
             
              end
         | 
| 49 80 | 
             
            end
         | 
| @@ -0,0 +1,113 @@ | |
| 1 | 
            +
            require "set"
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module WebCrawler
         | 
| 4 | 
            +
              class Base
         | 
| 5 | 
            +
                class_attribute :targets, :logger, :mappers, :cache, :follower, :responses
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                self.targets, self.logger, self.mappers = Set.new, WebCrawler.config.logger, []
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                delegate :run, :to => :'self.class'
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                class << self
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                  include ActiveSupport::Callbacks
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                  def run(format=nil, format_options={ })
         | 
| 16 | 
            +
                    compile_targets
         | 
| 17 | 
            +
                    self.responses = WebCrawler::BatchRequest.new(targets.to_a).process
         | 
| 18 | 
            +
                    if format
         | 
| 19 | 
            +
                      formated(process(responses), format, format_options)
         | 
| 20 | 
            +
                    else
         | 
| 21 | 
            +
                      process(responses)
         | 
| 22 | 
            +
                    end
         | 
| 23 | 
            +
                  end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  protected
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                  def after(&block)
         | 
| 28 | 
            +
                    @after_callback = block
         | 
| 29 | 
            +
                  end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                  def compile_targets
         | 
| 32 | 
            +
                    following    = targets.select { |target| target.is_a?(Array) && target.first.is_a?(Proc) }
         | 
| 33 | 
            +
                    self.targets = targets - following
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    following.each do |target|
         | 
| 36 | 
            +
                      target.first.call(target.last)
         | 
| 37 | 
            +
                    end
         | 
| 38 | 
            +
                  end
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                  def log_to(logger_or_path)
         | 
| 41 | 
            +
                    case logger_or_path
         | 
| 42 | 
            +
                      when Logger
         | 
| 43 | 
            +
                        WebCrawler.config.logger = self.logger = logger_or_path
         | 
| 44 | 
            +
                      when nil
         | 
| 45 | 
            +
                        WebCrawler.config.logger = self.logger = Logger.new('/dev/null')
         | 
| 46 | 
            +
                      else
         | 
| 47 | 
            +
                        WebCrawler.config.logger = self.logger = Logger.new(logger_or_path)
         | 
| 48 | 
            +
                    end
         | 
| 49 | 
            +
                  end
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                  def cache_to(path_or_cache_adapter)
         | 
| 52 | 
            +
                    adapter = nil
         | 
| 53 | 
            +
                    adapter = path_or_cache_adapter if path_or_cache_adapter.is_a? WebCrawler::CacheAdapter::Base
         | 
| 54 | 
            +
                    adapter = WebCrawler::CacheAdapter::File.new(path_or_cache_adapter) if File.directory? path_or_cache_adapter
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                    WebCrawler.configure do
         | 
| 57 | 
            +
                      config.cache.adapter = adapter
         | 
| 58 | 
            +
                    end if adapter
         | 
| 59 | 
            +
                  end
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                  def follow(*targets)
         | 
| 62 | 
            +
                    options   = targets.extract_options!
         | 
| 63 | 
            +
                    responses = WebCrawler::BatchRequest.new(targets).process
         | 
| 64 | 
            +
                    self.target WebCrawler::Follower.new(responses, options).collect
         | 
| 65 | 
            +
                  end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                  def context(selector, name=selector, &block)
         | 
| 68 | 
            +
                    mapper = WebCrawler::Parsers::Mapper.new(name, self, selector)
         | 
| 69 | 
            +
                    if block.arity.zero?
         | 
| 70 | 
            +
                      mapper.instance_exec(&block)
         | 
| 71 | 
            +
                    else
         | 
| 72 | 
            +
                      mapper.callback(&block)
         | 
| 73 | 
            +
                    end
         | 
| 74 | 
            +
                    self.mappers += [mapper]
         | 
| 75 | 
            +
                  end
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                  def target(*targets, &block)
         | 
| 78 | 
            +
                    options = targets.extract_options!
         | 
| 79 | 
            +
                    unless options.empty?
         | 
| 80 | 
            +
                      raise ArgumentError, 'target accept only one pattern if options given' if targets.size > 1
         | 
| 81 | 
            +
                      targets = generate_urls(targets.first, options)
         | 
| 82 | 
            +
                    end
         | 
| 83 | 
            +
                    if block_given?
         | 
| 84 | 
            +
                      self.targets << [block, targets]
         | 
| 85 | 
            +
                    else
         | 
| 86 | 
            +
                      self.targets += targets.flatten
         | 
| 87 | 
            +
                    end
         | 
| 88 | 
            +
                  end
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                  def generate_urls(pattern, options)
         | 
| 91 | 
            +
                    WebCrawler::FactoryUrl.new(pattern, options).factory
         | 
| 92 | 
            +
                  end
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                  def formated(data, format, options)
         | 
| 95 | 
            +
                    require "active_support/core_ext/string"
         | 
| 96 | 
            +
                    WebCrawler::View.factory(format, data, options).render
         | 
| 97 | 
            +
                  end
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                  def process(responses)
         | 
| 100 | 
            +
                    return responses.map(&:to_s) if mappers.empty?
         | 
| 101 | 
            +
                    
         | 
| 102 | 
            +
                    { }.tap do |results|
         | 
| 103 | 
            +
                      mappers.each do |mapper|
         | 
| 104 | 
            +
                        results[mapper.name] = responses.map do |response|
         | 
| 105 | 
            +
                          mapper.collect(response)
         | 
| 106 | 
            +
                        end.flatten
         | 
| 107 | 
            +
                      end
         | 
| 108 | 
            +
                    end
         | 
| 109 | 
            +
                  end
         | 
| 110 | 
            +
                end
         | 
| 111 | 
            +
             | 
| 112 | 
            +
              end
         | 
| 113 | 
            +
            end
         | 
| @@ -1,10 +1,15 @@ | |
| 1 | 
            +
            require "parallel"
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            module WebCrawler
         | 
| 2 4 |  | 
| 5 | 
            +
              # Usage:
         | 
| 6 | 
            +
              #  BatchRequest.new(urls).process #=> array of Responses
         | 
| 7 | 
            +
              #
         | 
| 3 8 | 
             
              class BatchRequest
         | 
| 4 9 |  | 
| 5 10 | 
             
                attr_reader :urls, :responses, :requests
         | 
| 6 11 | 
             
                attr_writer :requests
         | 
| 7 | 
            -
             | 
| 12 | 
            +
             | 
| 8 13 | 
             
                include Enumerable
         | 
| 9 14 |  | 
| 10 15 | 
             
                def initialize(*urls)
         | 
| @@ -19,10 +24,11 @@ module WebCrawler | |
| 19 24 | 
             
                  if @handler
         | 
| 20 25 | 
             
                    block_given? ? yield(@handler.process) : @handler.process
         | 
| 21 26 | 
             
                  else
         | 
| 22 | 
            -
                     | 
| 27 | 
            +
                    ready = requests.select{|r| r.ready? }
         | 
| 28 | 
            +
                    @responses ||= Parallel.map(requests - ready) do |req|
         | 
| 23 29 | 
             
                      WebCrawler.logger.info "start request to #{req.url.to_s}"
         | 
| 24 30 | 
             
                      block_given? ? yield(req.process) : req.process
         | 
| 25 | 
            -
                    end
         | 
| 31 | 
            +
                    end.compact + ready.map(&:process)
         | 
| 26 32 | 
             
                  end
         | 
| 27 33 | 
             
                end
         | 
| 28 34 |  | 
| @@ -57,7 +63,7 @@ module WebCrawler | |
| 57 63 | 
             
                end
         | 
| 58 64 |  | 
| 59 65 | 
             
                def request_class
         | 
| 60 | 
            -
                   | 
| 66 | 
            +
                  !@options[:no_cached] && WebCrawler.config.cache.adapter.is_a?(WebCrawler::CacheAdapter::Base) ? CachedRequest : Request
         | 
| 61 67 | 
             
                end
         | 
| 62 68 | 
             
              end
         | 
| 63 69 |  | 
| @@ -5,23 +5,32 @@ module WebCrawler | |
| 5 5 |  | 
| 6 6 | 
             
                def initialize(url, options = { })
         | 
| 7 7 | 
             
                  super(url)
         | 
| 8 | 
            -
                  @cache = options[:cache] || WebCrawler.config. | 
| 8 | 
            +
                  @cache = options[:cache] || WebCrawler.config.cache.adapter
         | 
| 9 | 
            +
                  @ready = true if @cache.exist? url
         | 
| 9 10 | 
             
                end
         | 
| 10 11 |  | 
| 11 12 | 
             
                def process
         | 
| 12 | 
            -
                  cached do
         | 
| 13 | 
            +
                  @response || cached do
         | 
| 13 14 | 
             
                    Response.new *fetch(url)
         | 
| 14 15 | 
             
                  end
         | 
| 15 16 | 
             
                end
         | 
| 16 17 |  | 
| 17 18 | 
             
                protected
         | 
| 18 19 |  | 
| 20 | 
            +
                def load_response
         | 
| 21 | 
            +
                  @response = @cache.get url
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                def put_response(response)
         | 
| 25 | 
            +
                  @response = @cache.put(response)
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
             | 
| 19 28 | 
             
                def cached
         | 
| 20 | 
            -
                   | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 29 | 
            +
                  if @cache.exist? url
         | 
| 30 | 
            +
                    load_response
         | 
| 31 | 
            +
                  else
         | 
| 32 | 
            +
                    put_response(yield)
         | 
| 33 | 
            +
                  end
         | 
| 25 34 | 
             
                  @response
         | 
| 26 35 | 
             
                end
         | 
| 27 36 |  | 
| @@ -3,8 +3,8 @@ require "logger" | |
| 3 3 | 
             
            module WebCrawler
         | 
| 4 4 | 
             
              class BaseConfiguration
         | 
| 5 5 |  | 
| 6 | 
            -
                def initialize(options = {})
         | 
| 7 | 
            -
                  @@options ||= {}
         | 
| 6 | 
            +
                def initialize(options = { })
         | 
| 7 | 
            +
                  @@options ||= { }
         | 
| 8 8 | 
             
                  @@options.merge! options
         | 
| 9 9 | 
             
                end
         | 
| 10 10 |  | 
| @@ -15,7 +15,7 @@ module WebCrawler | |
| 15 15 | 
             
                def config
         | 
| 16 16 | 
             
                  self
         | 
| 17 17 | 
             
                end
         | 
| 18 | 
            -
             | 
| 18 | 
            +
             | 
| 19 19 | 
             
                private
         | 
| 20 20 |  | 
| 21 21 | 
             
                def method_missing(name, *args, &blk)
         | 
| @@ -48,7 +48,7 @@ module WebCrawler | |
| 48 48 | 
             
                end
         | 
| 49 49 |  | 
| 50 50 | 
             
                def cache(&block)
         | 
| 51 | 
            -
                  @cache ||= BaseConfiguration.new | 
| 51 | 
            +
                  @cache ||= BaseConfiguration.new(expire_within: 60, adapter: self.cache_adapter)
         | 
| 52 52 | 
             
                  if block_given?
         | 
| 53 53 | 
             
                    @cache.instance_eval(block)
         | 
| 54 54 | 
             
                  else
         | 
| @@ -58,7 +58,7 @@ module WebCrawler | |
| 58 58 |  | 
| 59 59 | 
             
                def logger
         | 
| 60 60 | 
             
                  @logger ||= Logger.new(STDOUT).tap do |log|
         | 
| 61 | 
            -
             | 
| 61 | 
            +
                    log.level = Logger.const_get log_level.to_s.upcase
         | 
| 62 62 | 
             
                  end
         | 
| 63 63 | 
             
                end
         | 
| 64 64 |  | 
| @@ -1,4 +1,13 @@ | |
| 1 1 | 
             
            module WebCrawler
         | 
| 2 | 
            +
              #
         | 
| 3 | 
            +
              # p = FactoryUrl.new "http://www.somehost.com/:second/:first/", :first => 0..10, :second => "a".."z"
         | 
| 4 | 
            +
              # p.urls #=> ["http://www.somehost.com/a/1",
         | 
| 5 | 
            +
              #        #    "http://www.somehost.com/b/1",
         | 
| 6 | 
            +
              #        #    "http://www.somehost.com/c/1",
         | 
| 7 | 
            +
              #        #    ...
         | 
| 8 | 
            +
              #        #    "http://www.somehost.com/x/10",
         | 
| 9 | 
            +
              #        #    "http://www.somehost.com/y/10",
         | 
| 10 | 
            +
              #        #    "http://www.somehost.com/z/10/"]
         | 
| 2 11 | 
             
              #
         | 
| 3 12 | 
             
              # p = FactoryUrl.new "http://www.somehost.com/$1/$2?param=$3", 0..10, "a".."z", [3,7,34,876,92]
         | 
| 4 13 | 
             
              # p.urls #=> ["http://www.somehost.com/1/a?param=3",
         | 
| @@ -8,6 +17,7 @@ module WebCrawler | |
| 8 17 | 
             
              #        #    "http://www.somehost.com/10/x?param=34",
         | 
| 9 18 | 
             
              #        #    "http://www.somehost.com/10/y?param=876",
         | 
| 10 19 | 
             
              #        #    "http://www.somehost.com/10/z?param=92"]
         | 
| 20 | 
            +
              #
         | 
| 11 21 | 
             
              # p = FactoryUrl.new 0..10, "a".."z", [3,7,34,876,92] do |first, second, third|
         | 
| 12 22 | 
             
              #   "http://www.somehost.com/#{first}/#{second}?param=#{third}"
         | 
| 13 23 | 
             
              # end
         | 
| @@ -18,21 +28,31 @@ module WebCrawler | |
| 18 28 | 
             
                attr_reader :urls, :params, :pattern
         | 
| 19 29 |  | 
| 20 30 | 
             
                def initialize(*args, &block)
         | 
| 31 | 
            +
                  @options = args.extract_options!
         | 
| 21 32 | 
             
                  if block_given?
         | 
| 22 33 | 
             
                    @block = block
         | 
| 23 34 | 
             
                  else
         | 
| 24 35 | 
             
                    @pattern = args.shift
         | 
| 25 36 | 
             
                    raise ArgumentError, "first argument must be an url pattern(String)" unless pattern.is_a? String
         | 
| 26 37 | 
             
                  end
         | 
| 27 | 
            -
                   | 
| 38 | 
            +
                  
         | 
| 39 | 
            +
                  if @options.empty?
         | 
| 40 | 
            +
                    @params = normalize_arguments(args)
         | 
| 41 | 
            +
                  else
         | 
| 42 | 
            +
                    values, keys = @options.values.map(&:to_a), @options.keys
         | 
| 43 | 
            +
                    values = values.shift.product(*values)
         | 
| 44 | 
            +
                    @params = values.map{|a| Hash[keys.zip(a)]}
         | 
| 45 | 
            +
                  end
         | 
| 28 46 | 
             
                end
         | 
| 29 47 |  | 
| 30 48 | 
             
                def factory
         | 
| 31 | 
            -
                  if pattern
         | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 49 | 
            +
                  @urls ||= if pattern && params.first.is_a?(Hash)
         | 
| 50 | 
            +
                              params.map { |opts| pattern.gsub(/:([a-z_]+)/) { opts[$1.to_sym] } }
         | 
| 51 | 
            +
                            elsif pattern
         | 
| 52 | 
            +
                              params.map { |opts| pattern.gsub(/\$(\d+)/) { opts[$1.to_i - 1] } }
         | 
| 53 | 
            +
                            else
         | 
| 54 | 
            +
                              params.map { |opts| @block.call *opts }
         | 
| 55 | 
            +
                            end
         | 
| 36 56 | 
             
                end
         | 
| 37 57 |  | 
| 38 58 | 
             
                def each
         | 
| @@ -46,7 +66,7 @@ module WebCrawler | |
| 46 66 | 
             
                def normalize_arguments(args)
         | 
| 47 67 | 
             
                  args = args.first if args.size == 1 && args.first.is_a?(Enumerable)
         | 
| 48 68 | 
             
                  args.shift if args.first.is_a? String
         | 
| 49 | 
            -
                  params | 
| 69 | 
            +
                  params  = args.map { |arg| convert_to_a(arg) }
         | 
| 50 70 | 
             
                  @params = params.shift.product(*params)
         | 
| 51 71 | 
             
                end
         | 
| 52 72 |  | 
    
        data/lib/web_crawler/follower.rb
    CHANGED
    
    | @@ -1,26 +1,28 @@ | |
| 1 1 | 
             
            class WebCrawler::Follower
         | 
| 2 2 |  | 
| 3 | 
            +
              attr_reader :options
         | 
| 4 | 
            +
             | 
| 3 5 | 
             
              def initialize(*responses)
         | 
| 4 | 
            -
                @options   = responses. | 
| 6 | 
            +
                @options   = responses.extract_options!
         | 
| 5 7 | 
             
                @responses = responses.flatten
         | 
| 6 8 | 
             
              end
         | 
| 7 9 |  | 
| 8 | 
            -
              def process(options = {})
         | 
| 10 | 
            +
              def process(options = { })
         | 
| 9 11 | 
             
                WebCrawler::BatchRequest.new(collect, options).process
         | 
| 10 12 | 
             
              end
         | 
| 11 13 |  | 
| 12 14 | 
             
              def follow(response)
         | 
| 13 | 
            -
                @responses  | 
| 15 | 
            +
                @responses += Array.wrap(response)
         | 
| 14 16 | 
             
                self
         | 
| 15 17 | 
             
              end
         | 
| 16 18 |  | 
| 17 | 
            -
              def collect
         | 
| 18 | 
            -
                @responses.map do |response|
         | 
| 19 | 
            +
              def collect(&block)
         | 
| 20 | 
            +
                urls = @responses.map do |response|
         | 
| 19 21 | 
             
                  parser = WebCrawler::Parsers::Url.new(response.url.host, url: response.url.request_uri, same_host: @options[:same_host])
         | 
| 20 | 
            -
                  parser.parse(response.body) | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
                 | 
| 22 | 
            +
                  parser.parse(response.body, &block)
         | 
| 23 | 
            +
                end.flatten
         | 
| 24 | 
            +
                urls = urls.select { |url| url =~ @options[:only] } if @options[:only]
         | 
| 25 | 
            +
                urls
         | 
| 24 26 | 
             
              end
         | 
| 25 27 |  | 
| 26 28 | 
             
            end
         |