web_crawler 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/.gitignore +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +11 -0
  4. data/README +1 -0
  5. data/Rakefile +2 -0
  6. data/bin/wcrawler +13 -0
  7. data/lib/ext/array.rb +100 -0
  8. data/lib/ext/hash.rb +45 -0
  9. data/lib/ext/http_response.rb +19 -0
  10. data/lib/web_crawler/application.rb +49 -0
  11. data/lib/web_crawler/batch_request.rb +63 -0
  12. data/lib/web_crawler/cache_adapter/base.rb +33 -0
  13. data/lib/web_crawler/cache_adapter/file.rb +52 -0
  14. data/lib/web_crawler/cache_adapter/memory.rb +23 -0
  15. data/lib/web_crawler/cache_adapter.rb +11 -0
  16. data/lib/web_crawler/cached_request.rb +30 -0
  17. data/lib/web_crawler/cli/thor_hooks.rb +94 -0
  18. data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
  19. data/lib/web_crawler/cli.rb +122 -0
  20. data/lib/web_crawler/configuration.rb +87 -0
  21. data/lib/web_crawler/factory_url.rb +58 -0
  22. data/lib/web_crawler/follower.rb +26 -0
  23. data/lib/web_crawler/handler.rb +45 -0
  24. data/lib/web_crawler/parsers/url.rb +52 -0
  25. data/lib/web_crawler/parsers.rb +5 -0
  26. data/lib/web_crawler/request.rb +59 -0
  27. data/lib/web_crawler/response.rb +45 -0
  28. data/lib/web_crawler/utility.rb +65 -0
  29. data/lib/web_crawler/version.rb +9 -0
  30. data/lib/web_crawler/view/csv.rb +20 -0
  31. data/lib/web_crawler/view/json.rb +9 -0
  32. data/lib/web_crawler/view/plain.rb +9 -0
  33. data/lib/web_crawler/view/runner.rb +20 -0
  34. data/lib/web_crawler/view/table.rb +69 -0
  35. data/lib/web_crawler/view/xml.rb +38 -0
  36. data/lib/web_crawler/view.rb +44 -0
  37. data/lib/web_crawler.rb +38 -0
  38. data/spec/fake_web_generator.rb +44 -0
  39. data/spec/spec_helper.rb +17 -0
  40. data/spec/web_crawler/batch_request_spec.rb +45 -0
  41. data/spec/web_crawler/cached_request_spec.rb +31 -0
  42. data/spec/web_crawler/factory_url_spec.rb +34 -0
  43. data/spec/web_crawler/follow_spec.rb +32 -0
  44. data/spec/web_crawler/request_spec.rb +29 -0
  45. data/spec/web_crawler/response_spec.rb +27 -0
  46. data/spec/web_crawler/url_parser_spec.rb +41 -0
  47. data/spec/web_crawler/view_spec.rb +95 -0
  48. data/web_crawler.gemspec +30 -0
  49. metadata +151 -0
require 'thor'
require 'thor/actions'
require 'pathname'
require 'web_crawler/cli/thor_hooks'
require 'web_crawler/cli/thor_inherited_options'

module WebCrawler
  # Thor-based command line interface. Adds before/render hooks
  # (Thor::Hooks) and inherited class options (Thor::InheritedOptions)
  # on top of plain Thor.
  class CLI < Thor
    include Thor::Actions
    include Thor::Hooks
    include Thor::InheritedOptions

    default_task :help

    # Global options shared by every task.
    class_option :format, type: :string, desc: "output format [json, xml, csv]", default: 'plain'
    class_option :json, type: :boolean, desc: "json output format. shortcut for --format json"
    class_option :xml, type: :boolean, desc: "xml output format. shortcut for --format xml"
    class_option :csv, type: :boolean, desc: "csv output format. shortcut for --format csv"
    class_option :table, type: :boolean, desc: "table output format. shortcut for --format table"
    class_option :cached, type: :boolean, desc: "use cached requests. if ./tmp/cache exists use it for cache files"
    class_option :follow, type: :boolean, desc: "follow to urls on the pages"
    class_option :run, type: :string, desc: "run custom script with api access"
    class_option :log, type: :string, desc: "log file path"

    # Normalizes the --json/--xml/--csv/--table shortcuts into
    # @options[:format] and configures the logger before every task
    # except :help.
    before_action except: :help do
      @options = options.dup
      @options[:format] = 'json' if options[:json]
      @options[:format] = 'xml' if options[:xml]
      @options[:format] = 'csv' if options[:csv]
      @options[:format] = 'table' if options[:table]
      # NOTE(review): no :plain class_option is declared above, so this
      # branch looks unreachable — confirm against Thor option parsing.
      @options[:format] = 'plain' if options[:plain]

      # With --run, remember the requested format and switch to the
      # runner view, which evaluates the user-supplied script.
      @options[:original_format] = @options[:format] if options[:run]
      @options[:format] = 'runner' if options[:run]


      WebCrawler.config.logger = Logger.new(@options['log']) if @options['log']
      # NOTE(review): level is forced to DEBUG regardless of any
      # configured log level — confirm this is intended.
      WebCrawler.config.logger.level = Logger::DEBUG
      WebCrawler.config.logger.datetime_format = "%d-%m-%Y %H:%M:%S"
      WebCrawler.config.severity_colors = { 'DEBUG' => :magenta,
                                            'INFO' => :green,
                                            'WARN' => :blue,
                                            'ERROR' => :red,
                                            'FATAL' => :red,
                                            'ANY' => :yellow }

      # Colorized one-line log format: "[LEVEL] [timestamp] pid NNN -- msg".
      # NOTE(review): `send(color, ...)` relies on the color helpers below
      # being visible from the formatter proc's self — confirm the hook's
      # evaluation context provides them.
      WebCrawler.config.logger.formatter = proc { |severity, datetime, _, msg|
        color = WebCrawler.config.severity_colors[severity]

        send(color, ("[#{severity}] ").ljust(8)) << "[#{datetime}] " << "pid #{$$} " << "-- #{msg}\n"
      }
    end

    # Draws every task's return value through the view layer (except :help).
    render except: :help do |response, options|
      WebCrawler::View.factory(options[:format], response, options).draw
    end


    # Prints help for a single task, or the global help when no task given.
    def help(task = nil)
      if task
        self.class.task_help(shell, task)
      else
        self.class.help shell
      end
    end

    protected

    # ANSI escape helpers used by the log formatter above.
    def color(text, color_code)
      "#{color_code}#{text}\e[0m"
    end

    def bold(text)
      color(text, "\e[1m")
    end

    def white(text)
      color(text, "\e[37m")
    end

    def green(text)
      color(text, "\e[32m")
    end

    def red(text)
      color(text, "\e[31m")
    end

    def magenta(text)
      color(text, "\e[35m")
    end

    def yellow(text)
      color(text, "\e[33m")
    end

    def blue(text)
      color(text, "\e[34m")
    end

    def grey(text)
      color(text, "\e[90m")
    end

    # NOTE(review): padding widths may have been collapsed by extraction —
    # verify against the released gem source.
    def short_padding
      ' '
    end

    def long_padding
      ' '
    end

    def logger
      WebCrawler.logger
    end

    # @options with symbol keys, memoized.
    def symbolized_options
      @symbolized_options ||= Hash[@options.keys.zip(@options.values)].symbolize_keys
    end

  end
end
require "logger"

module WebCrawler
  # Open-ended option container: unknown getters/setters are routed to a
  # shared options hash via method_missing.
  class BaseConfiguration

    def initialize(options = {})
      # NOTE(review): @@options is a class variable, so ALL BaseConfiguration
      # (and subclass) instances share a single option store — confirm that
      # this sharing is intended.
      @@options ||= {}
      @@options.merge! options
    end

    # Accept the standard two-argument form so respond_to?(:x, true)
    # doesn't raise ArgumentError (the original override took one arg).
    def respond_to?(name, include_private = false)
      super || @@options.key?(name.to_sym)
    end

    def config
      self
    end

    private

    # "name=" writes to the option store, a known name reads from it,
    # anything else falls through to NoMethodError.
    def method_missing(name, *args, &blk)
      if name.to_s =~ /=$/
        @@options[$`.to_sym] = args.first
      elsif @@options.key?(name)
        @@options[name]
      else
        super
      end
    end
  end

  # Concrete configuration: cache adapter selection, logging, cache options.
  class Configuration < BaseConfiguration

    attr_accessor :cache_adapter, :log_level, :logger, :root, :cache

    # Working directory used as the root for cache paths.
    def root
      @root ||= FileUtils.pwd
    end

    # Picks a file-backed cache when ./tmp/cache exists, memory otherwise.
    def cache_adapter
      @cache_adapter ||= begin
        if File.exist?("#{root}/tmp/cache/")
          WebCrawler::CacheAdapter::File.new "#{root}/tmp/cache/"
        else
          WebCrawler::CacheAdapter::Memory.new
        end
      end
    end

    # Without a block, returns the cache sub-configuration (default
    # expire_within: 60). With a block, evaluates it in the sub-config's
    # context so callers can write `cache { self.expire_within = 120 }`.
    def cache(&block)
      @cache ||= BaseConfiguration.new expire_within: 60
      if block_given?
        # BUG FIX: instance_eval(block) passed the Proc as a positional
        # argument (instance_eval expects a String there) and raised
        # TypeError; the block must be passed with &.
        @cache.instance_eval(&block)
      else
        @cache
      end
    end

    # STDOUT logger, level taken from #log_level.
    def logger
      @logger ||= Logger.new(STDOUT).tap do |log|
        log.level = Logger.const_get log_level.to_s.upcase
      end
    end

    def log_level
      @log_level ||= :debug
    end

  end

  # Mix into a module/class to get a `config` accessor and `configure` DSL.
  module Configurable
    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      def configure(&block)
        module_eval(&block)
      end

      def config
        @config ||= Configuration.new
      end

    end
  end
end
module WebCrawler
  #
  # Generates URL lists from a pattern plus value ranges, either via
  # $1/$2/... placeholders:
  #
  #   p = FactoryUrl.new "http://www.somehost.com/$1/$2?param=$3", 0..10, "a".."z", [3,7,34,876,92]
  #   p.urls #=> ["http://www.somehost.com/1/a?param=3", ...]
  #
  # or via a block receiving one value per argument list:
  #
  #   p = FactoryUrl.new 0..10, "a".."z", [3,7,34,876,92] do |first, second, third|
  #     "http://www.somehost.com/#{first}/#{second}?param=#{third}"
  #   end
  #
  class FactoryUrl
    include Enumerable

    attr_reader :urls, :params, :pattern

    def initialize(*args, &block)
      if block_given?
        @block = block
      else
        @pattern = args.shift
        unless pattern.is_a?(String)
          raise ArgumentError, "first argument must be an url pattern(String)"
        end
      end
      @params = normalize_arguments(args)
    end

    # Builds (and memoizes) the URL list from the cartesian product of the
    # argument lists.
    def factory
      @urls ||= if pattern
                  params.map { |values| pattern.gsub(/\$(\d+)/) { values[$1.to_i - 1] } }
                else
                  params.map { |values| @block.call(*values) }
                end
    end

    # Yields each generated URL; regenerates the list on every call.
    def each
      @urls = nil
      factory.each { |url| yield url }
    end

    protected

    # Flattens a single Enumerable argument, drops a leading String (the
    # pattern), converts each remaining argument to an Array, and takes the
    # cartesian product of all of them.
    def normalize_arguments(args)
      args = args.first if args.size == 1 && args.first.is_a?(Enumerable)
      args.shift if args.first.is_a?(String)
      lists = args.map { |arg| convert_to_a(arg) }
      @params = lists.shift.product(*lists)
    end

    # Coerces ranges and scalars alike into Arrays (scalars become [x]).
    def convert_to_a(arg)
      arg = arg.to_a rescue arg
      [*arg]
    end
  end
end
# Walks a set of responses, extracting every link on each page so the
# crawler can fetch the next level as a new batch.
class WebCrawler::Follower

  def initialize(*responses)
    @options = if responses.last.is_a?(Hash)
                 responses.pop
               else
                 { }
               end
    @responses = responses.flatten
  end

  # Fetches every collected link as a new batch of requests.
  def process(options = {})
    WebCrawler::BatchRequest.new(collect, options).process
  end

  # Adds one more response to the set; returns self for chaining.
  def follow(response)
    @responses.push(response)
    self
  end

  # Returns one array of normalized link URLs per response.
  def collect
    @responses.map do |resp|
      link_parser = WebCrawler::Parsers::Url.new(resp.url.host,
                                                 url: resp.url.request_uri,
                                                 same_host: @options[:same_host])
      link_parser.parse(resp.body) { |link| link }
    end
  end

end
module WebCrawler

  # Applies a handler block to every response in either a BatchRequest or a
  # plain list of responses.
  class Handler

    def initialize(*responses_or_request, &handler)
      @handler = handler
      if responses_or_request.first.is_a?(BatchRequest)
        @target = responses_or_request.first
      else
        @target = responses_or_request.flatten
      end
    end

    # Maps the handler over the target; memoized after the first call.
    def process
      @result ||= @target.map do |response|
        @handler.call(response, @target)
      end
    end

  end

  # Handler whose callback delegates each response to a parser object.
  class HandlerParser < Handler
    def initialize(parser, *responses_or_request)
      @parser = load_parser(parser)
      super(*responses_or_request, &lambda { |response, *| @parser.parse(response) })
    end

    protected

    # Resolves +parser+ to an object responding to :parse. A String is
    # const_get'd; on NameError the underscored file is required from the
    # current directory and the lookup retried ONCE.
    #
    # BUG FIX: the original retried unconditionally — if the require
    # succeeded (or was already loaded) but the constant was still missing,
    # const_get raised NameError again and the rescue/retry looped forever.
    def load_parser(parser)
      required = false
      begin
        case parser
        when String
          Object.const_get parser
        else
          parser.respond_to?(:parse) ? parser : raise(LoadParserError, 'Parser must respond to :parse')
        end
      rescue NameError
        raise if required
        required = true
        $:.unshift File.expand_path('./')
        require WebCrawler.underscore(parser)
        retry
      end
    end

  end
end
@@ -0,0 +1,52 @@
1
+ class WebCrawler::Parsers::Url
2
+
3
+ attr_reader :host, :scheme
4
+
5
+ def initialize(host, options = { })
6
+ @scheme = options[:secure] ? 'https' : 'http'
7
+ @host = URI.parse(normalize_host(host.to_s))
8
+ @scheme = @host.scheme
9
+ @options = options
10
+ set_current_page
11
+ end
12
+
13
+ def parse(response, &filter)
14
+ (Hpricot(response.to_s) / "a").map do |a|
15
+ normalize(a["href"]).tap do |url|
16
+ url = filter.call(url) if url && filter
17
+ end
18
+ end.compact.uniq
19
+ end
20
+
21
+ def normalize(url)
22
+ if url[/^(:?#{@host.scheme}|https|)\:\/\/#{@host.host}/]
23
+ normalize_host(url)
24
+ elsif url == '#'
25
+ nil
26
+ else
27
+ (url[0] == '/' || url[0] == '?' || url[0] == '#') ? join(url).to_s : (@options[:same_host] ? nil : url)
28
+ end
29
+ end
30
+
31
+ protected
32
+
33
+ def set_current_page
34
+ @current_url = join(@options[:page] || @options[:url] || @options[:path] || '/', @host)
35
+ end
36
+
37
+ def normalize_host(host, scheme = @scheme)
38
+ host[0..3] == 'http' ? host : "#{scheme}://" + host
39
+ end
40
+
41
+ def join(request_uri, host = @current_url)
42
+ return host.dup unless request_uri
43
+ host.dup.tap do |u|
44
+ path_with_query, anchor = request_uri.split('#')
45
+ path, query = path_with_query.split('?')
46
+ u.send(:set_fragment, anchor)
47
+ u.send(:set_query, query) if query && !query.empty?
48
+ u.send(:set_path, path) if path && !path.empty?
49
+ end
50
+ end
51
+
52
+ end
require "hpricot"

# Namespace for HTML/URL parsers; Url is autoloaded on first reference.
module WebCrawler::Parsers
  autoload :Url, 'web_crawler/parsers/url'
end
module WebCrawler

  # Performs an HTTP GET for a single URL, following up to 3 redirects, and
  # wraps the final result in a WebCrawler::Response.
  class Request

    # Browser-like default headers sent with every request.
    HEADERS = {
      'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
      'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language' => 'en-us,en;q=0.5',
      'Accept-Charset' => 'utf-8;windows-1251;q=0.7,*;q=0.7',
      'Cache-Control' => 'max-age=0'
    }

    attr_reader :url, :response

    def initialize(url)
      @url, @request = normalize_url(url), { }
      @headers = HEADERS.dup
    end

    # Executes the request and stores the wrapped response.
    def process
      @response = Response.new(*fetch(url))
    end

    def inspect
      "#<#{self.class}:#{self.object_id} @url=\"#{@url.to_s}\">"
    end

    protected

    # Returns a Net::HTTP client for host/port, reusing a cached one.
    # BUG FIX: the original assigned with `=`, so the @request cache was
    # written on every call but never read; `||=` actually reuses clients.
    def request_for(host, port = nil)
      @request[[host, port]] ||= Net::HTTP.new(host, port) #.tap { |http| http.set_debug_output(STDERR) }
    end

    # Parses +url+, defaulting the scheme to http://.
    # NOTE(review): on URI::Error this logs and returns nil, which makes the
    # subsequent fetch fail with NoMethodError — confirm desired behavior.
    def normalize_url(url)
      URI.parse(url.index("http") == 0 ? url : "http://" + url)
    rescue URI::Error
      WebCrawler.logger.debug "#{url} bad URI(is not URI?)"
    end

    # Follows redirects (carrying Set-Cookie along) and returns
    # [final_uri, Net::HTTPResponse].
    def fetch(uri, limit = 3, redirect_path = nil)
      # BUG FIX: the original interpolated the undefined `redirected_from`,
      # so exceeding the limit raised NameError instead of ArgumentError.
      raise ArgumentError, "HTTP redirect too deep. #{redirect_path} => #{uri}" if limit <= 0
      response = request_for(uri.host, uri.port).get(uri.request_uri, headers)
      case response
      when Net::HTTPRedirection then
        @headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie']
        fetch(normalize_url(response['location']), limit - 1, [redirect_path, uri])
      else
        # redirect_path accessor comes from ext/http_response.rb.
        response.redirect_path = redirect_path if redirect_path
        [uri, response]
      end
    end

    def headers
      @headers
    end

  end

end
module WebCrawler
  # Thin wrapper around a Net::HTTPResponse that remembers the request URL
  # and pre-computes Date/Expires timestamps for cache-freshness checks.
  class Response
    extend ::Forwardable

    def_delegators :@response, :body, :http_version, :code, :message, :msg,
                   :code_type, :[], :redirect_path, :redirect?

    attr_reader :url, :expire, :date, :cached

    def initialize(url, response)
      unless response.is_a?(Net::HTTPResponse)
        raise ArgumentError, "response must be a Net::HTTPResponse, but #{response.class} given"
      end
      @url = url
      @response = response
      # Missing/unparsable headers fall back to "now".
      @date = Time.parse(self['Date']) rescue Time.now
      @expire ||= Time.parse(self['Expires']) rescue Time.now
    end

    # Marks this response as served from cache (shows up in #inspect).
    def set_cached_flag
      @cached = ' CACHED'
    end

    # Stale when the response date has reached its expiry.
    def foul?
      date >= expire
    end

    def success?
      @response.is_a?(Net::HTTPSuccess)
    end

    def failure?
      !success?
    end

    def inspect
      out = "#<#{self.class}::0x#{object_id.to_s(16).rjust(14, '0')}#{@cached} "
      out << "#{type} #{code} #{message} #{@url}"
      out << " redirect path: \"#{redirect_path.join(', ')}\"" if redirect?
      out << ">"
    end

    alias :to_s :body

    # The concrete Net::HTTPResponse subclass of the wrapped response.
    def type
      @response.class
    end
  end
end
module WebCrawler
  # String inflection helpers (camelize/underscore/dasherize/demodulize),
  # modeled on ActiveSupport's inflector.
  #
  # FIX: declared with nested modules instead of `module WebCrawler::Utility`,
  # which raised NameError when WebCrawler was not yet defined (load-order
  # fragility).
  module Utility
    extend self

    # Converts strings to UpperCamelCase ('/' becomes '::'); with
    # +first_letter_in_uppercase+ = false, produces lowerCamelCase.
    #
    #   "active_record".camelize            # => "ActiveRecord"
    #   "active_record".camelize(:lower)    # => "activeRecord"
    #   "active_record/errors".camelize     # => "ActiveRecord::Errors"
    #
    # Roughly the inverse of +underscore+, though not always:
    #   "SSLError".underscore.camelize      # => "SslError"
    def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
      if first_letter_in_uppercase
        lower_case_and_underscored_word.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
      else
        lower_case_and_underscored_word.to_s[0].chr.downcase + camelize(lower_case_and_underscored_word)[1..-1]
      end
    end

    # Makes an underscored, lowercase form ('::' becomes '/').
    #
    #   "ActiveRecord".underscore           # => "active_record"
    #   "ActiveRecord::Errors".underscore   # => "active_record/errors"
    def underscore(camel_cased_word)
      word = camel_cased_word.to_s.dup
      word.gsub!(/::/, '/')
      word.gsub!(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
      word.gsub!(/([a-z\d])([A-Z])/, '\1_\2')
      word.tr!("-", "_")
      word.downcase!
      word
    end

    # Replaces underscores with dashes: "puni_puni" # => "puni-puni".
    # (tr is the idiomatic/faster choice for char-for-char replacement.)
    def dasherize(underscored_word)
      underscored_word.tr('_', '-')
    end

    # Removes the module part: "A::B::Inflections" # => "Inflections".
    def demodulize(class_name_in_module)
      class_name_in_module.to_s.gsub(/^.*::/, '')
    end
  end
end
module WebCrawler
  # Gem version, assembled from its numeric components.
  module VERSION
    MAJOR = 0
    MINOR = 2
    TINY  = 0

    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
require 'csv'

module WebCrawler::View
  # Renders the input as CSV rows, optionally prefixed with a header row.
  class Csv < Base
    # input   - array of rows (Hashes or Arrays).
    # options - :in_group_of (Integer) regroups a single flat row;
    #           :headers (Array) explicit header row. Remaining options are
    #           forwarded to Array#to_csv via the superclass.
    # NOTE(review): in_groups_of comes from lib/ext/array.rb — confirm it is
    # loaded wherever this view is used.
    def initialize(input, options = { })
      in_group_of_num = options.delete(:in_group_of)
      input = input.first.in_groups_of(in_group_of_num) if in_group_of_num && input.size == 1
      # Derive headers from the widest Hash row when not given explicitly.
      headers = options.delete(:headers) || input.select { |i| i.is_a? Hash }.max_by(&:size).keys
    rescue NoMethodError
      # No Hash rows: max_by returns nil and .keys raises — render headerless.
    ensure
      input = input.dup.unshift(headers) unless headers.nil?
      super(input, options)
    end

    # Formats one row (Hash values or Array elements) as a CSV line.
    def format(item)
      values = item.respond_to?(:values) ? item.values : item.to_a
      values.to_csv(@options)
    end
  end
end
require 'json'

module WebCrawler::View
  # Renders the input wrapped in a {responses: ...} JSON envelope.
  class Json < Base
    def render
      payload = { responses: input }
      payload.to_json
    end
  end
end
module WebCrawler::View
  # Renders the input items one per line as plain text.
  class Plain < Base

    def render
      lines = [*input]
      lines.join("\n")
    end

  end
end
require "fileutils"

module WebCrawler::View
  # "View" that executes a user-supplied script with access to the fetched
  # responses (used by the --run CLI option).
  class Runner < Base

    # Sandbox namespace the script is evaluated in; exposes #responses.
    module Space
      extend self
      attr_accessor :responses
    end

    def render
      # Resolve a relative script path against the current working directory.
      # BUG FIX: File.exists? was deprecated and removed in Ruby 3.2 —
      # File.exist? is the supported predicate.
      unless File.exist? @options['run']
        @options['run'] = File.expand_path @options['run'], FileUtils.pwd
      end

      Space.responses = input.freeze
      # File.read avoids leaking the handle that File.open(...).read left
      # open; passing the filename gives the script usable backtraces.
      Space.module_eval(File.read(@options['run']), @options['run'])
    end
  end
end