web_crawler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/Gemfile +11 -0
- data/README +1 -0
- data/Rakefile +2 -0
- data/bin/wcrawler +13 -0
- data/lib/ext/array.rb +100 -0
- data/lib/ext/hash.rb +45 -0
- data/lib/ext/http_response.rb +19 -0
- data/lib/web_crawler/application.rb +49 -0
- data/lib/web_crawler/batch_request.rb +63 -0
- data/lib/web_crawler/cache_adapter/base.rb +33 -0
- data/lib/web_crawler/cache_adapter/file.rb +52 -0
- data/lib/web_crawler/cache_adapter/memory.rb +23 -0
- data/lib/web_crawler/cache_adapter.rb +11 -0
- data/lib/web_crawler/cached_request.rb +30 -0
- data/lib/web_crawler/cli/thor_hooks.rb +94 -0
- data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
- data/lib/web_crawler/cli.rb +122 -0
- data/lib/web_crawler/configuration.rb +87 -0
- data/lib/web_crawler/factory_url.rb +58 -0
- data/lib/web_crawler/follower.rb +26 -0
- data/lib/web_crawler/handler.rb +45 -0
- data/lib/web_crawler/parsers/url.rb +52 -0
- data/lib/web_crawler/parsers.rb +5 -0
- data/lib/web_crawler/request.rb +59 -0
- data/lib/web_crawler/response.rb +45 -0
- data/lib/web_crawler/utility.rb +65 -0
- data/lib/web_crawler/version.rb +9 -0
- data/lib/web_crawler/view/csv.rb +20 -0
- data/lib/web_crawler/view/json.rb +9 -0
- data/lib/web_crawler/view/plain.rb +9 -0
- data/lib/web_crawler/view/runner.rb +20 -0
- data/lib/web_crawler/view/table.rb +69 -0
- data/lib/web_crawler/view/xml.rb +38 -0
- data/lib/web_crawler/view.rb +44 -0
- data/lib/web_crawler.rb +38 -0
- data/spec/fake_web_generator.rb +44 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/web_crawler/batch_request_spec.rb +45 -0
- data/spec/web_crawler/cached_request_spec.rb +31 -0
- data/spec/web_crawler/factory_url_spec.rb +34 -0
- data/spec/web_crawler/follow_spec.rb +32 -0
- data/spec/web_crawler/request_spec.rb +29 -0
- data/spec/web_crawler/response_spec.rb +27 -0
- data/spec/web_crawler/url_parser_spec.rb +41 -0
- data/spec/web_crawler/view_spec.rb +95 -0
- data/web_crawler.gemspec +30 -0
- metadata +151 -0
data/lib/web_crawler/view/table.rb
ADDED
@@ -0,0 +1,69 @@
+require 'csv'
+
+module WebCrawler::View
+  # Render a table.
+  #
+  # ==== Parameters
+  # Array[Array[String, String, ...]]
+  #
+  # ==== Options
+  # ident<Integer>:: Indent the first column by ident value.
+  # colwidth<Integer>:: Force the first column to colwidth spaces wide.
+  #
+  class Table < Base
+
+    def render
+      format_table(@input)
+    end
+
+    protected
+
+    def format_table(table)
+      return if table.empty?
+
+      formats, ident, colwidth = [], @options[:ident].to_i, @options[:colwidth]
+      @options[:truncate] = terminal_width if @options[:truncate] == true
+
+      formats << "%-#{colwidth + 2}s" if colwidth
+      start = colwidth ? 1 : 0
+
+      start.upto(table.first.length - 2) do |i|
+        maxima ||= table.max { |a, b| a[i].size <=> b[i].size }[i].size
+        formats << "%-#{maxima + 2}s"
+      end
+
+      formats[0] = formats[0].insert(0, " " * ident)
+      formats << "%s"
+
+      table.map do |row|
+        sentence = ""
+
+        row.each_with_index do |column, i|
+          sentence << formats[i] % column.to_s
+        end
+
+        sentence = truncate(sentence, @options[:truncate]) if @options[:truncate]
+        sentence
+      end.join "\n"
+    end
+
+    def terminal_width
+      if ENV['THOR_COLUMNS']
+        result = ENV['THOR_COLUMNS'].to_i
+      else
+        result = unix? ? dynamic_width : 80
+      end
+      (result < 10) ? 80 : result
+    rescue
+      80
+    end
+
+    def truncate(string, width)
+      if string.length <= width
+        string
+      else
+        (string[0, width-3] || "") + "..."
+      end
+    end
+  end
+end
data/lib/web_crawler/view/xml.rb
ADDED
@@ -0,0 +1,38 @@
+module WebCrawler::View
+  class Xml < Base
+
+    self.default_options = { pretty: true }
+
+    def render
+      @options[:headers] ||= input.max_by(&:size).each_with_index.map { |_, index| "field_#{index+1}" }
+      "<responses>#{pretty}#{super}</responses>"
+    end
+
+    def format(item)
+      response_tag item.is_a?(Hash) ? item : Hash[@options[:headers].zip item]
+    end
+
+    protected
+
+    def response_tag(hash)
+      tag(:response) do
+        hash.map do |tag, value|
+          "<#{tag}>#{value}</#{tag}>"
+        end.join
+      end + pretty
+    end
+
+    def pretty
+      @options[:pretty] ? "\n" : ""
+    end
+
+    def tag(name, value="", &block)
+      value << block.call if block_given?
+      unless value.empty?
+        "<#{name}>#{value}</#{name}>"
+      else
+        "<#{name}/>"
+      end
+    end
+  end
+end
data/lib/web_crawler/view.rb
ADDED
@@ -0,0 +1,44 @@
+module WebCrawler::View
+
+  autoload :Csv, 'web_crawler/view/csv'
+  autoload :Json, 'web_crawler/view/json'
+  autoload :Xml, 'web_crawler/view/xml'
+  autoload :Plain, 'web_crawler/view/plain'
+  autoload :Table, 'web_crawler/view/table'
+  autoload :Runner, 'web_crawler/view/runner'
+
+  extend self
+
+  def factory(type, *args, &block)
+    const_get(WebCrawler::Utility.camelize(type).to_sym).new(*args, &block)
+  end
+
+  class Base
+    attr_reader :input
+
+    class << self
+      attr_accessor :default_options
+      def default_options
+        @default_options ||= { }
+      end
+    end
+
+    def initialize(input, options = { })
+      @options = self.class.default_options.merge (options || { })
+      @input = input
+    end
+
+    def render
+      [*input].map { |i| format(i) }.join
+    end
+
+    def draw(output=$stdout)
+      output.puts render
+    end
+
+    def format(item)
+      item
+    end
+  end
+
+end
data/lib/web_crawler.rb
ADDED
@@ -0,0 +1,38 @@
+require "net/http"
+require "net/https"
+require 'uri'
+require 'forwardable'
+
+require "ext/hash"
+require "ext/array"
+require "ext/http_response"
+
+module WebCrawler
+  autoload :Request, 'web_crawler/request'
+  autoload :CachedRequest, 'web_crawler/cached_request'
+  autoload :Response, 'web_crawler/response'
+  autoload :BatchRequest, 'web_crawler/batch_request'
+  autoload :Handler, 'web_crawler/handler'
+  autoload :HandlerParser, 'web_crawler/handler'
+  autoload :CacheAdapter, 'web_crawler/cache_adapter'
+  autoload :Configurable, 'web_crawler/configuration'
+  autoload :Configuration, 'web_crawler/configuration'
+
+  autoload :FactoryUrl, 'web_crawler/factory_url'
+  autoload :Follower, 'web_crawler/follower'
+  autoload :Parsers, 'web_crawler/parsers'
+  autoload :Utility, 'web_crawler/utility'
+
+  autoload :View, 'web_crawler/view'
+  autoload :CLI, 'web_crawler/cli'
+  autoload :Application, 'web_crawler/application'
+
+  include Configurable
+  extend Utility
+
+  def self.logger
+    config.logger
+  end
+
+end
+
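The top-level file wires the gem together: it autoloads the public classes, mixes in Configurable (which provides WebCrawler.configure and config), and exposes a shared logger. A short usage sketch modeled on spec/spec_helper.rb and batch_request_spec.rb below; the URLs are placeholders, and live requests of course need network access:

require 'web_crawler'

# Same configuration style as spec/spec_helper.rb: keep responses in memory.
WebCrawler.configure do
  config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
end

# BatchRequest#process returns an array of WebCrawler::Response objects.
responses = WebCrawler::BatchRequest.new(['example.com/1', 'example.com/2']).process
responses.each { |r| puts "#{r.url} #{r.code}" }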
data/spec/fake_web_generator.rb
ADDED
@@ -0,0 +1,44 @@
+module FakeWebGenerator
+
+  def self.included(base)
+    generate_web(['http://otherhost.ru/1',
+                  'http://otherhost.ru/2',
+                  'http://otherhost.ru/3',
+                  'http://example.com/1',
+                  'http://example.com/2',
+                  'http://example.com/3',
+                  'http://example.com/2323.html',
+                  'http://example.com/2323.html?rr=1',
+                  'http://example.com/follower?rr=1'])
+
+    FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+  end
+
+  def generate_web(urls)
+    @@known_web_urls ||= []
+    @@known_web_urls << urls
+    @@known_web_urls.flatten!
+    @@known_web_urls.uniq!
+
+    urls.each do |url|
+      FakeWeb.register_uri(:get, url, :body => "Example body for url #{url}")
+    end
+  end
+  module_function :generate_web
+
+  def follower_body
+    "Example body for http://example.com/follower" <<
+      @@known_web_urls.map { |u| "<a href='#{u}'>link text</a>" }.join("\n")
+  end
+  module_function :follower_body
+
+  def urls_board_path
+    'http://example.com/follower'
+  end
+  module_function :urls_board_path
+
+  def known_urls
+    @@known_web_urls
+  end
+
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
+$:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "/../lib")))
+
+require 'rspec'
+require "web_crawler"
+require "fake_web"
+
+require 'fake_web_generator'
+
+RSpec.configure do |c|
+  c.mock_with :rspec
+  c.include FakeWebGenerator
+end
+
+WebCrawler.configure do
+  config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+end
+
data/spec/web_crawler/batch_request_spec.rb
ADDED
@@ -0,0 +1,45 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+describe WebCrawler::BatchRequest do
+
+  let(:urls) { ['example.com', 'example.com/1', 'example.com/2'] }
+  let(:http_response) { Net::HTTPResponse.new('', '', '') }
+  let(:responses) { urls.map { |url| WebCrawler::Response.new(url, http_response) } }
+
+  def response(url)
+    WebCrawler::Response.new(url, http_response)
+  end
+
+  def request(url)
+    WebCrawler::Request.new(url).stub(:process).and_return(response(url))
+  end
+
+  subject { described_class.new(urls) }
+
+  it "should initialize batch of requests for given urls" do
+    subject.requests.should be_a Array
+    subject.requests.should have(3).members
+    subject.requests.all? { |r| r.is_a? WebCrawler::Request }.should be_true
+  end
+
+  it "should process requests" do
+    subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
+    subject.process.should be_a Array
+    subject.process.first.should be_a WebCrawler::Response
+  end
+
+  it "should accept :parser option with parser class or object" do
+    class ::TestParser
+      def parse(resp)
+        resp.to_s + ' parsed'
+      end
+    end
+    described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
+                                                                         "Example body1 parsed",
+                                                                         "Example body for url http://example.com/2 parsed"]
+  end
+end
data/spec/web_crawler/cached_request_spec.rb
ADDED
@@ -0,0 +1,31 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+FakeWeb.allow_net_connect = false
+
+describe 'Cached requests' do
+
+  let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }
+
+  it 'should not send requests to the web if cache exists' do
+    FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/1"
+
+    FakeWeb.should_receive(:response_for).with(:get, "http://example.com/1").and_return { first_response }
+
+    lambda {
+      WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
+    }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+
+    FakeWeb.should_not_receive(:response_for)
+
+    WebCrawler::config.cache_adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/1"), first_response))
+
+    cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
+    WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+  end
+
+end
data/spec/web_crawler/factory_url_spec.rb
ADDED
@@ -0,0 +1,34 @@
+require "spec_helper"
+
+describe WebCrawler::FactoryUrl do
+
+  it "should generate urls with block" do
+    first_param = [1,2,3]
+    second_param = 10...15
+
+    factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
+      random = rand(3000)
+      "www.example.com/%s/%s.html?rid=#{random}" % args
+    end
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.size.should == 15
+    urls.should have(factory.params.size).urls
+    urls.first.should =~ /www\.example\.com\/1\/10\.html/
+  end
+
+  it "should generate urls with pattern" do
+    first_param = [1,2,3]
+    second_param = 10...15
+
+    factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.size.should == 15
+    urls.should have(factory.params.size).urls
+    urls.first.should == "www.example.com/1/10.html"
+  end
+
+end
data/spec/web_crawler/follow_spec.rb
ADDED
@@ -0,0 +1,32 @@
+require "spec_helper"
+
+
+describe WebCrawler::Follower do
+
+  it "should collect all uniques urls from responses" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses).collect
+
+    urls.first.should have(9).urls
+    urls.first.should == known_urls
+  end
+
+  it "should collect all the unique url with same host like in responses" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses, same_host: true).collect
+
+    urls.first.should have(6).urls
+    urls.first.should == known_urls.reject { |u| u =~ /otherhost/ }
+  end
+
+  it "should process requests for following urls" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    follower = WebCrawler::Follower.new responses
+    responses += follower.process
+
+    responses.should have(10).responses
+    responses.first.should be_a WebCrawler::Response
+    responses.first.url.to_s.should == urls_board_path
+    responses.last.url.to_s.should == known_urls.last
+  end
+end
data/spec/web_crawler/request_spec.rb
ADDED
@@ -0,0 +1,29 @@
+require "spec_helper"
+
+describe WebCrawler::Request do
+
+  let(:success_url) { 'example.com/success' }
+  let(:failure_url) { 'example.com/failure' }
+
+  before(:each) do
+    @body = "Example body"
+    FakeWeb.register_uri(:get, "http://example.com/success", :body => @body, :status => ["200", "OK"])
+    FakeWeb.register_uri(:get, "http://example.com/failure", :body => @body, :status => ["503", "Internal error"])
+  end
+
+  subject { WebCrawler::Request.new(success_url) }
+
+  it "should fetch the url" do
+    subject.process.should be_a WebCrawler::Response
+    subject.process.body.should be @body
+  end
+
+  it "should be success" do
+    subject.process.should be_success
+  end
+
+  it "should be failure" do
+    WebCrawler::Request.new(failure_url).process.should be_failure
+  end
+
+end
data/spec/web_crawler/response_spec.rb
ADDED
@@ -0,0 +1,27 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+
+describe WebCrawler::Response do
+
+  let(:url) { 'example.com' }
+  subject { WebCrawler::Request.new(url).process }
+
+  it "should initialize with url and response" do
+    described_class.new url, Net::HTTPResponse.new('', '', '')
+  end
+
+  it "should respond to HTTPResponse methods" do
+    [:body, :http_version, :code, :message, :msg, :code_type].each do |meth|
+      subject.should respond_to meth
+    end
+  end
+
+  it "#to_s should be String and equal to #body and not equal to #inspect" do
+    subject.to_s.should be_a String
+    subject.to_s.should be subject.body
+    subject.to_s.should_not be subject.inspect
+  end
+
+end
data/spec/web_crawler/url_parser_spec.rb
ADDED
@@ -0,0 +1,41 @@
+require "spec_helper"
+
+describe WebCrawler::Parsers::Url do
+
+  let(:host) { 'example.com' }
+  let(:http_host) { 'http://example.com' }
+  let(:https_host) { 'https://example.com/' }
+  let(:current_page) { '/news/1000.html' }
+
+  it "should add scheme to url" do
+    described_class.new(host).host.to_s.should == 'http://example.com'
+    described_class.new(host, secure: true).host.to_s.should == 'https://example.com'
+  end
+
+  it "should parse scheme from url and set @scheme" do
+    described_class.new(https_host).scheme.should == 'https'
+    described_class.new(host, secure: true).scheme.should == 'https'
+    described_class.new(http_host).scheme.should == 'http'
+    described_class.new(host).scheme.should == 'http'
+  end
+
+  it "should return nil if host not equal to initial host" do
+    described_class.new(host, same_host: true).normalize('example.ru/news?sid=1').should be_nil
+    described_class.new(host, same_host: true).normalize('http://example.ru/news?sid=1').should be_nil
+    described_class.new(host, same_host: true).normalize('https://example.ru/news?sid=1').should be_nil
+  end
+
+  it "should join request_uri to initial host" do
+    described_class.new(https_host).normalize('/news').should == 'https://example.com/news'
+    described_class.new(https_host).normalize('/news?sid=1').should == 'https://example.com/news?sid=1'
+    described_class.new(https_host).normalize('/news?sid=1#anchor').should == 'https://example.com/news?sid=1#anchor'
+  end
+
+  it "should join query string to initial current page" do
+    described_class.new(host, url: current_page).normalize('?sid=1').should == 'http://example.com/news/1000.html?sid=1'
+  end
+
+  it "should join fragment string to initial current page" do
+    described_class.new(host, url: current_page).normalize('#anchor').should == 'http://example.com/news/1000.html#anchor'
+  end
+end
data/spec/web_crawler/view_spec.rb
ADDED
@@ -0,0 +1,95 @@
+require "spec_helper"
+
+
+describe WebCrawler::View::Csv do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n" }] }
+
+  it "should render input array to csv string" do
+    described_class.new(input).render.should == "1,2,3\nstring,\"other string\n\"\n"
+  end
+
+  it "should render input hash to csv string" do
+    described_class.new(input_hash).render.should == "title,url,author\n1,2,3\nstring,\"other string\n\"\n"
+  end
+
+  it "should render input array to csv string with options" do
+    described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+    described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+  end
+
+end
+
+describe WebCrawler::View::Json do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+  it "should render input array to json string" do
+    described_class.new(input, headers: [:title, :url, :author]).render.should == '{"responses":[[1,2,"3"],["string","other string\n"]]}'
+  end
+
+  it "should render input hash to json string" do
+    json = described_class.new(input_hash).render
+    json.should == "{\"responses\":[{\"title\":1,\"url\":2,\"author\":3},{\"title\":\"string\",\"url\":\"other string\\n\",\"author\":null}]}"
+    hash = JSON.parse(json).symbolize_keys
+    hash[:responses].each(&:symbolize_keys!)
+    hash.should == { responses: input_hash }
+  end
+end
+
+
+describe WebCrawler::View::Xml do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+  it "should render input array to xml string" do
+    xml = "<responses>" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>" <<
+          "</responses>"
+    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+  end
+
+  it "should render input array to pretty xml string" do
+    xml = "<responses>\n" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+          "</responses>"
+    described_class.new(input, headers: [:title, :url, :author], pretty: true).render.should == xml
+  end
+
+  it "should render input array without :headers to xml string" do
+    xml = "<responses>\n" <<
+          "<response><field_1>1</field_1><field_2>2</field_2><field_3>3</field_3></response>\n" <<
+          "<response><field_1>string</field_1><field_2>other string\n</field_2><field_3></field_3></response>\n" <<
+          "</responses>"
+    described_class.new(input, pretty: true).render.should == xml
+  end
+
+  it "should render input hash to xml string" do
+    xml = "<responses>\n" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+          "</responses>"
+    described_class.new(input_hash, pretty: true).render.should == xml
+  end
+end
+
+describe WebCrawler::View do
+
+  it "should factory a view from view type" do
+    WebCrawler::View.factory('json', [1, 2, 3]).should be_a WebCrawler::View::Json
+    WebCrawler::View.factory('xml', [1, 2, 3]).should be_a WebCrawler::View::Xml
+    WebCrawler::View.factory('table', [1, 2, 3]).should be_a WebCrawler::View::Table
+  end
+
+  it "should draw view to custom output" do
+    output = ""
+    io = StringIO.new(output)
+    WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
+    output.should == "{\"responses\":[[1,2,3]]}\n"
+  end
+end
data/web_crawler.gemspec
ADDED
@@ -0,0 +1,30 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "web_crawler/version"
+
+Gem::Specification.new do |s|
+  s.name = "web_crawler"
+  s.version = WebCrawler::VERSION::STRING
+  s.platform = Gem::Platform::RUBY
+  s.authors = ["Anton Sozontov"]
+  s.email = ["a.sozontov@gmail.com"]
+  s.homepage = ""
+  s.summary = %q{Web crawler help you with parse and collect data from the web}
+  s.description = %q{Web crawler help you with parse and collect data from the web}
+
+  s.rubyforge_project = "web_crawler"
+
+  s.files = `git ls-files`.split("\n")
+  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.has_rdoc = false
+
+  s.bindir = "bin"
+
+  s.add_dependency 'thor'
+
+  s.add_development_dependency(%q<rspec>, [">=2.6"])
+  s.add_development_dependency(%q<fakeweb>)
+end