web_crawler 0.2.0
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/Gemfile +11 -0
- data/README +1 -0
- data/Rakefile +2 -0
- data/bin/wcrawler +13 -0
- data/lib/ext/array.rb +100 -0
- data/lib/ext/hash.rb +45 -0
- data/lib/ext/http_response.rb +19 -0
- data/lib/web_crawler/application.rb +49 -0
- data/lib/web_crawler/batch_request.rb +63 -0
- data/lib/web_crawler/cache_adapter/base.rb +33 -0
- data/lib/web_crawler/cache_adapter/file.rb +52 -0
- data/lib/web_crawler/cache_adapter/memory.rb +23 -0
- data/lib/web_crawler/cache_adapter.rb +11 -0
- data/lib/web_crawler/cached_request.rb +30 -0
- data/lib/web_crawler/cli/thor_hooks.rb +94 -0
- data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
- data/lib/web_crawler/cli.rb +122 -0
- data/lib/web_crawler/configuration.rb +87 -0
- data/lib/web_crawler/factory_url.rb +58 -0
- data/lib/web_crawler/follower.rb +26 -0
- data/lib/web_crawler/handler.rb +45 -0
- data/lib/web_crawler/parsers/url.rb +52 -0
- data/lib/web_crawler/parsers.rb +5 -0
- data/lib/web_crawler/request.rb +59 -0
- data/lib/web_crawler/response.rb +45 -0
- data/lib/web_crawler/utility.rb +65 -0
- data/lib/web_crawler/version.rb +9 -0
- data/lib/web_crawler/view/csv.rb +20 -0
- data/lib/web_crawler/view/json.rb +9 -0
- data/lib/web_crawler/view/plain.rb +9 -0
- data/lib/web_crawler/view/runner.rb +20 -0
- data/lib/web_crawler/view/table.rb +69 -0
- data/lib/web_crawler/view/xml.rb +38 -0
- data/lib/web_crawler/view.rb +44 -0
- data/lib/web_crawler.rb +38 -0
- data/spec/fake_web_generator.rb +44 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/web_crawler/batch_request_spec.rb +45 -0
- data/spec/web_crawler/cached_request_spec.rb +31 -0
- data/spec/web_crawler/factory_url_spec.rb +34 -0
- data/spec/web_crawler/follow_spec.rb +32 -0
- data/spec/web_crawler/request_spec.rb +29 -0
- data/spec/web_crawler/response_spec.rb +27 -0
- data/spec/web_crawler/url_parser_spec.rb +41 -0
- data/spec/web_crawler/view_spec.rb +95 -0
- data/web_crawler.gemspec +30 -0
- metadata +151 -0
data/lib/web_crawler/view/table.rb
ADDED
@@ -0,0 +1,69 @@
+require 'csv'
+
+module WebCrawler::View
+  # Render a table.
+  #
+  # ==== Parameters
+  # Array[Array[String, String, ...]]
+  #
+  # ==== Options
+  # ident<Integer>:: Indent the first column by ident value.
+  # colwidth<Integer>:: Force the first column to colwidth spaces wide.
+  #
+  class Table < Base
+
+    def render
+      format_table(@input)
+    end
+
+    protected
+
+    def format_table(table)
+      return if table.empty?
+
+      formats, ident, colwidth = [], @options[:ident].to_i, @options[:colwidth]
+      @options[:truncate] = terminal_width if @options[:truncate] == true
+
+      formats << "%-#{colwidth + 2}s" if colwidth
+      start = colwidth ? 1 : 0
+
+      start.upto(table.first.length - 2) do |i|
+        maxima ||= table.max { |a, b| a[i].size <=> b[i].size }[i].size
+        formats << "%-#{maxima + 2}s"
+      end
+
+      formats[0] = formats[0].insert(0, " " * ident)
+      formats << "%s"
+
+      table.map do |row|
+        sentence = ""
+
+        row.each_with_index do |column, i|
+          sentence << formats[i] % column.to_s
+        end
+
+        sentence = truncate(sentence, @options[:truncate]) if @options[:truncate]
+        sentence
+      end.join "\n"
+    end
+
+    def terminal_width
+      if ENV['THOR_COLUMNS']
+        result = ENV['THOR_COLUMNS'].to_i
+      else
+        result = unix? ? dynamic_width : 80
+      end
+      (result < 10) ? 80 : result
+    rescue
+      80
+    end
+
+    def truncate(string, width)
+      if string.length <= width
+        string
+      else
+        (string[0, width-3] || "") + "..."
+      end
+    end
+  end
+end
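For context, a minimal usage sketch of this view (not part of the diff; it assumes `require 'web_crawler'` has run and uses the View.factory helper added in data/lib/web_crawler/view.rb below — the rows and :ident value are made up for illustration):

    # Render a small result set as an aligned two-column table on stdout.
    rows = [%w[code url], ["200", "http://example.com/1"], ["503", "http://example.com/failure"]]
    WebCrawler::View.factory('table', rows, ident: 2).draw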
data/lib/web_crawler/view/xml.rb
ADDED
@@ -0,0 +1,38 @@
+module WebCrawler::View
+  class Xml < Base
+
+    self.default_options = { pretty: true }
+
+    def render
+      @options[:headers] ||= input.max_by(&:size).each_with_index.map { |_, index| "field_#{index+1}" }
+      "<responses>#{pretty}#{super}</responses>"
+    end
+
+    def format(item)
+      response_tag item.is_a?(Hash) ? item : Hash[@options[:headers].zip item]
+    end
+
+    protected
+
+    def response_tag(hash)
+      tag(:response) do
+        hash.map do |tag, value|
+          "<#{tag}>#{value}</#{tag}>"
+        end.join
+      end + pretty
+    end
+
+    def pretty
+      @options[:pretty] ? "\n" : ""
+    end
+
+    def tag(name, value="", &block)
+      value << block.call if block_given?
+      unless value.empty?
+        "<#{name}>#{value}</#{name}>"
+      else
+        "<#{name}/>"
+      end
+    end
+  end
+end
data/lib/web_crawler/view.rb
ADDED
@@ -0,0 +1,44 @@
+module WebCrawler::View
+
+  autoload :Csv, 'web_crawler/view/csv'
+  autoload :Json, 'web_crawler/view/json'
+  autoload :Xml, 'web_crawler/view/xml'
+  autoload :Plain, 'web_crawler/view/plain'
+  autoload :Table, 'web_crawler/view/table'
+  autoload :Runner, 'web_crawler/view/runner'
+
+  extend self
+
+  def factory(type, *args, &block)
+    const_get(WebCrawler::Utility.camelize(type).to_sym).new(*args, &block)
+  end
+
+  class Base
+    attr_reader :input
+
+    class << self
+      attr_accessor :default_options
+      def default_options
+        @default_options ||= { }
+      end
+    end
+
+    def initialize(input, options = { })
+      @options = self.class.default_options.merge (options || { })
+      @input = input
+    end
+
+    def render
+      [*input].map { |i| format(i) }.join
+    end
+
+    def draw(output=$stdout)
+      output.puts render
+    end
+
+    def format(item)
+      item
+    end
+  end
+
+end
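The factory/draw pair above is exercised by data/spec/web_crawler/view_spec.rb further down; a small sketch along the same lines (not part of the diff, expected output taken from that spec):

    require 'stringio'
    require 'web_crawler'

    # Build a JSON view and write it to any IO object instead of stdout.
    io = StringIO.new
    WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
    io.string  # => "{\"responses\":[[1,2,3]]}\n"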
data/lib/web_crawler.rb
ADDED
@@ -0,0 +1,38 @@
+require "net/http"
+require "net/https"
+require 'uri'
+require 'forwardable'
+
+require "ext/hash"
+require "ext/array"
+require "ext/http_response"
+
+module WebCrawler
+  autoload :Request, 'web_crawler/request'
+  autoload :CachedRequest, 'web_crawler/cached_request'
+  autoload :Response, 'web_crawler/response'
+  autoload :BatchRequest, 'web_crawler/batch_request'
+  autoload :Handler, 'web_crawler/handler'
+  autoload :HandlerParser, 'web_crawler/handler'
+  autoload :CacheAdapter, 'web_crawler/cache_adapter'
+  autoload :Configurable, 'web_crawler/configuration'
+  autoload :Configuration, 'web_crawler/configuration'
+
+  autoload :FactoryUrl, 'web_crawler/factory_url'
+  autoload :Follower, 'web_crawler/follower'
+  autoload :Parsers, 'web_crawler/parsers'
+  autoload :Utility, 'web_crawler/utility'
+
+  autoload :View, 'web_crawler/view'
+  autoload :CLI, 'web_crawler/cli'
+  autoload :Application, 'web_crawler/application'
+
+  include Configurable
+  extend Utility
+
+  def self.logger
+    config.logger
+  end
+
+end
+
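Putting the pieces together, a hedged end-to-end sketch (not part of the diff; the hosts are placeholders, and the configure block mirrors data/spec/spec_helper.rb below):

    require 'web_crawler'

    # Use the in-memory cache adapter, as the spec helper in this release does.
    WebCrawler.configure do
      config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
    end

    # Fetch a batch of pages, then collect the links they point to.
    responses = WebCrawler::BatchRequest.new(['example.com/1', 'example.com/2']).process
    links     = WebCrawler::Follower.new(responses).collect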
data/spec/fake_web_generator.rb
ADDED
@@ -0,0 +1,44 @@
+module FakeWebGenerator
+
+  def self.included(base)
+    generate_web(['http://otherhost.ru/1',
+                  'http://otherhost.ru/2',
+                  'http://otherhost.ru/3',
+                  'http://example.com/1',
+                  'http://example.com/2',
+                  'http://example.com/3',
+                  'http://example.com/2323.html',
+                  'http://example.com/2323.html?rr=1',
+                  'http://example.com/follower?rr=1'])
+
+    FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+  end
+
+  def generate_web(urls)
+    @@known_web_urls ||= []
+    @@known_web_urls << urls
+    @@known_web_urls.flatten!
+    @@known_web_urls.uniq!
+
+    urls.each do |url|
+      FakeWeb.register_uri(:get, url, :body => "Example body for url #{url}")
+    end
+  end
+  module_function :generate_web
+
+  def follower_body
+    "Example body for http://example.com/follower" <<
+      @@known_web_urls.map { |u| "<a href='#{u}'>link text</a>" }.join("\n")
+  end
+  module_function :follower_body
+
+  def urls_board_path
+    'http://example.com/follower'
+  end
+  module_function :urls_board_path
+
+  def known_urls
+    @@known_web_urls
+  end
+
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
+$:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "/../lib")))
+
+require 'rspec'
+require "web_crawler"
+require "fake_web"
+
+require 'fake_web_generator'
+
+RSpec.configure do |c|
+  c.mock_with :rspec
+  c.include FakeWebGenerator
+end
+
+WebCrawler.configure do
+  config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+end
+
data/spec/web_crawler/batch_request_spec.rb
ADDED
@@ -0,0 +1,45 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+describe WebCrawler::BatchRequest do
+
+  let(:urls) { ['example.com', 'example.com/1', 'example.com/2'] }
+  let(:http_response) { Net::HTTPResponse.new('', '', '') }
+  let(:responses) { urls.map { |url| WebCrawler::Response.new(url, http_response) } }
+
+  def response(url)
+    WebCrawler::Response.new(url, http_response)
+  end
+
+  def request(url)
+    WebCrawler::Request.new(url).stub(:process).and_return(response(url))
+  end
+
+  subject { described_class.new(urls) }
+
+  it "should initialize batch of requests for given urls" do
+    subject.requests.should be_a Array
+    subject.requests.should have(3).members
+    subject.requests.all? { |r| r.is_a? WebCrawler::Request }.should be_true
+  end
+
+  it "should process requests" do
+    subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
+    subject.process.should be_a Array
+    subject.process.first.should be_a WebCrawler::Response
+  end
+
+  it "should accept :parser option with parser class or object" do
+    class ::TestParser
+      def parse(resp)
+        resp.to_s + ' parsed'
+      end
+    end
+    described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
+                                                                         "Example body1 parsed",
+                                                                         "Example body for url http://example.com/2 parsed"]
+  end
+end
data/spec/web_crawler/cached_request_spec.rb
ADDED
@@ -0,0 +1,31 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+FakeWeb.allow_net_connect = false
+
+describe 'Cached requests' do
+
+  let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }
+
+  it 'should not send requests to the web if cache exists' do
+    FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/1"
+
+    FakeWeb.should_receive(:response_for).with(:get, "http://example.com/1").and_return { first_response }
+
+    lambda {
+      WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
+    }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+
+    FakeWeb.should_not_receive(:response_for)
+
+    WebCrawler::config.cache_adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/1"), first_response))
+
+    cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
+    WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+  end
+
+end
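Outside of the spec, the cached code path would presumably be driven like this (a sketch only; it assumes a cache adapter has been configured as in data/spec/spec_helper.rb above, and the URL is a placeholder):

    # First call hits the network; the second should be answered from the cache adapter.
    first = WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first
    again = WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first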
data/spec/web_crawler/factory_url_spec.rb
ADDED
@@ -0,0 +1,34 @@
+require "spec_helper"
+
+describe WebCrawler::FactoryUrl do
+
+  it "should generate urls with block" do
+    first_param = [1,2,3]
+    second_param = 10...15
+
+    factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
+      random = rand(3000)
+      "www.example.com/%s/%s.html?rid=#{random}" % args
+    end
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.size.should == 15
+    urls.should have(factory.params.size).urls
+    urls.first.should =~ /www\.example\.com\/1\/10\.html/
+  end
+
+  it "should generate urls with pattern" do
+    first_param = [1,2,3]
+    second_param = 10...15
+
+    factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.size.should == 15
+    urls.should have(factory.params.size).urls
+    urls.first.should == "www.example.com/1/10.html"
+  end
+
+end
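A sketch of the pattern form outside RSpec (not part of the diff; the host is a placeholder):

    # Expand "$1"/"$2" against every combination of the two parameter sets.
    urls = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", [1, 2, 3], 10...15).factory
    # => ["www.example.com/1/10.html", ...]  (15 urls in total)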
data/spec/web_crawler/follow_spec.rb
ADDED
@@ -0,0 +1,32 @@
+require "spec_helper"
+
+
+describe WebCrawler::Follower do
+
+  it "should collect all uniques urls from responses" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses).collect
+
+    urls.first.should have(9).urls
+    urls.first.should == known_urls
+  end
+
+  it "should collect all the unique url with same host like in responses" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses, same_host: true).collect
+
+    urls.first.should have(6).urls
+    urls.first.should == known_urls.reject { |u| u =~ /otherhost/ }
+  end
+
+  it "should process requests for following urls" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    follower = WebCrawler::Follower.new responses
+    responses += follower.process
+
+    responses.should have(10).responses
+    responses.first.should be_a WebCrawler::Response
+    responses.first.url.to_s.should == urls_board_path
+    responses.last.url.to_s.should == known_urls.last
+  end
+end
data/spec/web_crawler/request_spec.rb
ADDED
@@ -0,0 +1,29 @@
+require "spec_helper"
+
+describe WebCrawler::Request do
+
+  let(:success_url) { 'example.com/success' }
+  let(:failure_url) { 'example.com/failure' }
+
+  before(:each) do
+    @body = "Example body"
+    FakeWeb.register_uri(:get, "http://example.com/success", :body => @body, :status => ["200", "OK"])
+    FakeWeb.register_uri(:get, "http://example.com/failure", :body => @body, :status => ["503", "Internal error"])
+  end
+
+  subject { WebCrawler::Request.new(success_url) }
+
+  it "should fetch the url" do
+    subject.process.should be_a WebCrawler::Response
+    subject.process.body.should be @body
+  end
+
+  it "should be success" do
+    subject.process.should be_success
+  end
+
+  it "should be failure" do
+    WebCrawler::Request.new(failure_url).process.should be_failure
+  end
+
+end
data/spec/web_crawler/response_spec.rb
ADDED
@@ -0,0 +1,27 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+
+describe WebCrawler::Response do
+
+  let(:url) { 'example.com' }
+  subject { WebCrawler::Request.new(url).process }
+
+  it "should initialize with url and response" do
+    described_class.new url, Net::HTTPResponse.new('', '', '')
+  end
+
+  it "should respond to HTTPResponse methods" do
+    [:body, :http_version, :code, :message, :msg, :code_type].each do |meth|
+      subject.should respond_to meth
+    end
+  end
+
+  it "#to_s should be String and equal to #body and not equal to #inspect" do
+    subject.to_s.should be_a String
+    subject.to_s.should be subject.body
+    subject.to_s.should_not be subject.inspect
+  end
+
+end
data/spec/web_crawler/url_parser_spec.rb
ADDED
@@ -0,0 +1,41 @@
+require "spec_helper"
+
+describe WebCrawler::Parsers::Url do
+
+  let(:host) { 'example.com' }
+  let(:http_host) { 'http://example.com' }
+  let(:https_host) { 'https://example.com/' }
+  let(:current_page) { '/news/1000.html' }
+
+  it "should add scheme to url" do
+    described_class.new(host).host.to_s.should == 'http://example.com'
+    described_class.new(host, secure: true).host.to_s.should == 'https://example.com'
+  end
+
+  it "should parse scheme from url and set @scheme" do
+    described_class.new(https_host).scheme.should == 'https'
+    described_class.new(host, secure: true).scheme.should == 'https'
+    described_class.new(http_host).scheme.should == 'http'
+    described_class.new(host).scheme.should == 'http'
+  end
+
+  it "should return nil if host not equal to initial host" do
+    described_class.new(host, same_host: true).normalize('example.ru/news?sid=1').should be_nil
+    described_class.new(host, same_host: true).normalize('http://example.ru/news?sid=1').should be_nil
+    described_class.new(host, same_host: true).normalize('https://example.ru/news?sid=1').should be_nil
+  end
+
+  it "should join request_uri to initial host" do
+    described_class.new(https_host).normalize('/news').should == 'https://example.com/news'
+    described_class.new(https_host).normalize('/news?sid=1').should == 'https://example.com/news?sid=1'
+    described_class.new(https_host).normalize('/news?sid=1#anchor').should == 'https://example.com/news?sid=1#anchor'
+  end
+
+  it "should join query string to initial current page" do
+    described_class.new(host, url: current_page).normalize('?sid=1').should == 'http://example.com/news/1000.html?sid=1'
+  end
+
+  it "should join fragment string to initial current page" do
+    described_class.new(host, url: current_page).normalize('#anchor').should == 'http://example.com/news/1000.html#anchor'
+  end
+end
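The same parser calls outside RSpec (a sketch that simply mirrors the expectations above):

    # Relative paths are resolved against the initial host; foreign hosts are rejected.
    parser = WebCrawler::Parsers::Url.new('https://example.com/')
    parser.normalize('/news?sid=1')  # => "https://example.com/news?sid=1"

    strict = WebCrawler::Parsers::Url.new('example.com', same_host: true)
    strict.normalize('http://example.ru/news?sid=1')  # => nil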
data/spec/web_crawler/view_spec.rb
ADDED
@@ -0,0 +1,95 @@
+require "spec_helper"
+
+
+describe WebCrawler::View::Csv do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n" }] }
+
+  it "should render input array to csv string" do
+    described_class.new(input).render.should == "1,2,3\nstring,\"other string\n\"\n"
+  end
+
+  it "should render input hash to csv string" do
+    described_class.new(input_hash).render.should == "title,url,author\n1,2,3\nstring,\"other string\n\"\n"
+  end
+
+  it "should render input array to csv string with options" do
+    described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+    described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+  end
+
+end
+
+describe WebCrawler::View::Json do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+  it "should render input array to json string" do
+    described_class.new(input, headers: [:title, :url, :author]).render.should == '{"responses":[[1,2,"3"],["string","other string\n"]]}'
+  end
+
+  it "should render input hash to json string" do
+    json = described_class.new(input_hash).render
+    json.should == "{\"responses\":[{\"title\":1,\"url\":2,\"author\":3},{\"title\":\"string\",\"url\":\"other string\\n\",\"author\":null}]}"
+    hash = JSON.parse(json).symbolize_keys
+    hash[:responses].each(&:symbolize_keys!)
+    hash.should == { responses: input_hash }
+  end
+end
+
+
+describe WebCrawler::View::Xml do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+  it "should render input array to xml string" do
+    xml = "<responses>" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>" <<
+          "</responses>"
+    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+  end
+
+  it "should render input array to pretty xml string" do
+    xml = "<responses>\n" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+          "</responses>"
+    described_class.new(input, headers: [:title, :url, :author], pretty: true).render.should == xml
+  end
+
+  it "should render input array without :headers to xml string" do
+    xml = "<responses>\n" <<
+          "<response><field_1>1</field_1><field_2>2</field_2><field_3>3</field_3></response>\n" <<
+          "<response><field_1>string</field_1><field_2>other string\n</field_2><field_3></field_3></response>\n" <<
+          "</responses>"
+    described_class.new(input, pretty: true).render.should == xml
+  end
+
+  it "should render input hash to xml string" do
+    xml = "<responses>\n" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+          "</responses>"
+    described_class.new(input_hash, pretty: true).render.should == xml
+  end
+end
+
+describe WebCrawler::View do
+
+  it "should factory a view from view type" do
+    WebCrawler::View.factory('json', [1, 2, 3]).should be_a WebCrawler::View::Json
+    WebCrawler::View.factory('xml', [1, 2, 3]).should be_a WebCrawler::View::Xml
+    WebCrawler::View.factory('table', [1, 2, 3]).should be_a WebCrawler::View::Table
+  end
+
+  it "should draw view to custom output" do
+    output = ""
+    io = StringIO.new(output)
+    WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
+    output.should == "{\"responses\":[[1,2,3]]}\n"
+  end
+end
data/web_crawler.gemspec
ADDED
@@ -0,0 +1,30 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "web_crawler/version"
+
+Gem::Specification.new do |s|
+  s.name        = "web_crawler"
+  s.version     = WebCrawler::VERSION::STRING
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Anton Sozontov"]
+  s.email       = ["a.sozontov@gmail.com"]
+  s.homepage    = ""
+  s.summary     = %q{Web crawler help you with parse and collect data from the web}
+  s.description = %q{Web crawler help you with parse and collect data from the web}
+
+  s.rubyforge_project = "web_crawler"
+
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.has_rdoc = false
+
+  s.bindir = "bin"
+
+  s.add_dependency 'thor'
+
+  s.add_development_dependency(%q<rspec>, [">=2.6"])
+  s.add_development_dependency(%q<fakeweb>)
+end
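To pull this release into a project, the usual Bundler declaration would presumably be:

    # Gemfile entry for the version shown in this diff.
    gem 'web_crawler', '~> 0.2.0'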