web_crawler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/.rspec +1 -0
- data/Gemfile +11 -0
- data/README +1 -0
- data/Rakefile +2 -0
- data/bin/wcrawler +13 -0
- data/lib/ext/array.rb +100 -0
- data/lib/ext/hash.rb +45 -0
- data/lib/ext/http_response.rb +19 -0
- data/lib/web_crawler/application.rb +49 -0
- data/lib/web_crawler/batch_request.rb +63 -0
- data/lib/web_crawler/cache_adapter/base.rb +33 -0
- data/lib/web_crawler/cache_adapter/file.rb +52 -0
- data/lib/web_crawler/cache_adapter/memory.rb +23 -0
- data/lib/web_crawler/cache_adapter.rb +11 -0
- data/lib/web_crawler/cached_request.rb +30 -0
- data/lib/web_crawler/cli/thor_hooks.rb +94 -0
- data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
- data/lib/web_crawler/cli.rb +122 -0
- data/lib/web_crawler/configuration.rb +87 -0
- data/lib/web_crawler/factory_url.rb +58 -0
- data/lib/web_crawler/follower.rb +26 -0
- data/lib/web_crawler/handler.rb +45 -0
- data/lib/web_crawler/parsers/url.rb +52 -0
- data/lib/web_crawler/parsers.rb +5 -0
- data/lib/web_crawler/request.rb +59 -0
- data/lib/web_crawler/response.rb +45 -0
- data/lib/web_crawler/utility.rb +65 -0
- data/lib/web_crawler/version.rb +9 -0
- data/lib/web_crawler/view/csv.rb +20 -0
- data/lib/web_crawler/view/json.rb +9 -0
- data/lib/web_crawler/view/plain.rb +9 -0
- data/lib/web_crawler/view/runner.rb +20 -0
- data/lib/web_crawler/view/table.rb +69 -0
- data/lib/web_crawler/view/xml.rb +38 -0
- data/lib/web_crawler/view.rb +44 -0
- data/lib/web_crawler.rb +38 -0
- data/spec/fake_web_generator.rb +44 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/web_crawler/batch_request_spec.rb +45 -0
- data/spec/web_crawler/cached_request_spec.rb +31 -0
- data/spec/web_crawler/factory_url_spec.rb +34 -0
- data/spec/web_crawler/follow_spec.rb +32 -0
- data/spec/web_crawler/request_spec.rb +29 -0
- data/spec/web_crawler/response_spec.rb +27 -0
- data/spec/web_crawler/url_parser_spec.rb +41 -0
- data/spec/web_crawler/view_spec.rb +95 -0
- data/web_crawler.gemspec +30 -0
- metadata +151 -0
data/lib/web_crawler/view/table.rb
ADDED
@@ -0,0 +1,69 @@
+require 'csv'
+
+module WebCrawler::View
+  # Render a table.
+  #
+  # ==== Parameters
+  # Array[Array[String, String, ...]]
+  #
+  # ==== Options
+  # ident<Integer>:: Indent the first column by ident value.
+  # colwidth<Integer>:: Force the first column to colwidth spaces wide.
+  #
+  class Table < Base
+
+    def render
+      format_table(@input)
+    end
+
+    protected
+
+    def format_table(table)
+      return if table.empty?
+
+      formats, ident, colwidth = [], @options[:ident].to_i, @options[:colwidth]
+      @options[:truncate] = terminal_width if @options[:truncate] == true
+
+      formats << "%-#{colwidth + 2}s" if colwidth
+      start = colwidth ? 1 : 0
+
+      start.upto(table.first.length - 2) do |i|
+        maxima ||= table.max { |a, b| a[i].size <=> b[i].size }[i].size
+        formats << "%-#{maxima + 2}s"
+      end
+
+      formats[0] = formats[0].insert(0, " " * ident)
+      formats << "%s"
+
+      table.map do |row|
+        sentence = ""
+
+        row.each_with_index do |column, i|
+          sentence << formats[i] % column.to_s
+        end
+
+        sentence = truncate(sentence, @options[:truncate]) if @options[:truncate]
+        sentence
+      end.join "\n"
+    end
+
+    def terminal_width
+      if ENV['THOR_COLUMNS']
+        result = ENV['THOR_COLUMNS'].to_i
+      else
+        result = unix? ? dynamic_width : 80
+      end
+      (result < 10) ? 80 : result
+    rescue
+      80
+    end
+
+    def truncate(string, width)
+      if string.length <= width
+        string
+      else
+        (string[0, width-3] || "") + "..."
+      end
+    end
+  end
+end
data/lib/web_crawler/view/xml.rb
ADDED
@@ -0,0 +1,38 @@
+module WebCrawler::View
+  class Xml < Base
+
+    self.default_options = { pretty: true }
+
+    def render
+      @options[:headers] ||= input.max_by(&:size).each_with_index.map { |_, index| "field_#{index+1}" }
+      "<responses>#{pretty}#{super}</responses>"
+    end
+
+    def format(item)
+      response_tag item.is_a?(Hash) ? item : Hash[@options[:headers].zip item]
+    end
+
+    protected
+
+    def response_tag(hash)
+      tag(:response) do
+        hash.map do |tag, value|
+          "<#{tag}>#{value}</#{tag}>"
+        end.join
+      end + pretty
+    end
+
+    def pretty
+      @options[:pretty] ? "\n" : ""
+    end
+
+    def tag(name, value="", &block)
+      value << block.call if block_given?
+      unless value.empty?
+        "<#{name}>#{value}</#{name}>"
+      else
+        "<#{name}/>"
+      end
+    end
+  end
+end
data/lib/web_crawler/view.rb
ADDED
@@ -0,0 +1,44 @@
+module WebCrawler::View
+
+  autoload :Csv, 'web_crawler/view/csv'
+  autoload :Json, 'web_crawler/view/json'
+  autoload :Xml, 'web_crawler/view/xml'
+  autoload :Plain, 'web_crawler/view/plain'
+  autoload :Table, 'web_crawler/view/table'
+  autoload :Runner, 'web_crawler/view/runner'
+
+  extend self
+
+  def factory(type, *args, &block)
+    const_get(WebCrawler::Utility.camelize(type).to_sym).new(*args, &block)
+  end
+
+  class Base
+    attr_reader :input
+
+    class << self
+      attr_accessor :default_options
+      def default_options
+        @default_options ||= { }
+      end
+    end
+
+    def initialize(input, options = { })
+      @options = self.class.default_options.merge (options || { })
+      @input = input
+    end
+
+    def render
+      [*input].map { |i| format(i) }.join
+    end
+
+    def draw(output=$stdout)
+      output.puts render
+    end
+
+    def format(item)
+      item
+    end
+  end
+
+end
data/lib/web_crawler.rb
ADDED
@@ -0,0 +1,38 @@
+require "net/http"
+require "net/https"
+require 'uri'
+require 'forwardable'
+
+require "ext/hash"
+require "ext/array"
+require "ext/http_response"
+
+module WebCrawler
+  autoload :Request, 'web_crawler/request'
+  autoload :CachedRequest, 'web_crawler/cached_request'
+  autoload :Response, 'web_crawler/response'
+  autoload :BatchRequest, 'web_crawler/batch_request'
+  autoload :Handler, 'web_crawler/handler'
+  autoload :HandlerParser, 'web_crawler/handler'
+  autoload :CacheAdapter, 'web_crawler/cache_adapter'
+  autoload :Configurable, 'web_crawler/configuration'
+  autoload :Configuration, 'web_crawler/configuration'
+
+  autoload :FactoryUrl, 'web_crawler/factory_url'
+  autoload :Follower, 'web_crawler/follower'
+  autoload :Parsers, 'web_crawler/parsers'
+  autoload :Utility, 'web_crawler/utility'
+
+  autoload :View, 'web_crawler/view'
+  autoload :CLI, 'web_crawler/cli'
+  autoload :Application, 'web_crawler/application'
+
+  include Configurable
+  extend Utility
+
+  def self.logger
+    config.logger
+  end
+
+end
+
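The top-level file wires the gem together: it autoloads the public classes, mixes in Configurable (which provides WebCrawler.configure and config), and exposes a shared logger. A short usage sketch modeled on spec/spec_helper.rb and batch_request_spec.rb below; the URLs are placeholders, and live requests of course need network access:

require 'web_crawler'

# Same configuration style as spec/spec_helper.rb: keep responses in memory.
WebCrawler.configure do
  config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
end

# BatchRequest#process returns an array of WebCrawler::Response objects.
responses = WebCrawler::BatchRequest.new(['example.com/1', 'example.com/2']).process
responses.each { |r| puts "#{r.url} #{r.code}" }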
data/spec/fake_web_generator.rb
ADDED
@@ -0,0 +1,44 @@
+module FakeWebGenerator
+
+  def self.included(base)
+    generate_web(['http://otherhost.ru/1',
+                  'http://otherhost.ru/2',
+                  'http://otherhost.ru/3',
+                  'http://example.com/1',
+                  'http://example.com/2',
+                  'http://example.com/3',
+                  'http://example.com/2323.html',
+                  'http://example.com/2323.html?rr=1',
+                  'http://example.com/follower?rr=1'])
+
+    FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+  end
+
+  def generate_web(urls)
+    @@known_web_urls ||= []
+    @@known_web_urls << urls
+    @@known_web_urls.flatten!
+    @@known_web_urls.uniq!
+
+    urls.each do |url|
+      FakeWeb.register_uri(:get, url, :body => "Example body for url #{url}")
+    end
+  end
+  module_function :generate_web
+
+  def follower_body
+    "Example body for http://example.com/follower" <<
+      @@known_web_urls.map { |u| "<a href='#{u}'>link text</a>" }.join("\n")
+  end
+  module_function :follower_body
+
+  def urls_board_path
+    'http://example.com/follower'
+  end
+  module_function :urls_board_path
+
+  def known_urls
+    @@known_web_urls
+  end
+
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
+$:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "/../lib")))
+
+require 'rspec'
+require "web_crawler"
+require "fake_web"
+
+require 'fake_web_generator'
+
+RSpec.configure do |c|
+  c.mock_with :rspec
+  c.include FakeWebGenerator
+end
+
+WebCrawler.configure do
+  config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+end
+
data/spec/web_crawler/batch_request_spec.rb
ADDED
@@ -0,0 +1,45 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+describe WebCrawler::BatchRequest do
+
+  let(:urls) { ['example.com', 'example.com/1', 'example.com/2'] }
+  let(:http_response) { Net::HTTPResponse.new('', '', '') }
+  let(:responses) { urls.map { |url| WebCrawler::Response.new(url, http_response) } }
+
+  def response(url)
+    WebCrawler::Response.new(url, http_response)
+  end
+
+  def request(url)
+    WebCrawler::Request.new(url).stub(:process).and_return(response(url))
+  end
+
+  subject { described_class.new(urls) }
+
+  it "should initialize batch of requests for given urls" do
+    subject.requests.should be_a Array
+    subject.requests.should have(3).members
+    subject.requests.all? { |r| r.is_a? WebCrawler::Request }.should be_true
+  end
+
+  it "should process requests" do
+    subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
+    subject.process.should be_a Array
+    subject.process.first.should be_a WebCrawler::Response
+  end
+
+  it "should accept :parser option with parser class or object" do
+    class ::TestParser
+      def parse(resp)
+        resp.to_s + ' parsed'
+      end
+    end
+    described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
+                                                                         "Example body1 parsed",
+                                                                         "Example body for url http://example.com/2 parsed"]
+  end
+end
data/spec/web_crawler/cached_request_spec.rb
ADDED
@@ -0,0 +1,31 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+FakeWeb.allow_net_connect = false
+
+describe 'Cached requests' do
+
+  let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }
+
+  it 'should not send requests to the web if cache exists' do
+    FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/1"
+
+    FakeWeb.should_receive(:response_for).with(:get, "http://example.com/1").and_return { first_response }
+
+    lambda {
+      WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
+    }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+
+    FakeWeb.should_not_receive(:response_for)
+
+    WebCrawler::config.cache_adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/1"), first_response))
+
+    cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
+    WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+  end
+
+end
data/spec/web_crawler/factory_url_spec.rb
ADDED
@@ -0,0 +1,34 @@
+require "spec_helper"
+
+describe WebCrawler::FactoryUrl do
+
+  it "should generate urls with block" do
+    first_param = [1,2,3]
+    second_param = 10...15
+
+    factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
+      random = rand(3000)
+      "www.example.com/%s/%s.html?rid=#{random}" % args
+    end
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.size.should == 15
+    urls.should have(factory.params.size).urls
+    urls.first.should =~ /www\.example\.com\/1\/10\.html/
+  end
+
+  it "should generate urls with pattern" do
+    first_param = [1,2,3]
+    second_param = 10...15
+
+    factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.size.should == 15
+    urls.should have(factory.params.size).urls
+    urls.first.should == "www.example.com/1/10.html"
+  end
+
+end
data/spec/web_crawler/follow_spec.rb
ADDED
@@ -0,0 +1,32 @@
+require "spec_helper"
+
+
+describe WebCrawler::Follower do
+
+  it "should collect all uniques urls from responses" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses).collect
+
+    urls.first.should have(9).urls
+    urls.first.should == known_urls
+  end
+
+  it "should collect all the unique url with same host like in responses" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses, same_host: true).collect
+
+    urls.first.should have(6).urls
+    urls.first.should == known_urls.reject { |u| u =~ /otherhost/ }
+  end
+
+  it "should process requests for following urls" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    follower = WebCrawler::Follower.new responses
+    responses += follower.process
+
+    responses.should have(10).responses
+    responses.first.should be_a WebCrawler::Response
+    responses.first.url.to_s.should == urls_board_path
+    responses.last.url.to_s.should == known_urls.last
+  end
+end
data/spec/web_crawler/request_spec.rb
ADDED
@@ -0,0 +1,29 @@
+require "spec_helper"
+
+describe WebCrawler::Request do
+
+  let(:success_url) { 'example.com/success' }
+  let(:failure_url) { 'example.com/failure' }
+
+  before(:each) do
+    @body = "Example body"
+    FakeWeb.register_uri(:get, "http://example.com/success", :body => @body, :status => ["200", "OK"])
+    FakeWeb.register_uri(:get, "http://example.com/failure", :body => @body, :status => ["503", "Internal error"])
+  end
+
+  subject { WebCrawler::Request.new(success_url) }
+
+  it "should fetch the url" do
+    subject.process.should be_a WebCrawler::Response
+    subject.process.body.should be @body
+  end
+
+  it "should be success" do
+    subject.process.should be_success
+  end
+
+  it "should be failure" do
+    WebCrawler::Request.new(failure_url).process.should be_failure
+  end
+
+end
data/spec/web_crawler/response_spec.rb
ADDED
@@ -0,0 +1,27 @@
+require "spec_helper"
+
+FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+
+describe WebCrawler::Response do
+
+  let(:url) { 'example.com' }
+  subject { WebCrawler::Request.new(url).process }
+
+  it "should initialize with url and response" do
+    described_class.new url, Net::HTTPResponse.new('', '', '')
+  end
+
+  it "should respond to HTTPResponse methods" do
+    [:body, :http_version, :code, :message, :msg, :code_type].each do |meth|
+      subject.should respond_to meth
+    end
+  end
+
+  it "#to_s should be String and equal to #body and not equal to #inspect" do
+    subject.to_s.should be_a String
+    subject.to_s.should be subject.body
+    subject.to_s.should_not be subject.inspect
+  end
+
+end
data/spec/web_crawler/url_parser_spec.rb
ADDED
@@ -0,0 +1,41 @@
+require "spec_helper"
+
+describe WebCrawler::Parsers::Url do
+
+  let(:host) { 'example.com' }
+  let(:http_host) { 'http://example.com' }
+  let(:https_host) { 'https://example.com/' }
+  let(:current_page) { '/news/1000.html' }
+
+  it "should add scheme to url" do
+    described_class.new(host).host.to_s.should == 'http://example.com'
+    described_class.new(host, secure: true).host.to_s.should == 'https://example.com'
+  end
+
+  it "should parse scheme from url and set @scheme" do
+    described_class.new(https_host).scheme.should == 'https'
+    described_class.new(host, secure: true).scheme.should == 'https'
+    described_class.new(http_host).scheme.should == 'http'
+    described_class.new(host).scheme.should == 'http'
+  end
+
+  it "should return nil if host not equal to initial host" do
+    described_class.new(host, same_host: true).normalize('example.ru/news?sid=1').should be_nil
+    described_class.new(host, same_host: true).normalize('http://example.ru/news?sid=1').should be_nil
+    described_class.new(host, same_host: true).normalize('https://example.ru/news?sid=1').should be_nil
+  end
+
+  it "should join request_uri to initial host" do
+    described_class.new(https_host).normalize('/news').should == 'https://example.com/news'
+    described_class.new(https_host).normalize('/news?sid=1').should == 'https://example.com/news?sid=1'
+    described_class.new(https_host).normalize('/news?sid=1#anchor').should == 'https://example.com/news?sid=1#anchor'
+  end
+
+  it "should join query string to initial current page" do
+    described_class.new(host, url: current_page).normalize('?sid=1').should == 'http://example.com/news/1000.html?sid=1'
+  end
+
+  it "should join fragment string to initial current page" do
+    described_class.new(host, url: current_page).normalize('#anchor').should == 'http://example.com/news/1000.html#anchor'
+  end
+end
data/spec/web_crawler/view_spec.rb
ADDED
@@ -0,0 +1,95 @@
+require "spec_helper"
+
+
+describe WebCrawler::View::Csv do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n" }] }
+
+  it "should render input array to csv string" do
+    described_class.new(input).render.should == "1,2,3\nstring,\"other string\n\"\n"
+  end
+
+  it "should render input hash to csv string" do
+    described_class.new(input_hash).render.should == "title,url,author\n1,2,3\nstring,\"other string\n\"\n"
+  end
+
+  it "should render input array to csv string with options" do
+    described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+    described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+  end
+
+end
+
+describe WebCrawler::View::Json do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+  it "should render input array to json string" do
+    described_class.new(input, headers: [:title, :url, :author]).render.should == '{"responses":[[1,2,"3"],["string","other string\n"]]}'
+  end
+
+  it "should render input hash to json string" do
+    json = described_class.new(input_hash).render
+    json.should == "{\"responses\":[{\"title\":1,\"url\":2,\"author\":3},{\"title\":\"string\",\"url\":\"other string\\n\",\"author\":null}]}"
+    hash = JSON.parse(json).symbolize_keys
+    hash[:responses].each(&:symbolize_keys!)
+    hash.should == { responses: input_hash }
+  end
+end
+
+
+describe WebCrawler::View::Xml do
+
+  let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+  it "should render input array to xml string" do
+    xml = "<responses>" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>" <<
+          "</responses>"
+    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+  end
+
+  it "should render input array to pretty xml string" do
+    xml = "<responses>\n" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+          "</responses>"
+    described_class.new(input, headers: [:title, :url, :author], pretty: true).render.should == xml
+  end
+
+  it "should render input array without :headers to xml string" do
+    xml = "<responses>\n" <<
+          "<response><field_1>1</field_1><field_2>2</field_2><field_3>3</field_3></response>\n" <<
+          "<response><field_1>string</field_1><field_2>other string\n</field_2><field_3></field_3></response>\n" <<
+          "</responses>"
+    described_class.new(input, pretty: true).render.should == xml
+  end
+
+  it "should render input hash to xml string" do
+    xml = "<responses>\n" <<
+          "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+          "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+          "</responses>"
+    described_class.new(input_hash, pretty: true).render.should == xml
+  end
+end
+
+describe WebCrawler::View do
+
+  it "should factory a view from view type" do
+    WebCrawler::View.factory('json', [1, 2, 3]).should be_a WebCrawler::View::Json
+    WebCrawler::View.factory('xml', [1, 2, 3]).should be_a WebCrawler::View::Xml
+    WebCrawler::View.factory('table', [1, 2, 3]).should be_a WebCrawler::View::Table
+  end
+
+  it "should draw view to custom output" do
+    output = ""
+    io = StringIO.new(output)
+    WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
+    output.should == "{\"responses\":[[1,2,3]]}\n"
+  end
+end
data/web_crawler.gemspec
ADDED
@@ -0,0 +1,30 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "web_crawler/version"
+
+Gem::Specification.new do |s|
+  s.name = "web_crawler"
+  s.version = WebCrawler::VERSION::STRING
+  s.platform = Gem::Platform::RUBY
+  s.authors = ["Anton Sozontov"]
+  s.email = ["a.sozontov@gmail.com"]
+  s.homepage = ""
+  s.summary = %q{Web crawler help you with parse and collect data from the web}
+  s.description = %q{Web crawler help you with parse and collect data from the web}
+
+  s.rubyforge_project = "web_crawler"
+
+  s.files = `git ls-files`.split("\n")
+  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.has_rdoc = false
+
+  s.bindir = "bin"
+
+  s.add_dependency 'thor'
+
+  s.add_development_dependency(%q<rspec>, [">=2.6"])
+  s.add_development_dependency(%q<fakeweb>)
+end