web_crawler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/.gitignore +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +11 -0
  4. data/README +1 -0
  5. data/Rakefile +2 -0
  6. data/bin/wcrawler +13 -0
  7. data/lib/ext/array.rb +100 -0
  8. data/lib/ext/hash.rb +45 -0
  9. data/lib/ext/http_response.rb +19 -0
  10. data/lib/web_crawler/application.rb +49 -0
  11. data/lib/web_crawler/batch_request.rb +63 -0
  12. data/lib/web_crawler/cache_adapter/base.rb +33 -0
  13. data/lib/web_crawler/cache_adapter/file.rb +52 -0
  14. data/lib/web_crawler/cache_adapter/memory.rb +23 -0
  15. data/lib/web_crawler/cache_adapter.rb +11 -0
  16. data/lib/web_crawler/cached_request.rb +30 -0
  17. data/lib/web_crawler/cli/thor_hooks.rb +94 -0
  18. data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
  19. data/lib/web_crawler/cli.rb +122 -0
  20. data/lib/web_crawler/configuration.rb +87 -0
  21. data/lib/web_crawler/factory_url.rb +58 -0
  22. data/lib/web_crawler/follower.rb +26 -0
  23. data/lib/web_crawler/handler.rb +45 -0
  24. data/lib/web_crawler/parsers/url.rb +52 -0
  25. data/lib/web_crawler/parsers.rb +5 -0
  26. data/lib/web_crawler/request.rb +59 -0
  27. data/lib/web_crawler/response.rb +45 -0
  28. data/lib/web_crawler/utility.rb +65 -0
  29. data/lib/web_crawler/version.rb +9 -0
  30. data/lib/web_crawler/view/csv.rb +20 -0
  31. data/lib/web_crawler/view/json.rb +9 -0
  32. data/lib/web_crawler/view/plain.rb +9 -0
  33. data/lib/web_crawler/view/runner.rb +20 -0
  34. data/lib/web_crawler/view/table.rb +69 -0
  35. data/lib/web_crawler/view/xml.rb +38 -0
  36. data/lib/web_crawler/view.rb +44 -0
  37. data/lib/web_crawler.rb +38 -0
  38. data/spec/fake_web_generator.rb +44 -0
  39. data/spec/spec_helper.rb +17 -0
  40. data/spec/web_crawler/batch_request_spec.rb +45 -0
  41. data/spec/web_crawler/cached_request_spec.rb +31 -0
  42. data/spec/web_crawler/factory_url_spec.rb +34 -0
  43. data/spec/web_crawler/follow_spec.rb +32 -0
  44. data/spec/web_crawler/request_spec.rb +29 -0
  45. data/spec/web_crawler/response_spec.rb +27 -0
  46. data/spec/web_crawler/url_parser_spec.rb +41 -0
  47. data/spec/web_crawler/view_spec.rb +95 -0
  48. data/web_crawler.gemspec +30 -0
  49. metadata +151 -0
data/lib/web_crawler/view/table.rb
@@ -0,0 +1,69 @@
+ require 'csv'
+
+ module WebCrawler::View
+   # Render a table.
+   #
+   # ==== Parameters
+   # Array[Array[String, String, ...]]
+   #
+   # ==== Options
+   # ident<Integer>:: Indent the first column by ident value.
+   # colwidth<Integer>:: Force the first column to colwidth spaces wide.
+   #
+   class Table < Base
+
+     def render
+       format_table(@input)
+     end
+
+     protected
+
+     def format_table(table)
+       return if table.empty?
+
+       formats, ident, colwidth = [], @options[:ident].to_i, @options[:colwidth]
+       @options[:truncate] = terminal_width if @options[:truncate] == true
+
+       formats << "%-#{colwidth + 2}s" if colwidth
+       start = colwidth ? 1 : 0
+
+       start.upto(table.first.length - 2) do |i|
+         maxima ||= table.max { |a, b| a[i].size <=> b[i].size }[i].size
+         formats << "%-#{maxima + 2}s"
+       end
+
+       formats[0] = formats[0].insert(0, " " * ident)
+       formats << "%s"
+
+       table.map do |row|
+         sentence = ""
+
+         row.each_with_index do |column, i|
+           sentence << formats[i] % column.to_s
+         end
+
+         sentence = truncate(sentence, @options[:truncate]) if @options[:truncate]
+         sentence
+       end.join "\n"
+     end
+
+     def terminal_width
+       if ENV['THOR_COLUMNS']
+         result = ENV['THOR_COLUMNS'].to_i
+       else
+         result = unix? ? dynamic_width : 80
+       end
+       (result < 10) ? 80 : result
+     rescue
+       80
+     end
+
+     def truncate(string, width)
+       if string.length <= width
+         string
+       else
+         (string[0, width-3] || "") + "..."
+       end
+     end
+   end
+ end
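
A minimal usage sketch for the Table view above (the rows data is a made-up placeholder; Base#initialize and #render come from data/lib/web_crawler/view.rb, shown later in this diff):

    require 'web_crawler'

    rows  = [["url", "code"], ["http://example.com/1", "200"], ["http://example.com/2323.html", "404"]]
    table = WebCrawler::View::Table.new(rows, ident: 2)
    table.draw   # pads each column to its widest value and indents the first column by two spaces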
data/lib/web_crawler/view/xml.rb
@@ -0,0 +1,38 @@
+ module WebCrawler::View
+   class Xml < Base
+
+     self.default_options = { pretty: true }
+
+     def render
+       @options[:headers] ||= input.max_by(&:size).each_with_index.map { |_, index| "field_#{index+1}" }
+       "<responses>#{pretty}#{super}</responses>"
+     end
+
+     def format(item)
+       response_tag item.is_a?(Hash) ? item : Hash[@options[:headers].zip item]
+     end
+
+     protected
+
+     def response_tag(hash)
+       tag(:response) do
+         hash.map do |tag, value|
+           "<#{tag}>#{value}</#{tag}>"
+         end.join
+       end + pretty
+     end
+
+     def pretty
+       @options[:pretty] ? "\n" : ""
+     end
+
+     def tag(name, value="", &block)
+       value << block.call if block_given?
+       unless value.empty?
+         "<#{name}>#{value}</#{name}>"
+       else
+         "<#{name}/>"
+       end
+     end
+   end
+ end
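
When no :headers option is passed, Xml#render above derives field_1..field_N names from the widest input row; a sketch of that behaviour, mirroring the "without :headers" expectation in view_spec.rb further down:

    require 'web_crawler'

    xml = WebCrawler::View::Xml.new([[1, 2, "3"]], pretty: false).render
    # => "<responses><response><field_1>1</field_1><field_2>2</field_2><field_3>3</field_3></response></responses>"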
data/lib/web_crawler/view.rb
@@ -0,0 +1,44 @@
+ module WebCrawler::View
+
+   autoload :Csv, 'web_crawler/view/csv'
+   autoload :Json, 'web_crawler/view/json'
+   autoload :Xml, 'web_crawler/view/xml'
+   autoload :Plain, 'web_crawler/view/plain'
+   autoload :Table, 'web_crawler/view/table'
+   autoload :Runner, 'web_crawler/view/runner'
+
+   extend self
+
+   def factory(type, *args, &block)
+     const_get(WebCrawler::Utility.camelize(type).to_sym).new(*args, &block)
+   end
+
+   class Base
+     attr_reader :input
+
+     class << self
+       attr_accessor :default_options
+       def default_options
+         @default_options ||= { }
+       end
+     end
+
+     def initialize(input, options = { })
+       @options = self.class.default_options.merge (options || { })
+       @input = input
+     end
+
+     def render
+       [*input].map { |i| format(i) }.join
+     end
+
+     def draw(output=$stdout)
+       output.puts render
+     end
+
+     def format(item)
+       item
+     end
+   end
+
+ end
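
The factory/Base pair above is the package's rendering entry point; a hedged sketch of the API, matching the expectations in view_spec.rb at the end of this diff:

    require 'web_crawler'
    require 'stringio'

    view = WebCrawler::View.factory('json', [[1, 2, 3]])   # => a WebCrawler::View::Json instance
    view.draw                                              # prints {"responses":[[1,2,3]]} to $stdout
    view.draw(StringIO.new)                                # any object responding to #puts can be the output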
data/lib/web_crawler.rb
@@ -0,0 +1,38 @@
+ require "net/http"
+ require "net/https"
+ require 'uri'
+ require 'forwardable'
+
+ require "ext/hash"
+ require "ext/array"
+ require "ext/http_response"
+
+ module WebCrawler
+   autoload :Request, 'web_crawler/request'
+   autoload :CachedRequest, 'web_crawler/cached_request'
+   autoload :Response, 'web_crawler/response'
+   autoload :BatchRequest, 'web_crawler/batch_request'
+   autoload :Handler, 'web_crawler/handler'
+   autoload :HandlerParser, 'web_crawler/handler'
+   autoload :CacheAdapter, 'web_crawler/cache_adapter'
+   autoload :Configurable, 'web_crawler/configuration'
+   autoload :Configuration, 'web_crawler/configuration'
+
+   autoload :FactoryUrl, 'web_crawler/factory_url'
+   autoload :Follower, 'web_crawler/follower'
+   autoload :Parsers, 'web_crawler/parsers'
+   autoload :Utility, 'web_crawler/utility'
+
+   autoload :View, 'web_crawler/view'
+   autoload :CLI, 'web_crawler/cli'
+   autoload :Application, 'web_crawler/application'
+
+   include Configurable
+   extend Utility
+
+   def self.logger
+     config.logger
+   end
+
+ end
+
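
Taken together with spec_helper.rb and batch_request_spec.rb below, the top-level flow looks roughly like this (the URLs are placeholders, and the cache adapter choice is simply the one the specs use):

    require 'web_crawler'

    WebCrawler.configure do
      config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
    end

    responses = WebCrawler::BatchRequest.new(['example.com/1', 'example.com/2']).process
    responses.each { |r| puts r.body if r.success? }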
data/spec/fake_web_generator.rb
@@ -0,0 +1,44 @@
+ module FakeWebGenerator
+
+   def self.included(base)
+     generate_web(['http://otherhost.ru/1',
+                   'http://otherhost.ru/2',
+                   'http://otherhost.ru/3',
+                   'http://example.com/1',
+                   'http://example.com/2',
+                   'http://example.com/3',
+                   'http://example.com/2323.html',
+                   'http://example.com/2323.html?rr=1',
+                   'http://example.com/follower?rr=1'])
+
+     FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+   end
+
+   def generate_web(urls)
+     @@known_web_urls ||= []
+     @@known_web_urls << urls
+     @@known_web_urls.flatten!
+     @@known_web_urls.uniq!
+
+     urls.each do |url|
+       FakeWeb.register_uri(:get, url, :body => "Example body for url #{url}")
+     end
+   end
+   module_function :generate_web
+
+   def follower_body
+     "Example body for http://example.com/follower" <<
+       @@known_web_urls.map { |u| "<a href='#{u}'>link text</a>" }.join("\n")
+   end
+   module_function :follower_body
+
+   def urls_board_path
+     'http://example.com/follower'
+   end
+   module_function :urls_board_path
+
+   def known_urls
+     @@known_web_urls
+   end
+
+ end
data/spec/spec_helper.rb
@@ -0,0 +1,17 @@
+ $:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "/../lib")))
+
+ require 'rspec'
+ require "web_crawler"
+ require "fake_web"
+
+ require 'fake_web_generator'
+
+ RSpec.configure do |c|
+   c.mock_with :rspec
+   c.include FakeWebGenerator
+ end
+
+ WebCrawler.configure do
+   config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+ end
+
data/spec/web_crawler/batch_request_spec.rb
@@ -0,0 +1,45 @@
+ require "spec_helper"
+
+ FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+ FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+ FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+ describe WebCrawler::BatchRequest do
+
+   let(:urls) { ['example.com', 'example.com/1', 'example.com/2'] }
+   let(:http_response) { Net::HTTPResponse.new('', '', '') }
+   let(:responses) { urls.map { |url| WebCrawler::Response.new(url, http_response) } }
+
+   def response(url)
+     WebCrawler::Response.new(url, http_response)
+   end
+
+   def request(url)
+     WebCrawler::Request.new(url).stub(:process).and_return(response(url))
+   end
+
+   subject { described_class.new(urls) }
+
+   it "should initialize batch of requests for given urls" do
+     subject.requests.should be_a Array
+     subject.requests.should have(3).members
+     subject.requests.all? { |r| r.is_a? WebCrawler::Request }.should be_true
+   end
+
+   it "should process requests" do
+     subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
+     subject.process.should be_a Array
+     subject.process.first.should be_a WebCrawler::Response
+   end
+
+   it "should accept :parser option with parser class or object" do
+     class ::TestParser
+       def parse(resp)
+         resp.to_s + ' parsed'
+       end
+     end
+     described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
+                                                                          "Example body1 parsed",
+                                                                          "Example body for url http://example.com/2 parsed"]
+   end
+ end
data/spec/web_crawler/cached_request_spec.rb
@@ -0,0 +1,31 @@
+ require "spec_helper"
+
+ FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+ FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+ FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+ FakeWeb.allow_net_connect = false
+
+ describe 'Cached requests' do
+
+   let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }
+
+   it 'should not send requests to the web if cache exists' do
+     FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+     first_response = FakeWeb.response_for :get, "http://example.com/1"
+
+     FakeWeb.should_receive(:response_for).with(:get, "http://example.com/1").and_return { first_response }
+
+     lambda {
+       WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
+     }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+
+     FakeWeb.should_not_receive(:response_for)
+
+     WebCrawler::config.cache_adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/1"), first_response))
+
+     cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
+     WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+   end
+
+ end
data/spec/web_crawler/factory_url_spec.rb
@@ -0,0 +1,34 @@
+ require "spec_helper"
+
+ describe WebCrawler::FactoryUrl do
+
+   it "should generate urls with block" do
+     first_param = [1,2,3]
+     second_param = 10...15
+
+     factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
+       random = rand(3000)
+       "www.example.com/%s/%s.html?rid=#{random}" % args
+     end
+     urls = factory.factory
+
+     urls.should be_a Array
+     factory.params.size.should == 15
+     urls.should have(factory.params.size).urls
+     urls.first.should =~ /www\.example\.com\/1\/10\.html/
+   end
+
+   it "should generate urls with pattern" do
+     first_param = [1,2,3]
+     second_param = 10...15
+
+     factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
+     urls = factory.factory
+
+     urls.should be_a Array
+     factory.params.size.should == 15
+     urls.should have(factory.params.size).urls
+     urls.first.should == "www.example.com/1/10.html"
+   end
+
+ end
data/spec/web_crawler/follow_spec.rb
@@ -0,0 +1,32 @@
+ require "spec_helper"
+
+
+ describe WebCrawler::Follower do
+
+   it "should collect all uniques urls from responses" do
+     responses = WebCrawler::BatchRequest.new(urls_board_path).process
+     urls = WebCrawler::Follower.new(responses).collect
+
+     urls.first.should have(9).urls
+     urls.first.should == known_urls
+   end
+
+   it "should collect all the unique url with same host like in responses" do
+     responses = WebCrawler::BatchRequest.new(urls_board_path).process
+     urls = WebCrawler::Follower.new(responses, same_host: true).collect
+
+     urls.first.should have(6).urls
+     urls.first.should == known_urls.reject { |u| u =~ /otherhost/ }
+   end
+
+   it "should process requests for following urls" do
+     responses = WebCrawler::BatchRequest.new(urls_board_path).process
+     follower = WebCrawler::Follower.new responses
+     responses += follower.process
+
+     responses.should have(10).responses
+     responses.first.should be_a WebCrawler::Response
+     responses.first.url.to_s.should == urls_board_path
+     responses.last.url.to_s.should == known_urls.last
+   end
+ end
data/spec/web_crawler/request_spec.rb
@@ -0,0 +1,29 @@
+ require "spec_helper"
+
+ describe WebCrawler::Request do
+
+   let(:success_url) { 'example.com/success' }
+   let(:failure_url) { 'example.com/failure' }
+
+   before(:each) do
+     @body = "Example body"
+     FakeWeb.register_uri(:get, "http://example.com/success", :body => @body, :status => ["200", "OK"])
+     FakeWeb.register_uri(:get, "http://example.com/failure", :body => @body, :status => ["503", "Internal error"])
+   end
+
+   subject { WebCrawler::Request.new(success_url) }
+
+   it "should fetch the url" do
+     subject.process.should be_a WebCrawler::Response
+     subject.process.body.should be @body
+   end
+
+   it "should be success" do
+     subject.process.should be_success
+   end
+
+   it "should be failure" do
+     WebCrawler::Request.new(failure_url).process.should be_failure
+   end
+
+ end
data/spec/web_crawler/response_spec.rb
@@ -0,0 +1,27 @@
+ require "spec_helper"
+
+ FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+
+ describe WebCrawler::Response do
+
+   let(:url) { 'example.com' }
+   subject { WebCrawler::Request.new(url).process }
+
+   it "should initialize with url and response" do
+     described_class.new url, Net::HTTPResponse.new('', '', '')
+   end
+
+   it "should respond to HTTPResponse methods" do
+     [:body, :http_version, :code, :message, :msg, :code_type].each do |meth|
+       subject.should respond_to meth
+     end
+   end
+
+   it "#to_s should be String and equal to #body and not equal to #inspect" do
+     subject.to_s.should be_a String
+     subject.to_s.should be subject.body
+     subject.to_s.should_not be subject.inspect
+   end
+
+ end
data/spec/web_crawler/url_parser_spec.rb
@@ -0,0 +1,41 @@
+ require "spec_helper"
+
+ describe WebCrawler::Parsers::Url do
+
+   let(:host) { 'example.com' }
+   let(:http_host) { 'http://example.com' }
+   let(:https_host) { 'https://example.com/' }
+   let(:current_page) { '/news/1000.html' }
+
+   it "should add scheme to url" do
+     described_class.new(host).host.to_s.should == 'http://example.com'
+     described_class.new(host, secure: true).host.to_s.should == 'https://example.com'
+   end
+
+   it "should parse scheme from url and set @scheme" do
+     described_class.new(https_host).scheme.should == 'https'
+     described_class.new(host, secure: true).scheme.should == 'https'
+     described_class.new(http_host).scheme.should == 'http'
+     described_class.new(host).scheme.should == 'http'
+   end
+
+   it "should return nil if host not equal to initial host" do
+     described_class.new(host, same_host: true).normalize('example.ru/news?sid=1').should be_nil
+     described_class.new(host, same_host: true).normalize('http://example.ru/news?sid=1').should be_nil
+     described_class.new(host, same_host: true).normalize('https://example.ru/news?sid=1').should be_nil
+   end
+
+   it "should join request_uri to initial host" do
+     described_class.new(https_host).normalize('/news').should == 'https://example.com/news'
+     described_class.new(https_host).normalize('/news?sid=1').should == 'https://example.com/news?sid=1'
+     described_class.new(https_host).normalize('/news?sid=1#anchor').should == 'https://example.com/news?sid=1#anchor'
+   end
+
+   it "should join query string to initial current page" do
+     described_class.new(host, url: current_page).normalize('?sid=1').should == 'http://example.com/news/1000.html?sid=1'
+   end
+
+   it "should join fragment string to initial current page" do
+     described_class.new(host, url: current_page).normalize('#anchor').should == 'http://example.com/news/1000.html#anchor'
+   end
+ end
data/spec/web_crawler/view_spec.rb
@@ -0,0 +1,95 @@
+ require "spec_helper"
+
+
+ describe WebCrawler::View::Csv do
+
+   let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n" }] }
+
+   it "should render input array to csv string" do
+     described_class.new(input).render.should == "1,2,3\nstring,\"other string\n\"\n"
+   end
+
+   it "should render input hash to csv string" do
+     described_class.new(input_hash).render.should == "title,url,author\n1,2,3\nstring,\"other string\n\"\n"
+   end
+
+   it "should render input array to csv string with options" do
+     described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+     described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+   end
+
+ end
+
+ describe WebCrawler::View::Json do
+
+   let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+   it "should render input array to json string" do
+     described_class.new(input, headers: [:title, :url, :author]).render.should == '{"responses":[[1,2,"3"],["string","other string\n"]]}'
+   end
+
+   it "should render input hash to json string" do
+     json = described_class.new(input_hash).render
+     json.should == "{\"responses\":[{\"title\":1,\"url\":2,\"author\":3},{\"title\":\"string\",\"url\":\"other string\\n\",\"author\":null}]}"
+     hash = JSON.parse(json).symbolize_keys
+     hash[:responses].each(&:symbolize_keys!)
+     hash.should == { responses: input_hash }
+   end
+ end
+
+
+ describe WebCrawler::View::Xml do
+
+   let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+   it "should render input array to xml string" do
+     xml = "<responses>" <<
+       "<response><title>1</title><url>2</url><author>3</author></response>" <<
+       "<response><title>string</title><url>other string\n</url><author></author></response>" <<
+       "</responses>"
+     described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+   end
+
+   it "should render input array to pretty xml string" do
+     xml = "<responses>\n" <<
+       "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+       "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+       "</responses>"
+     described_class.new(input, headers: [:title, :url, :author], pretty: true).render.should == xml
+   end
+
+   it "should render input array without :headers to xml string" do
+     xml = "<responses>\n" <<
+       "<response><field_1>1</field_1><field_2>2</field_2><field_3>3</field_3></response>\n" <<
+       "<response><field_1>string</field_1><field_2>other string\n</field_2><field_3></field_3></response>\n" <<
+       "</responses>"
+     described_class.new(input, pretty: true).render.should == xml
+   end
+
+   it "should render input hash to xml string" do
+     xml = "<responses>\n" <<
+       "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+       "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+       "</responses>"
+     described_class.new(input_hash, pretty: true).render.should == xml
+   end
+ end
+
+ describe WebCrawler::View do
+
+   it "should factory a view from view type" do
+     WebCrawler::View.factory('json', [1, 2, 3]).should be_a WebCrawler::View::Json
+     WebCrawler::View.factory('xml', [1, 2, 3]).should be_a WebCrawler::View::Xml
+     WebCrawler::View.factory('table', [1, 2, 3]).should be_a WebCrawler::View::Table
+   end
+
+   it "should draw view to custom output" do
+     output = ""
+     io = StringIO.new(output)
+     WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
+     output.should == "{\"responses\":[[1,2,3]]}\n"
+   end
+ end
data/web_crawler.gemspec
@@ -0,0 +1,30 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "web_crawler/version"
+
+ Gem::Specification.new do |s|
+   s.name = "web_crawler"
+   s.version = WebCrawler::VERSION::STRING
+   s.platform = Gem::Platform::RUBY
+   s.authors = ["Anton Sozontov"]
+   s.email = ["a.sozontov@gmail.com"]
+   s.homepage = ""
+   s.summary = %q{Web crawler help you with parse and collect data from the web}
+   s.description = %q{Web crawler help you with parse and collect data from the web}
+
+   s.rubyforge_project = "web_crawler"
+
+   s.files = `git ls-files`.split("\n")
+   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   s.has_rdoc = false
+
+   s.bindir = "bin"
+
+   s.add_dependency 'thor'
+
+   s.add_development_dependency(%q<rspec>, [">=2.6"])
+   s.add_development_dependency(%q<fakeweb>)
+ end