web_crawler 0.2.0

Files changed (49)
  1. data/.gitignore +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +11 -0
  4. data/README +1 -0
  5. data/Rakefile +2 -0
  6. data/bin/wcrawler +13 -0
  7. data/lib/ext/array.rb +100 -0
  8. data/lib/ext/hash.rb +45 -0
  9. data/lib/ext/http_response.rb +19 -0
  10. data/lib/web_crawler/application.rb +49 -0
  11. data/lib/web_crawler/batch_request.rb +63 -0
  12. data/lib/web_crawler/cache_adapter/base.rb +33 -0
  13. data/lib/web_crawler/cache_adapter/file.rb +52 -0
  14. data/lib/web_crawler/cache_adapter/memory.rb +23 -0
  15. data/lib/web_crawler/cache_adapter.rb +11 -0
  16. data/lib/web_crawler/cached_request.rb +30 -0
  17. data/lib/web_crawler/cli/thor_hooks.rb +94 -0
  18. data/lib/web_crawler/cli/thor_inherited_options.rb +26 -0
  19. data/lib/web_crawler/cli.rb +122 -0
  20. data/lib/web_crawler/configuration.rb +87 -0
  21. data/lib/web_crawler/factory_url.rb +58 -0
  22. data/lib/web_crawler/follower.rb +26 -0
  23. data/lib/web_crawler/handler.rb +45 -0
  24. data/lib/web_crawler/parsers/url.rb +52 -0
  25. data/lib/web_crawler/parsers.rb +5 -0
  26. data/lib/web_crawler/request.rb +59 -0
  27. data/lib/web_crawler/response.rb +45 -0
  28. data/lib/web_crawler/utility.rb +65 -0
  29. data/lib/web_crawler/version.rb +9 -0
  30. data/lib/web_crawler/view/csv.rb +20 -0
  31. data/lib/web_crawler/view/json.rb +9 -0
  32. data/lib/web_crawler/view/plain.rb +9 -0
  33. data/lib/web_crawler/view/runner.rb +20 -0
  34. data/lib/web_crawler/view/table.rb +69 -0
  35. data/lib/web_crawler/view/xml.rb +38 -0
  36. data/lib/web_crawler/view.rb +44 -0
  37. data/lib/web_crawler.rb +38 -0
  38. data/spec/fake_web_generator.rb +44 -0
  39. data/spec/spec_helper.rb +17 -0
  40. data/spec/web_crawler/batch_request_spec.rb +45 -0
  41. data/spec/web_crawler/cached_request_spec.rb +31 -0
  42. data/spec/web_crawler/factory_url_spec.rb +34 -0
  43. data/spec/web_crawler/follow_spec.rb +32 -0
  44. data/spec/web_crawler/request_spec.rb +29 -0
  45. data/spec/web_crawler/response_spec.rb +27 -0
  46. data/spec/web_crawler/url_parser_spec.rb +41 -0
  47. data/spec/web_crawler/view_spec.rb +95 -0
  48. data/web_crawler.gemspec +30 -0
  49. metadata +151 -0
data/lib/web_crawler/view/table.rb
@@ -0,0 +1,69 @@
+ require 'csv'
+
+ module WebCrawler::View
+   # Render a table.
+   #
+   # ==== Parameters
+   # Array[Array[String, String, ...]]
+   #
+   # ==== Options
+   # ident<Integer>:: Indent the first column by ident value.
+   # colwidth<Integer>:: Force the first column to colwidth spaces wide.
+   #
+   class Table < Base
+
+     def render
+       format_table(@input)
+     end
+
+     protected
+
+     def format_table(table)
+       return if table.empty?
+
+       formats, ident, colwidth = [], @options[:ident].to_i, @options[:colwidth]
+       @options[:truncate] = terminal_width if @options[:truncate] == true
+
+       formats << "%-#{colwidth + 2}s" if colwidth
+       start = colwidth ? 1 : 0
+
+       start.upto(table.first.length - 2) do |i|
+         maxima ||= table.max { |a, b| a[i].size <=> b[i].size }[i].size
+         formats << "%-#{maxima + 2}s"
+       end
+
+       formats[0] = formats[0].insert(0, " " * ident)
+       formats << "%s"
+
+       table.map do |row|
+         sentence = ""
+
+         row.each_with_index do |column, i|
+           sentence << formats[i] % column.to_s
+         end
+
+         sentence = truncate(sentence, @options[:truncate]) if @options[:truncate]
+         sentence
+       end.join "\n"
+     end
+
+     def terminal_width
+       if ENV['THOR_COLUMNS']
+         result = ENV['THOR_COLUMNS'].to_i
+       else
+         result = unix? ? dynamic_width : 80
+       end
+       (result < 10) ? 80 : result
+     rescue
+       80
+     end
+
+     def truncate(string, width)
+       if string.length <= width
+         string
+       else
+         (string[0, width-3] || "") + "..."
+       end
+     end
+   end
+ end
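
A minimal usage sketch for this view (not part of the diff; it relies on the Base class defined in data/lib/web_crawler/view.rb below, and the rows are made up for illustration):

    # Align two hypothetical rows into padded columns, indenting the first column by 2 spaces.
    rows = [%w[name web_crawler], %w[version 0.2.0]]
    WebCrawler::View::Table.new(rows, ident: 2).draw
    # writes the aligned rows to $stdout
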
data/lib/web_crawler/view/xml.rb
@@ -0,0 +1,38 @@
+ module WebCrawler::View
+   class Xml < Base
+
+     self.default_options = { pretty: true }
+
+     def render
+       @options[:headers] ||= input.max_by(&:size).each_with_index.map { |_, index| "field_#{index+1}" }
+       "<responses>#{pretty}#{super}</responses>"
+     end
+
+     def format(item)
+       response_tag item.is_a?(Hash) ? item : Hash[@options[:headers].zip item]
+     end
+
+     protected
+
+     def response_tag(hash)
+       tag(:response) do
+         hash.map do |tag, value|
+           "<#{tag}>#{value}</#{tag}>"
+         end.join
+       end + pretty
+     end
+
+     def pretty
+       @options[:pretty] ? "\n" : ""
+     end
+
+     def tag(name, value="", &block)
+       value << block.call if block_given?
+       unless value.empty?
+         "<#{name}>#{value}</#{name}>"
+       else
+         "<#{name}/>"
+       end
+     end
+   end
+ end
data/lib/web_crawler/view.rb
@@ -0,0 +1,44 @@
+ module WebCrawler::View
+
+   autoload :Csv,    'web_crawler/view/csv'
+   autoload :Json,   'web_crawler/view/json'
+   autoload :Xml,    'web_crawler/view/xml'
+   autoload :Plain,  'web_crawler/view/plain'
+   autoload :Table,  'web_crawler/view/table'
+   autoload :Runner, 'web_crawler/view/runner'
+
+   extend self
+
+   def factory(type, *args, &block)
+     const_get(WebCrawler::Utility.camelize(type).to_sym).new(*args, &block)
+   end
+
+   class Base
+     attr_reader :input
+
+     class << self
+       attr_accessor :default_options
+       def default_options
+         @default_options ||= { }
+       end
+     end
+
+     def initialize(input, options = { })
+       @options = self.class.default_options.merge (options || { })
+       @input = input
+     end
+
+     def render
+       [*input].map { |i| format(i) }.join
+     end
+
+     def draw(output=$stdout)
+       output.puts render
+     end
+
+     def format(item)
+       item
+     end
+   end
+
+ end
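
The factory/#draw pair is how the specs below obtain and emit views; a quick sketch (the StringIO capture is only for illustration):

    require 'stringio'

    io = StringIO.new
    # 'json' is camelized to WebCrawler::View::Json and instantiated with the input
    WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
    io.string  # => "{\"responses\":[[1,2,3]]}\n", per spec/web_crawler/view_spec.rb
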
data/lib/web_crawler.rb
@@ -0,0 +1,38 @@
+ require "net/http"
+ require "net/https"
+ require 'uri'
+ require 'forwardable'
+
+ require "ext/hash"
+ require "ext/array"
+ require "ext/http_response"
+
+ module WebCrawler
+   autoload :Request,       'web_crawler/request'
+   autoload :CachedRequest, 'web_crawler/cached_request'
+   autoload :Response,      'web_crawler/response'
+   autoload :BatchRequest,  'web_crawler/batch_request'
+   autoload :Handler,       'web_crawler/handler'
+   autoload :HandlerParser, 'web_crawler/handler'
+   autoload :CacheAdapter,  'web_crawler/cache_adapter'
+   autoload :Configurable,  'web_crawler/configuration'
+   autoload :Configuration, 'web_crawler/configuration'
+
+   autoload :FactoryUrl, 'web_crawler/factory_url'
+   autoload :Follower,   'web_crawler/follower'
+   autoload :Parsers,    'web_crawler/parsers'
+   autoload :Utility,    'web_crawler/utility'
+
+   autoload :View,        'web_crawler/view'
+   autoload :CLI,         'web_crawler/cli'
+   autoload :Application, 'web_crawler/application'
+
+   include Configurable
+   extend Utility
+
+   def self.logger
+     config.logger
+   end
+
+ end
+
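
Configuration goes through the Configurable mixin; a hedged sketch of typical setup, mirroring spec/spec_helper.rb below (the logger assignment is an assumption, since configuration.rb itself is not shown in this diff):

    require 'logger'
    require 'web_crawler'

    WebCrawler.configure do
      # taken verbatim from spec/spec_helper.rb
      config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
      # assumed setter; WebCrawler.logger simply returns config.logger
      config.logger = Logger.new($stdout)
    end
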
data/spec/fake_web_generator.rb
@@ -0,0 +1,44 @@
+ module FakeWebGenerator
+
+   def self.included(base)
+     generate_web(['http://otherhost.ru/1',
+                   'http://otherhost.ru/2',
+                   'http://otherhost.ru/3',
+                   'http://example.com/1',
+                   'http://example.com/2',
+                   'http://example.com/3',
+                   'http://example.com/2323.html',
+                   'http://example.com/2323.html?rr=1',
+                   'http://example.com/follower?rr=1'])
+
+     FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+   end
+
+   def generate_web(urls)
+     @@known_web_urls ||= []
+     @@known_web_urls << urls
+     @@known_web_urls.flatten!
+     @@known_web_urls.uniq!
+
+     urls.each do |url|
+       FakeWeb.register_uri(:get, url, :body => "Example body for url #{url}")
+     end
+   end
+   module_function :generate_web
+
+   def follower_body
+     "Example body for http://example.com/follower" <<
+       @@known_web_urls.map { |u| "<a href='#{u}'>link text</a>" }.join("\n")
+   end
+   module_function :follower_body
+
+   def urls_board_path
+     'http://example.com/follower'
+   end
+   module_function :urls_board_path
+
+   def known_urls
+     @@known_web_urls
+   end
+
+ end
data/spec/spec_helper.rb
@@ -0,0 +1,17 @@
+ $:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "/../lib")))
+
+ require 'rspec'
+ require "web_crawler"
+ require "fake_web"
+
+ require 'fake_web_generator'
+
+ RSpec.configure do |c|
+   c.mock_with :rspec
+   c.include FakeWebGenerator
+ end
+
+ WebCrawler.configure do
+   config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+ end
+
data/spec/web_crawler/batch_request_spec.rb
@@ -0,0 +1,45 @@
+ require "spec_helper"
+
+ FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+ FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+ FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+ describe WebCrawler::BatchRequest do
+
+   let(:urls) { ['example.com', 'example.com/1', 'example.com/2'] }
+   let(:http_response) { Net::HTTPResponse.new('', '', '') }
+   let(:responses) { urls.map { |url| WebCrawler::Response.new(url, http_response) } }
+
+   def response(url)
+     WebCrawler::Response.new(url, http_response)
+   end
+
+   def request(url)
+     WebCrawler::Request.new(url).stub(:process).and_return(response(url))
+   end
+
+   subject { described_class.new(urls) }
+
+   it "should initialize batch of requests for given urls" do
+     subject.requests.should be_a Array
+     subject.requests.should have(3).members
+     subject.requests.all? { |r| r.is_a? WebCrawler::Request }.should be_true
+   end
+
+   it "should process requests" do
+     subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
+     subject.process.should be_a Array
+     subject.process.first.should be_a WebCrawler::Response
+   end
+
+   it "should accept :parser option with parser class or object" do
+     class ::TestParser
+       def parse(resp)
+         resp.to_s + ' parsed'
+       end
+     end
+     described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
+                                                                          "Example body1 parsed",
+                                                                          "Example body for url http://example.com/2 parsed"]
+   end
+ end
data/spec/web_crawler/cached_request_spec.rb
@@ -0,0 +1,31 @@
+ require "spec_helper"
+
+ FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+ FakeWeb.register_uri(:get, "http://example.com/2", :body => "Example body2")
+ FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+ FakeWeb.allow_net_connect = false
+
+ describe 'Cached requests' do
+
+   let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }
+
+   it 'should not send requests to the web if cache exists' do
+     FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
+     first_response = FakeWeb.response_for :get, "http://example.com/1"
+
+     FakeWeb.should_receive(:response_for).with(:get, "http://example.com/1").and_return { first_response }
+
+     lambda {
+       WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
+     }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+
+     FakeWeb.should_not_receive(:response_for)
+
+     WebCrawler::config.cache_adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/1"), first_response))
+
+     cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
+     WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+   end
+
+ end
data/spec/web_crawler/factory_url_spec.rb
@@ -0,0 +1,34 @@
+ require "spec_helper"
+
+ describe WebCrawler::FactoryUrl do
+
+   it "should generate urls with block" do
+     first_param = [1,2,3]
+     second_param = 10...15
+
+     factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
+       random = rand(3000)
+       "www.example.com/%s/%s.html?rid=#{random}" % args
+     end
+     urls = factory.factory
+
+     urls.should be_a Array
+     factory.params.size.should == 15
+     urls.should have(factory.params.size).urls
+     urls.first.should =~ /www\.example\.com\/1\/10\.html/
+   end
+
+   it "should generate urls with pattern" do
+     first_param = [1,2,3]
+     second_param = 10...15
+
+     factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
+     urls = factory.factory
+
+     urls.should be_a Array
+     factory.params.size.should == 15
+     urls.should have(factory.params.size).urls
+     urls.first.should == "www.example.com/1/10.html"
+   end
+
+ end
data/spec/web_crawler/follow_spec.rb
@@ -0,0 +1,32 @@
+ require "spec_helper"
+
+
+ describe WebCrawler::Follower do
+
+   it "should collect all uniques urls from responses" do
+     responses = WebCrawler::BatchRequest.new(urls_board_path).process
+     urls = WebCrawler::Follower.new(responses).collect
+
+     urls.first.should have(9).urls
+     urls.first.should == known_urls
+   end
+
+   it "should collect all the unique url with same host like in responses" do
+     responses = WebCrawler::BatchRequest.new(urls_board_path).process
+     urls = WebCrawler::Follower.new(responses, same_host: true).collect
+
+     urls.first.should have(6).urls
+     urls.first.should == known_urls.reject { |u| u =~ /otherhost/ }
+   end
+
+   it "should process requests for following urls" do
+     responses = WebCrawler::BatchRequest.new(urls_board_path).process
+     follower = WebCrawler::Follower.new responses
+     responses += follower.process
+
+     responses.should have(10).responses
+     responses.first.should be_a WebCrawler::Response
+     responses.first.url.to_s.should == urls_board_path
+     responses.last.url.to_s.should == known_urls.last
+   end
+ end
data/spec/web_crawler/request_spec.rb
@@ -0,0 +1,29 @@
+ require "spec_helper"
+
+ describe WebCrawler::Request do
+
+   let(:success_url) { 'example.com/success' }
+   let(:failure_url) { 'example.com/failure' }
+
+   before(:each) do
+     @body = "Example body"
+     FakeWeb.register_uri(:get, "http://example.com/success", :body => @body, :status => ["200", "OK"])
+     FakeWeb.register_uri(:get, "http://example.com/failure", :body => @body, :status => ["503", "Internal error"])
+   end
+
+   subject { WebCrawler::Request.new(success_url) }
+
+   it "should fetch the url" do
+     subject.process.should be_a WebCrawler::Response
+     subject.process.body.should be @body
+   end
+
+   it "should be success" do
+     subject.process.should be_success
+   end
+
+   it "should be failure" do
+     WebCrawler::Request.new(failure_url).process.should be_failure
+   end
+
+ end
data/spec/web_crawler/response_spec.rb
@@ -0,0 +1,27 @@
+ require "spec_helper"
+
+ FakeWeb.register_uri(:get, "http://example.com/", :body => "Example body")
+
+
+ describe WebCrawler::Response do
+
+   let(:url) { 'example.com' }
+   subject { WebCrawler::Request.new(url).process }
+
+   it "should initialize with url and response" do
+     described_class.new url, Net::HTTPResponse.new('', '', '')
+   end
+
+   it "should respond to HTTPResponse methods" do
+     [:body, :http_version, :code, :message, :msg, :code_type].each do |meth|
+       subject.should respond_to meth
+     end
+   end
+
+   it "#to_s should be String and equal to #body and not equal to #inspect" do
+     subject.to_s.should be_a String
+     subject.to_s.should be subject.body
+     subject.to_s.should_not be subject.inspect
+   end
+
+ end
data/spec/web_crawler/url_parser_spec.rb
@@ -0,0 +1,41 @@
+ require "spec_helper"
+
+ describe WebCrawler::Parsers::Url do
+
+   let(:host) { 'example.com' }
+   let(:http_host) { 'http://example.com' }
+   let(:https_host) { 'https://example.com/' }
+   let(:current_page) { '/news/1000.html' }
+
+   it "should add scheme to url" do
+     described_class.new(host).host.to_s.should == 'http://example.com'
+     described_class.new(host, secure: true).host.to_s.should == 'https://example.com'
+   end
+
+   it "should parse scheme from url and set @scheme" do
+     described_class.new(https_host).scheme.should == 'https'
+     described_class.new(host, secure: true).scheme.should == 'https'
+     described_class.new(http_host).scheme.should == 'http'
+     described_class.new(host).scheme.should == 'http'
+   end
+
+   it "should return nil if host not equal to initial host" do
+     described_class.new(host, same_host: true).normalize('example.ru/news?sid=1').should be_nil
+     described_class.new(host, same_host: true).normalize('http://example.ru/news?sid=1').should be_nil
+     described_class.new(host, same_host: true).normalize('https://example.ru/news?sid=1').should be_nil
+   end
+
+   it "should join request_uri to initial host" do
+     described_class.new(https_host).normalize('/news').should == 'https://example.com/news'
+     described_class.new(https_host).normalize('/news?sid=1').should == 'https://example.com/news?sid=1'
+     described_class.new(https_host).normalize('/news?sid=1#anchor').should == 'https://example.com/news?sid=1#anchor'
+   end
+
+   it "should join query string to initial current page" do
+     described_class.new(host, url: current_page).normalize('?sid=1').should == 'http://example.com/news/1000.html?sid=1'
+   end
+
+   it "should join fragment string to initial current page" do
+     described_class.new(host, url: current_page).normalize('#anchor').should == 'http://example.com/news/1000.html#anchor'
+   end
+ end
data/spec/web_crawler/view_spec.rb
@@ -0,0 +1,95 @@
+ require "spec_helper"
+
+
+ describe WebCrawler::View::Csv do
+
+   let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n" }] }
+
+   it "should render input array to csv string" do
+     described_class.new(input).render.should == "1,2,3\nstring,\"other string\n\"\n"
+   end
+
+   it "should render input hash to csv string" do
+     described_class.new(input_hash).render.should == "title,url,author\n1,2,3\nstring,\"other string\n\"\n"
+   end
+
+   it "should render input array to csv string with options" do
+     described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+     described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+   end
+
+ end
+
+ describe WebCrawler::View::Json do
+
+   let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+   it "should render input array to json string" do
+     described_class.new(input, headers: [:title, :url, :author]).render.should == '{"responses":[[1,2,"3"],["string","other string\n"]]}'
+   end
+
+   it "should render input hash to json string" do
+     json = described_class.new(input_hash).render
+     json.should == "{\"responses\":[{\"title\":1,\"url\":2,\"author\":3},{\"title\":\"string\",\"url\":\"other string\\n\",\"author\":null}]}"
+     hash = JSON.parse(json).symbolize_keys
+     hash[:responses].each(&:symbolize_keys!)
+     hash.should == { responses: input_hash }
+   end
+ end
+
+
+ describe WebCrawler::View::Xml do
+
+   let(:input) { [[1, 2, "3"], ["string", "other string\n"]] }
+   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
+
+   it "should render input array to xml string" do
+     xml = "<responses>" <<
+       "<response><title>1</title><url>2</url><author>3</author></response>" <<
+       "<response><title>string</title><url>other string\n</url><author></author></response>" <<
+       "</responses>"
+     described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+   end
+
+   it "should render input array to pretty xml string" do
+     xml = "<responses>\n" <<
+       "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+       "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+       "</responses>"
+     described_class.new(input, headers: [:title, :url, :author], pretty: true).render.should == xml
+   end
+
+   it "should render input array without :headers to xml string" do
+     xml = "<responses>\n" <<
+       "<response><field_1>1</field_1><field_2>2</field_2><field_3>3</field_3></response>\n" <<
+       "<response><field_1>string</field_1><field_2>other string\n</field_2><field_3></field_3></response>\n" <<
+       "</responses>"
+     described_class.new(input, pretty: true).render.should == xml
+   end
+
+   it "should render input hash to xml string" do
+     xml = "<responses>\n" <<
+       "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
+       "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
+       "</responses>"
+     described_class.new(input_hash, pretty: true).render.should == xml
+   end
+ end
+
+ describe WebCrawler::View do
+
+   it "should factory a view from view type" do
+     WebCrawler::View.factory('json', [1, 2, 3]).should be_a WebCrawler::View::Json
+     WebCrawler::View.factory('xml', [1, 2, 3]).should be_a WebCrawler::View::Xml
+     WebCrawler::View.factory('table', [1, 2, 3]).should be_a WebCrawler::View::Table
+   end
+
+   it "should draw view to custom output" do
+     output = ""
+     io = StringIO.new(output)
+     WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
+     output.should == "{\"responses\":[[1,2,3]]}\n"
+   end
+ end
data/web_crawler.gemspec
@@ -0,0 +1,30 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "web_crawler/version"
+
+ Gem::Specification.new do |s|
+   s.name        = "web_crawler"
+   s.version     = WebCrawler::VERSION::STRING
+   s.platform    = Gem::Platform::RUBY
+   s.authors     = ["Anton Sozontov"]
+   s.email       = ["a.sozontov@gmail.com"]
+   s.homepage    = ""
+   s.summary     = %q{Web crawler help you with parse and collect data from the web}
+   s.description = %q{Web crawler help you with parse and collect data from the web}
+
+   s.rubyforge_project = "web_crawler"
+
+   s.files         = `git ls-files`.split("\n")
+   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   s.has_rdoc = false
+
+   s.bindir = "bin"
+
+   s.add_dependency 'thor'
+
+   s.add_development_dependency(%q<rspec>, [">=2.6"])
+   s.add_development_dependency(%q<fakeweb>)
+ end