wombat 2.1.3 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
 - data/lib/wombat/dsl/metadata.rb +5 -1
 - data/lib/wombat/processing/parser.rb +5 -3
 - data/spec/integration/integration_spec.rb +32 -1
 - data/spec/processing/parser_spec.rb +16 -3
 - data/wombat.gemspec +2 -2
 - metadata +2 -2
 
    
        data/VERSION
    CHANGED
    
    | 
         @@ -1 +1 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            2.1.3
     | 
| 
      
 1 
     | 
    
         
            +
            2.2.0
         
     | 
    
        data/lib/wombat/processing/parser.rb
    CHANGED
    
    
| 
         @@ -34,12 +34,14 @@ module Wombat 
     | 
|
| 
       34 
34 
     | 
    
         
             
                    page = nil
         
     | 
| 
       35 
35 
     | 
    
         
             
                    parser = nil
         
     | 
| 
       36 
36 
     | 
    
         
             
                    begin
         
     | 
| 
      
 37 
     | 
    
         
            +
                      @page = metadata[:page]
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
       37 
39 
     | 
    
         
             
                      if metadata[:document_format] == :html
         
     | 
| 
       38 
     | 
    
         
            -
                        @page = @mechanize.get(url)
         
     | 
| 
      
 40 
     | 
    
         
            +
                        @page = @mechanize.get(url) unless @page
         
     | 
| 
       39 
41 
     | 
    
         
             
                        parser = @page.parser
         
     | 
| 
       40 
42 
     | 
    
         
             
                        parser.headers = @page.header
         
     | 
| 
       41 
43 
     | 
    
         
             
                      else
         
     | 
| 
       42 
     | 
    
         
            -
                        @page = RestClient.get(url)
         
     | 
| 
      
 44 
     | 
    
         
            +
                        @page = RestClient.get(url) unless @page
         
     | 
| 
       43 
45 
     | 
    
         
             
                        parser = Nokogiri::XML @page
         
     | 
| 
       44 
46 
     | 
    
         
             
                        parser.headers = @page.headers
         
     | 
| 
       45 
47 
     | 
    
         
             
                      end
         
     | 
| 
         @@ -56,4 +58,4 @@ module Wombat 
     | 
|
| 
       56 
58 
     | 
    
         
             
                  end
         
     | 
| 
       57 
59 
     | 
    
         
             
                end
         
     | 
| 
       58 
60 
     | 
    
         
             
              end
         
     | 
| 
       59 
     | 
    
         
            -
            end
         
     | 
| 
      
 61 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -32,6 +32,37 @@ describe 'basic crawler setup' do 
     | 
|
| 
       32 
32 
     | 
    
         
             
                end
         
     | 
| 
       33 
33 
     | 
    
         
             
              end
         
     | 
| 
       34 
34 
     | 
    
         | 
| 
      
 35 
     | 
    
         
            +
              it 'should crawl a Mechanize::Page' do
         
     | 
| 
      
 36 
     | 
    
         
            +
                VCR.use_cassette('basic_crawler_page') do
         
     | 
| 
      
 37 
     | 
    
         
            +
                  crawler = Class.new
         
     | 
| 
      
 38 
     | 
    
         
            +
                  crawler.send(:include, Wombat::Crawler)
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
                  m = Mechanize.new
         
     | 
| 
      
 41 
     | 
    
         
            +
                  mp = m.get "http://www.terra.com.br/portal"
         
     | 
| 
      
 42 
     | 
    
         
            +
                  crawler.page mp
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
                  crawler.search "css=.btn-search"
         
     | 
| 
      
 45 
     | 
    
         
            +
                  crawler.social do
         
     | 
| 
      
 46 
     | 
    
         
            +
                    twitter "css=.ctn-bar li.last"
         
     | 
| 
      
 47 
     | 
    
         
            +
                  end
         
     | 
| 
      
 48 
     | 
    
         
            +
                  crawler.links "css=.ctn-links", :iterator do
         
     | 
| 
      
 49 
     | 
    
         
            +
                    menu "css=a"
         
     | 
| 
      
 50 
     | 
    
         
            +
                  end
         
     | 
| 
      
 51 
     | 
    
         
            +
                  crawler.subheader "css=h2.ttl-dynamic" do |h|
         
     | 
| 
      
 52 
     | 
    
         
            +
                    h.gsub("London", "Londres")
         
     | 
| 
      
 53 
     | 
    
         
            +
                  end
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
                  crawler_instance = crawler.new
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
                  results = crawler_instance.crawl
         
     | 
| 
      
 58 
     | 
    
         
            +
             
     | 
| 
      
 59 
     | 
    
         
            +
                  results["search"].should == "Buscar"
         
     | 
| 
      
 60 
     | 
    
         
            +
                  results["links"].should == [{"menu"=>"Agenda"}, {"menu"=>"Brasileiro"}, {"menu"=>"Brasil"}, {"menu"=>"Bolsas"}, {"menu"=>"Cinema"}, {"menu"=>"Galerias de Fotos"}, {"menu"=>"Beleza"}, {"menu"=>"Esportes"}, {"menu"=>"Assine o RSS"}]
         
     | 
| 
      
 61 
     | 
    
         
            +
                  results["subheader"].should == "Londres 2012"
         
     | 
| 
      
 62 
     | 
    
         
            +
                  results["social"]["twitter"].should == "Verão"
         
     | 
| 
      
 63 
     | 
    
         
            +
                end
         
     | 
| 
      
 64 
     | 
    
         
            +
              end
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
       35 
66 
     | 
    
         
             
              it 'should support hash based selectors' do
         
     | 
| 
       36 
67 
     | 
    
         
             
                VCR.use_cassette('basic_crawler_page') do
         
     | 
| 
       37 
68 
     | 
    
         
             
                  crawler = Class.new
         
     | 
| 
         @@ -242,4 +273,4 @@ describe 'basic crawler setup' do 
     | 
|
| 
       242 
273 
     | 
    
         
             
                  }
         
     | 
| 
       243 
274 
     | 
    
         
             
                end
         
     | 
| 
       244 
275 
     | 
    
         
             
              end
         
     | 
| 
       245 
     | 
    
         
            -
            end
         
     | 
| 
      
 276 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -18,7 +18,7 @@ describe Wombat::Processing::Parser do 
     | 
|
| 
       18 
18 
     | 
    
         
             
                fake_document.should_receive(:header).and_return(fake_header)
         
     | 
| 
       19 
19 
     | 
    
         
             
                fake_parser.should_receive(:headers=)
         
     | 
| 
       20 
20 
     | 
    
         
             
                @parser.mechanize.should_receive(:get).with("http://www.google.com/search").and_return fake_document
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
       22 
22 
     | 
    
         
             
                @parser.parse @metadata
         
     | 
| 
       23 
23 
     | 
    
         
             
              end
         
     | 
| 
       24 
24 
     | 
    
         | 
| 
         @@ -32,7 +32,20 @@ describe Wombat::Processing::Parser do 
     | 
|
| 
       32 
32 
     | 
    
         
             
                Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
         
     | 
| 
       33 
33 
     | 
    
         
             
                fake_document.should_receive(:headers).and_return(fake_headers)
         
     | 
| 
       34 
34 
     | 
    
         
             
                fake_parser.should_receive(:headers=)
         
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
       36 
36 
     | 
    
         
             
                @parser.parse @metadata
         
     | 
| 
       37 
37 
     | 
    
         
             
              end
         
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
              it 'should accept a Mechanize::Page' do
         
     | 
| 
      
 40 
     | 
    
         
            +
                VCR.use_cassette('basic_crawler_page') do
         
     | 
| 
      
 41 
     | 
    
         
            +
                  m = Mechanize.new
         
     | 
| 
      
 42 
     | 
    
         
            +
                  page = m.get('http://www.terra.com.br/portal')
         
     | 
| 
      
 43 
     | 
    
         
            +
                  @metadata.page page
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
                  @parser.mechanize.should_not_receive(:get)
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
                  @parser.parse @metadata
         
     | 
| 
      
 48 
     | 
    
         
            +
                end
         
     | 
| 
      
 49 
     | 
    
         
            +
              end
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
            end
         
     | 
    
        data/wombat.gemspec
    CHANGED
    
    | 
         @@ -5,11 +5,11 @@ 
     | 
|
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            Gem::Specification.new do |s|
         
     | 
| 
       7 
7 
     | 
    
         
             
              s.name = "wombat"
         
     | 
| 
       8 
     | 
    
         
            -
              s.version = "2.1.3"
     | 
| 
      
 8 
     | 
    
         
            +
              s.version = "2.2.0"
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
10 
     | 
    
         
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         
     | 
| 
       11 
11 
     | 
    
         
             
              s.authors = ["Felipe Lima"]
         
     | 
| 
       12 
     | 
    
         
            -
              s.date = "2013- 
     | 
| 
      
 12 
     | 
    
         
            +
              s.date = "2013-06-06"
         
     | 
| 
       13 
13 
     | 
    
         
             
              s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
         
     | 
| 
       14 
14 
     | 
    
         
             
              s.email = "felipe.lima@gmail.com"
         
     | 
| 
       15 
15 
     | 
    
         
             
              s.extra_rdoc_files = [
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: wombat
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 2.1.3
     | 
| 
      
 4 
     | 
    
         
            +
              version: 2.2.0
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
7 
     | 
    
         
             
            authors:
         
     | 
| 
         @@ -9,7 +9,7 @@ authors: 
     | 
|
| 
       9 
9 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date: 2013- 
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2013-06-06 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies:
         
     | 
| 
       14 
14 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       15 
15 
     | 
    
         
             
              name: mechanize
         
     |