rawler 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +8 -0
- data/lib/rawler/base.rb +1 -1
- data/lib/rawler/crawler.rb +4 -2
- data/lib/rawler.rb +1 -1
- data/spec/lib/rawler/crawler_spec.rb +25 -7
- data/spec/lib/rawler_spec.rb +14 -11
- data/specs.watchr +0 -1
- metadata +23 -13
    
        data/Gemfile.lock
    CHANGED
    
    | @@ -3,7 +3,12 @@ GEM | |
| 3 3 | 
             
              specs:
         | 
| 4 4 | 
             
                diff-lcs (1.1.2)
         | 
| 5 5 | 
             
                fakeweb (1.3.0)
         | 
| 6 | 
            +
                hoe (2.6.2)
         | 
| 7 | 
            +
                  rake (>= 0.8.7)
         | 
| 8 | 
            +
                  rubyforge (>= 2.0.4)
         | 
| 9 | 
            +
                json_pure (1.5.1)
         | 
| 6 10 | 
             
                nokogiri (1.4.4)
         | 
| 11 | 
            +
                rake (0.8.7)
         | 
| 7 12 | 
             
                rspec (2.4.0)
         | 
| 8 13 | 
             
                  rspec-core (~> 2.4.0)
         | 
| 9 14 | 
             
                  rspec-expectations (~> 2.4.0)
         | 
| @@ -12,11 +17,14 @@ GEM | |
| 12 17 | 
             
                rspec-expectations (2.4.0)
         | 
| 13 18 | 
             
                  diff-lcs (~> 1.1.2)
         | 
| 14 19 | 
             
                rspec-mocks (2.4.0)
         | 
| 20 | 
            +
                rubyforge (2.0.4)
         | 
| 21 | 
            +
                  json_pure (>= 1.1.7)
         | 
| 15 22 |  | 
| 16 23 | 
             
            PLATFORMS
         | 
| 17 24 | 
             
              ruby
         | 
| 18 25 |  | 
| 19 26 | 
             
            DEPENDENCIES
         | 
| 20 27 | 
             
              fakeweb (= 1.3.0)
         | 
| 28 | 
            +
              hoe (= 2.6.2)
         | 
| 21 29 | 
             
              nokogiri (= 1.4.4)
         | 
| 22 30 | 
             
              rspec (= 2.4.0)
         | 
    
        data/lib/rawler/base.rb
    CHANGED
    
    
    
        data/lib/rawler/crawler.rb
    CHANGED
    
    | @@ -16,7 +16,7 @@ module Rawler | |
| 16 16 | 
             
                  response = Rawler::Request.get(url)
         | 
| 17 17 |  | 
| 18 18 | 
             
                  doc = Nokogiri::HTML(response.body)
         | 
| 19 | 
            -
                  doc.css('a').map { |a| a['href'] }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
         | 
| 19 | 
            +
                  doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) }
         | 
| 20 20 | 
             
                rescue Errno::ECONNREFUSED
         | 
| 21 21 | 
             
                  write("Couldn't connect to #{url}")
         | 
| 22 22 | 
             
                  []
         | 
| @@ -28,9 +28,11 @@ module Rawler | |
| 28 28 | 
             
                private
         | 
| 29 29 |  | 
| 30 30 | 
             
                def absolute_url(path)
         | 
| 31 | 
            -
                  path.strip | 
| 31 | 
            +
                  path = URI.encode(path.strip)
         | 
| 32 32 | 
             
                  if path[0].chr == '/'
         | 
| 33 33 | 
             
                    URI.parse(url).merge(path.to_s).to_s
         | 
| 34 | 
            +
                  elsif URI.parse(path).scheme.nil?
         | 
| 35 | 
            +
                    URI.parse(url).merge("/#{path.to_s}").to_s
         | 
| 34 36 | 
             
                  else
         | 
| 35 37 | 
             
                    path
         | 
| 36 38 | 
             
                  end
         | 
    
        data/lib/rawler.rb
    CHANGED
    
    
    
        data/spec/lib/rawler/crawler_spec.rb
    CHANGED
    
| @@ -1,3 +1,5 @@ | |
| 1 | 
            +
            # encoding: UTF-8
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            require File.dirname(__FILE__) + '/../../spec_helper.rb'
         | 
| 2 4 |  | 
| 3 5 | 
             
            describe Rawler::Crawler do
         | 
| @@ -36,14 +38,14 @@ describe Rawler::Crawler do | |
| 36 38 |  | 
| 37 39 | 
             
                let(:url)     { 'http://example.com/path' }
         | 
| 38 40 | 
             
                let(:crawler) { Rawler::Crawler.new(url) }
         | 
| 39 | 
            -
                let(:content) { '<a href="/foo">foo</a>' }
         | 
| 41 | 
            +
                let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a>' }
         | 
| 40 42 |  | 
| 41 43 | 
             
                before(:each) do
         | 
| 42 44 | 
             
                  register(url, content)
         | 
| 43 45 | 
             
                end
         | 
| 44 46 |  | 
| 45 47 | 
             
                it "should parse relative links" do
         | 
| 46 | 
            -
                  crawler.links.should == ['http://example.com/foo']
         | 
| 48 | 
            +
                  crawler.links.should == ['http://example.com/foo', 'http://example.com/bar']
         | 
| 47 49 | 
             
                end
         | 
| 48 50 |  | 
| 49 51 | 
             
              end
         | 
| @@ -75,8 +77,24 @@ describe Rawler::Crawler do | |
| 75 77 | 
             
                  register(url, content)
         | 
| 76 78 | 
             
                end
         | 
| 77 79 |  | 
| 78 | 
            -
                it "should parse  | 
| 79 | 
            -
                  crawler.links.should == ['http://example.com/foo | 
| 80 | 
            +
                it "should parse urls with hashtags" do
         | 
| 81 | 
            +
                  crawler.links.should == ['http://example.com/foo%23bar']
         | 
| 82 | 
            +
                end
         | 
| 83 | 
            +
                
         | 
| 84 | 
            +
              end
         | 
| 85 | 
            +
              
         | 
| 86 | 
            +
              context "urls with unicode characters" do
         | 
| 87 | 
            +
                
         | 
| 88 | 
            +
                let(:url)     { 'http://example.com' }
         | 
| 89 | 
            +
                let(:crawler) { Rawler::Crawler.new(url) }
         | 
| 90 | 
            +
                let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }
         | 
| 91 | 
            +
                
         | 
| 92 | 
            +
                before(:each) do
         | 
| 93 | 
            +
                  register(url, content)
         | 
| 94 | 
            +
                end
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                it "should parse unicode links" do
         | 
| 97 | 
            +
                  crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
         | 
| 80 98 | 
             
                end
         | 
| 81 99 |  | 
| 82 100 | 
             
              end
         | 
| @@ -85,7 +103,7 @@ describe Rawler::Crawler do | |
| 85 103 | 
             
                let(:url)     { 'http://example.com/path' }
         | 
| 86 104 | 
             
                let(:crawler) { Rawler::Crawler.new(url) }
         | 
| 87 105 | 
             
                let(:js_url)  { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
         | 
| 88 | 
            -
                let(:content) { "<a href=\"#{js_url}\">foo</a>" }
         | 
| 106 | 
            +
                let(:content) { "<a href=\"#{js_url}\">foo</a><a name=\"foo\">" }
         | 
| 89 107 |  | 
| 90 108 | 
             
                before(:each) do
         | 
| 91 109 | 
             
                  register(url, content)
         | 
| @@ -94,9 +112,9 @@ describe Rawler::Crawler do | |
| 94 112 | 
             
                it "should parse relative links" do
         | 
| 95 113 | 
             
                  crawler.links.should == []
         | 
| 96 114 | 
             
                end
         | 
| 97 | 
            -
             | 
| 115 | 
            +
             | 
| 98 116 | 
             
                it "should report the error" do
         | 
| 99 | 
            -
                  crawler.should_receive(:write).with("Invalid url -  | 
| 117 | 
            +
                  crawler.should_receive(:write).with("Invalid url - javascript:fn('nbjmup;jhfs.esf%7Bfio/dpn');")
         | 
| 100 118 | 
             
                  crawler.links
         | 
| 101 119 | 
             
                end
         | 
| 102 120 | 
             
              end
         | 
    
        data/spec/lib/rawler_spec.rb
    CHANGED
    
    | @@ -1,3 +1,5 @@ | |
| 1 | 
            +
            # encoding: UTF-8
         | 
| 2 | 
            +
             | 
| 1 3 | 
             
            require File.dirname(__FILE__) + '/../spec_helper.rb'
         | 
| 2 4 |  | 
| 3 5 | 
             
            describe Rawler::Base do
         | 
| @@ -9,6 +11,16 @@ describe Rawler::Base do | |
| 9 11 | 
             
                Rawler.stub!(:output).and_return(output)
         | 
| 10 12 | 
             
                register('http://example.com', site)
         | 
| 11 13 | 
             
              end
         | 
| 14 | 
            +
             | 
| 15 | 
            +
              describe "url encoding" do
         | 
| 16 | 
            +
                it "should encode url" do
         | 
| 17 | 
            +
                  original = 'http://example.com/写程序容易出现的几个不好的地方'
         | 
| 18 | 
            +
                  expected = 'http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9'
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                  Rawler::Base.new(original, output)
         | 
| 21 | 
            +
                  Rawler.url.should == expected
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
              end
         | 
| 12 24 |  | 
| 13 25 | 
             
              describe "validate_links" do
         | 
| 14 26 |  | 
| @@ -50,16 +62,7 @@ describe Rawler::Base do | |
| 50 62 |  | 
| 51 63 | 
             
                  rawler.validate
         | 
| 52 64 | 
             
                end
         | 
| 53 | 
            -
             | 
| 54 | 
            -
                it "should validate links with #hashtags" do
         | 
| 55 | 
            -
                  register('http://example.com/foo1', '<a href="http://example.com/page-with#hashtag">x</a>')
         | 
| 56 | 
            -
                  register('http://example.com/page-with', '')
         | 
| 57 | 
            -
                  
         | 
| 58 | 
            -
                  output.should_receive(:info).with('200 - http://example.com/page-with#hashtag')
         | 
| 59 | 
            -
                  
         | 
| 60 | 
            -
                  rawler.validate
         | 
| 61 | 
            -
                end
         | 
| 62 | 
            -
                        
         | 
| 65 | 
            +
                            
         | 
| 63 66 | 
             
              end
         | 
| 64 67 |  | 
| 65 68 | 
             
              describe "get_status_code" do
         | 
| @@ -200,4 +203,4 @@ describe Rawler::Base do | |
| 200 203 | 
             
                site
         | 
| 201 204 | 
             
              end
         | 
| 202 205 |  | 
| 203 | 
            -
            end
         | 
| 206 | 
            +
            end
         | 
    
        data/specs.watchr
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,12 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: rawler
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
               | 
| 5 | 
            -
              prerelease: 
         | 
| 4 | 
            +
              prerelease: false
         | 
| 6 5 | 
             
              segments: 
         | 
| 7 6 | 
             
              - 0
         | 
| 8 7 | 
             
              - 0
         | 
| 9 | 
            -
              - 6
         | 
| 10 | 
            -
              version: 0.0.6
         | 
| 8 | 
            +
              - 7
         | 
| 9 | 
            +
              version: 0.0.7
         | 
| 11 10 | 
             
            platform: ruby
         | 
| 12 11 | 
             
            authors: 
         | 
| 13 12 | 
             
            - Oscar Del Ben
         | 
| @@ -15,7 +14,7 @@ autorequire: | |
| 15 14 | 
             
            bindir: bin
         | 
| 16 15 | 
             
            cert_chain: []
         | 
| 17 16 |  | 
| 18 | 
            -
            date: 2011- | 
| 17 | 
            +
            date: 2011-03-07 00:00:00 +01:00
         | 
| 19 18 | 
             
            default_executable: 
         | 
| 20 19 | 
             
            dependencies: 
         | 
| 21 20 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -26,28 +25,41 @@ dependencies: | |
| 26 25 | 
             
                requirements: 
         | 
| 27 26 | 
             
                - - ">="
         | 
| 28 27 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 29 | 
            -
                    hash: 3
         | 
| 30 28 | 
             
                    segments: 
         | 
| 31 29 | 
             
                    - 0
         | 
| 32 30 | 
             
                    version: "0"
         | 
| 33 31 | 
             
              type: :runtime
         | 
| 34 32 | 
             
              version_requirements: *id001
         | 
| 35 33 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 36 | 
            -
              name:  | 
| 34 | 
            +
              name: rubyforge
         | 
| 37 35 | 
             
              prerelease: false
         | 
| 38 36 | 
             
              requirement: &id002 !ruby/object:Gem::Requirement 
         | 
| 39 37 | 
             
                none: false
         | 
| 40 38 | 
             
                requirements: 
         | 
| 41 39 | 
             
                - - ">="
         | 
| 42 40 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 43 | 
            -
                    hash: 47
         | 
| 44 41 | 
             
                    segments: 
         | 
| 45 42 | 
             
                    - 2
         | 
| 46 | 
            -
                    - 8
         | 
| 47 43 | 
             
                    - 0
         | 
| 48 | 
            -
                     | 
| 44 | 
            +
                    - 4
         | 
| 45 | 
            +
                    version: 2.0.4
         | 
| 49 46 | 
             
              type: :development
         | 
| 50 47 | 
             
              version_requirements: *id002
         | 
| 48 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 49 | 
            +
              name: hoe
         | 
| 50 | 
            +
              prerelease: false
         | 
| 51 | 
            +
              requirement: &id003 !ruby/object:Gem::Requirement 
         | 
| 52 | 
            +
                none: false
         | 
| 53 | 
            +
                requirements: 
         | 
| 54 | 
            +
                - - ">="
         | 
| 55 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 56 | 
            +
                    segments: 
         | 
| 57 | 
            +
                    - 2
         | 
| 58 | 
            +
                    - 6
         | 
| 59 | 
            +
                    - 2
         | 
| 60 | 
            +
                    version: 2.6.2
         | 
| 61 | 
            +
              type: :development
         | 
| 62 | 
            +
              version_requirements: *id003
         | 
| 51 63 | 
             
            description: |-
         | 
| 52 64 | 
             
              Rawler is a Ruby library that crawls your website and checks the status code for each of your links. Useful for finding dead links.
         | 
| 53 65 |  | 
| @@ -100,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement | |
| 100 112 | 
             
              requirements: 
         | 
| 101 113 | 
             
              - - ">="
         | 
| 102 114 | 
             
                - !ruby/object:Gem::Version 
         | 
| 103 | 
            -
                  hash: 3
         | 
| 104 115 | 
             
                  segments: 
         | 
| 105 116 | 
             
                  - 0
         | 
| 106 117 | 
             
                  version: "0"
         | 
| @@ -109,14 +120,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 109 120 | 
             
              requirements: 
         | 
| 110 121 | 
             
              - - ">="
         | 
| 111 122 | 
             
                - !ruby/object:Gem::Version 
         | 
| 112 | 
            -
                  hash: 3
         | 
| 113 123 | 
             
                  segments: 
         | 
| 114 124 | 
             
                  - 0
         | 
| 115 125 | 
             
                  version: "0"
         | 
| 116 126 | 
             
            requirements: []
         | 
| 117 127 |  | 
| 118 128 | 
             
            rubyforge_project: oscardelben
         | 
| 119 | 
            -
            rubygems_version: 1. | 
| 129 | 
            +
            rubygems_version: 1.3.7
         | 
| 120 130 | 
             
            signing_key: 
         | 
| 121 131 | 
             
            specification_version: 3
         | 
| 122 132 | 
             
            summary: Rawler is a Ruby library that crawls your website and checks the status code for each of your links
         |