cobweb 0.0.25 → 0.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb.rb +3 -7
- data/lib/cobweb_crawler.rb +0 -2
- data/lib/content_link_parser.rb +2 -4
- data/lib/crawl_job.rb +0 -1
- metadata +19 -30
    
        data/README.textile
    CHANGED
    
    
    
        data/lib/cobweb.rb
    CHANGED
    
    | @@ -19,7 +19,7 @@ class Cobweb | |
| 19 19 | 
             
              # investigate using event machine for single threaded crawling
         | 
| 20 20 |  | 
| 21 21 | 
             
              def self.version
         | 
| 22 | 
            -
                "0.0.25"
         | 
| 22 | 
            +
                "0.0.26"
         | 
| 23 23 | 
             
              end
         | 
| 24 24 |  | 
| 25 25 | 
             
              def method_missing(method_sym, *arguments, &block)
         | 
| @@ -77,8 +77,6 @@ class Cobweb | |
| 77 77 |  | 
| 78 78 | 
             
                raise "url cannot be nil" if url.nil?
         | 
| 79 79 |  | 
| 80 | 
            -
                absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
         | 
| 81 | 
            -
                    
         | 
| 82 80 | 
             
                # get the unique id for this request
         | 
| 83 81 | 
             
                unique_id = Digest::SHA1.hexdigest(url.to_s)
         | 
| 84 82 | 
             
                if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
         | 
| @@ -127,7 +125,7 @@ class Cobweb | |
| 127 125 | 
             
                      puts "redirected... " unless @options[:quiet]
         | 
| 128 126 |  | 
| 129 127 | 
             
                      # get location to redirect to
         | 
| 130 | 
            -
                      url =  | 
| 128 | 
            +
                      url = Addressable::URI.parse(response['location']).to_s
         | 
| 131 129 |  | 
| 132 130 | 
             
                      # decrement redirect limit
         | 
| 133 131 | 
             
                      redirect_limit = redirect_limit - 1
         | 
| @@ -234,8 +232,6 @@ class Cobweb | |
| 234 232 | 
             
              def head(url, options = @options)
         | 
| 235 233 | 
             
                raise "url cannot be nil" if url.nil?    
         | 
| 236 234 |  | 
| 237 | 
            -
                absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
         | 
| 238 | 
            -
             | 
| 239 235 | 
             
                # get the unique id for this request
         | 
| 240 236 | 
             
                unique_id = Digest::SHA1.hexdigest(url)
         | 
| 241 237 | 
             
                if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
         | 
| @@ -277,7 +273,7 @@ class Cobweb | |
| 277 273 |  | 
| 278 274 | 
             
                    if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
         | 
| 279 275 | 
             
                      puts "redirected... " unless @options[:quiet]
         | 
| 280 | 
            -
                      url =  | 
| 276 | 
            +
                      url = Addressable::URI.parse(response['location']).to_s
         | 
| 281 277 | 
             
                      redirect_limit = redirect_limit - 1
         | 
| 282 278 | 
             
                      options = options.clone
         | 
| 283 279 | 
             
                      options[:redirect_limit]=redirect_limit
         | 
    
        data/lib/cobweb_crawler.rb
    CHANGED
    
    | @@ -22,8 +22,6 @@ class CobwebCrawler | |
| 22 22 |  | 
| 23 23 | 
             
                @crawl_options = crawl_options
         | 
| 24 24 |  | 
| 25 | 
            -
                @absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
         | 
| 26 | 
            -
                
         | 
| 27 25 | 
             
                @redis.sadd "queued", base_url
         | 
| 28 26 | 
             
                crawl_counter = @redis.scard("crawled").to_i
         | 
| 29 27 | 
             
                queue_counter = @redis.scard("queued").to_i
         | 
    
        data/lib/content_link_parser.rb
    CHANGED
    
    | @@ -12,7 +12,6 @@ class ContentLinkParser | |
| 12 12 | 
             
                if @doc.at("base[href]")
         | 
| 13 13 | 
             
                  base_url = @doc.at("base[href]").attr("href").to_s
         | 
| 14 14 | 
             
                end
         | 
| 15 | 
            -
                @absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
         | 
| 16 15 |  | 
| 17 16 | 
             
                @options[:tags] = {}
         | 
| 18 17 | 
             
                @options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
         | 
| @@ -57,15 +56,14 @@ class ContentLinkParser | |
| 57 56 | 
             
                if attribute.kind_of? String or attribute.kind_of? Symbol
         | 
| 58 57 | 
             
                  @doc.css(selector).each do |tag|
         | 
| 59 58 | 
             
                    begin
         | 
| 60 | 
            -
                       | 
| 61 | 
            -
                      array << uri.to_s
         | 
| 59 | 
            +
                      array << Addressable::URI.parse(tag[attribute]).to_s
         | 
| 62 60 | 
             
                    rescue
         | 
| 63 61 | 
             
                    end
         | 
| 64 62 | 
             
                  end
         | 
| 65 63 | 
             
                elsif attribute.instance_of? Regexp
         | 
| 66 64 | 
             
                  @doc.css(selector).each do |tag|
         | 
| 67 65 | 
             
                    begin
         | 
| 68 | 
            -
                      tag.content.scan(attribute) {|match| array <<  | 
| 66 | 
            +
                      tag.content.scan(attribute) {|match| array << Addressable::URI.parse(match[0]).to_s}
         | 
| 69 67 | 
             
                    rescue
         | 
| 70 68 | 
             
                    end
         | 
| 71 69 | 
             
                  end
         | 
    
        data/lib/crawl_job.rb
    CHANGED
    
    | @@ -13,7 +13,6 @@ class CrawlJob | |
| 13 13 |  | 
| 14 14 | 
             
                @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
         | 
| 15 15 |  | 
| 16 | 
            -
                @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
         | 
| 17 16 | 
             
                @debug = content_request[:debug]
         | 
| 18 17 |  | 
| 19 18 | 
             
                refresh_counters
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: cobweb
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.0.25
         | 
| 4 | 
            +
              version: 0.0.26
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -13,7 +13,7 @@ date: 2012-03-13 00:00:00.000000000 Z | |
| 13 13 | 
             
            dependencies:
         | 
| 14 14 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 15 | 
             
              name: resque
         | 
| 16 | 
            -
              requirement: & | 
| 16 | 
            +
              requirement: &70294818287920 !ruby/object:Gem::Requirement
         | 
| 17 17 | 
             
                none: false
         | 
| 18 18 | 
             
                requirements:
         | 
| 19 19 | 
             
                - - ! '>='
         | 
| @@ -21,10 +21,10 @@ dependencies: | |
| 21 21 | 
             
                    version: '0'
         | 
| 22 22 | 
             
              type: :runtime
         | 
| 23 23 | 
             
              prerelease: false
         | 
| 24 | 
            -
              version_requirements: * | 
| 24 | 
            +
              version_requirements: *70294818287920
         | 
| 25 25 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 26 26 | 
             
              name: redis
         | 
| 27 | 
            -
              requirement: & | 
| 27 | 
            +
              requirement: &70294818287500 !ruby/object:Gem::Requirement
         | 
| 28 28 | 
             
                none: false
         | 
| 29 29 | 
             
                requirements:
         | 
| 30 30 | 
             
                - - ! '>='
         | 
| @@ -32,10 +32,10 @@ dependencies: | |
| 32 32 | 
             
                    version: '0'
         | 
| 33 33 | 
             
              type: :runtime
         | 
| 34 34 | 
             
              prerelease: false
         | 
| 35 | 
            -
              version_requirements: * | 
| 35 | 
            +
              version_requirements: *70294818287500
         | 
| 36 36 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 37 37 | 
             
              name: nokogiri
         | 
| 38 | 
            -
              requirement: & | 
| 38 | 
            +
              requirement: &70294818287080 !ruby/object:Gem::Requirement
         | 
| 39 39 | 
             
                none: false
         | 
| 40 40 | 
             
                requirements:
         | 
| 41 41 | 
             
                - - ! '>='
         | 
| @@ -43,21 +43,10 @@ dependencies: | |
| 43 43 | 
             
                    version: '0'
         | 
| 44 44 | 
             
              type: :runtime
         | 
| 45 45 | 
             
              prerelease: false
         | 
| 46 | 
            -
              version_requirements: * | 
| 47 | 
            -
            - !ruby/object:Gem::Dependency
         | 
| 48 | 
            -
              name: absolutize
         | 
| 49 | 
            -
              requirement: &70349719635660 !ruby/object:Gem::Requirement
         | 
| 50 | 
            -
                none: false
         | 
| 51 | 
            -
                requirements:
         | 
| 52 | 
            -
                - - ! '>='
         | 
| 53 | 
            -
                  - !ruby/object:Gem::Version
         | 
| 54 | 
            -
                    version: '0'
         | 
| 55 | 
            -
              type: :runtime
         | 
| 56 | 
            -
              prerelease: false
         | 
| 57 | 
            -
              version_requirements: *70349719635660
         | 
| 46 | 
            +
              version_requirements: *70294818287080
         | 
| 58 47 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 59 48 | 
             
              name: addressable
         | 
| 60 | 
            -
              requirement: & | 
| 49 | 
            +
              requirement: &70294818286640 !ruby/object:Gem::Requirement
         | 
| 61 50 | 
             
                none: false
         | 
| 62 51 | 
             
                requirements:
         | 
| 63 52 | 
             
                - - ! '>='
         | 
| @@ -65,10 +54,10 @@ dependencies: | |
| 65 54 | 
             
                    version: '0'
         | 
| 66 55 | 
             
              type: :runtime
         | 
| 67 56 | 
             
              prerelease: false
         | 
| 68 | 
            -
              version_requirements: * | 
| 57 | 
            +
              version_requirements: *70294818286640
         | 
| 69 58 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 70 59 | 
             
              name: rspec
         | 
| 71 | 
            -
              requirement: & | 
| 60 | 
            +
              requirement: &70294818286220 !ruby/object:Gem::Requirement
         | 
| 72 61 | 
             
                none: false
         | 
| 73 62 | 
             
                requirements:
         | 
| 74 63 | 
             
                - - ! '>='
         | 
| @@ -76,10 +65,10 @@ dependencies: | |
| 76 65 | 
             
                    version: '0'
         | 
| 77 66 | 
             
              type: :runtime
         | 
| 78 67 | 
             
              prerelease: false
         | 
| 79 | 
            -
              version_requirements: * | 
| 68 | 
            +
              version_requirements: *70294818286220
         | 
| 80 69 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 81 70 | 
             
              name: awesome_print
         | 
| 82 | 
            -
              requirement: & | 
| 71 | 
            +
              requirement: &70294818285800 !ruby/object:Gem::Requirement
         | 
| 83 72 | 
             
                none: false
         | 
| 84 73 | 
             
                requirements:
         | 
| 85 74 | 
             
                - - ! '>='
         | 
| @@ -87,10 +76,10 @@ dependencies: | |
| 87 76 | 
             
                    version: '0'
         | 
| 88 77 | 
             
              type: :runtime
         | 
| 89 78 | 
             
              prerelease: false
         | 
| 90 | 
            -
              version_requirements: * | 
| 79 | 
            +
              version_requirements: *70294818285800
         | 
| 91 80 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 92 81 | 
             
              name: sinatra
         | 
| 93 | 
            -
              requirement: & | 
| 82 | 
            +
              requirement: &70294818285380 !ruby/object:Gem::Requirement
         | 
| 94 83 | 
             
                none: false
         | 
| 95 84 | 
             
                requirements:
         | 
| 96 85 | 
             
                - - ! '>='
         | 
| @@ -98,10 +87,10 @@ dependencies: | |
| 98 87 | 
             
                    version: '0'
         | 
| 99 88 | 
             
              type: :runtime
         | 
| 100 89 | 
             
              prerelease: false
         | 
| 101 | 
            -
              version_requirements: * | 
| 90 | 
            +
              version_requirements: *70294818285380
         | 
| 102 91 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 103 92 | 
             
              name: thin
         | 
| 104 | 
            -
              requirement: & | 
| 93 | 
            +
              requirement: &70294818284960 !ruby/object:Gem::Requirement
         | 
| 105 94 | 
             
                none: false
         | 
| 106 95 | 
             
                requirements:
         | 
| 107 96 | 
             
                - - ! '>='
         | 
| @@ -109,10 +98,10 @@ dependencies: | |
| 109 98 | 
             
                    version: '0'
         | 
| 110 99 | 
             
              type: :runtime
         | 
| 111 100 | 
             
              prerelease: false
         | 
| 112 | 
            -
              version_requirements: * | 
| 101 | 
            +
              version_requirements: *70294818284960
         | 
| 113 102 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 114 103 | 
             
              name: haml
         | 
| 115 | 
            -
              requirement: & | 
| 104 | 
            +
              requirement: &70294818284540 !ruby/object:Gem::Requirement
         | 
| 116 105 | 
             
                none: false
         | 
| 117 106 | 
             
                requirements:
         | 
| 118 107 | 
             
                - - ! '>='
         | 
| @@ -120,7 +109,7 @@ dependencies: | |
| 120 109 | 
             
                    version: '0'
         | 
| 121 110 | 
             
              type: :runtime
         | 
| 122 111 | 
             
              prerelease: false
         | 
| 123 | 
            -
              version_requirements: * | 
| 112 | 
            +
              version_requirements: *70294818284540
         | 
| 124 113 | 
             
            description: Web Crawler that uses resque background job engine to allow you to cluster
         | 
| 125 114 | 
             
              your crawl.
         | 
| 126 115 | 
             
            email: stewart@rockwellcottage.com
         |