arachnid 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/arachnid.rb +13 -4
- metadata +70 -58
    
        data/lib/arachnid.rb
    CHANGED
    
    | @@ -47,7 +47,7 @@ class Arachnid | |
| 47 47 | 
             
            						links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
         | 
| 48 48 |  | 
| 49 49 | 
             
            						links.each do |link|
         | 
| 50 | 
            -
            							if(internal_link?(link) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
         | 
| 50 | 
            +
            							if(internal_link?(link, response.effective_url) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
         | 
| 51 51 |  | 
| 52 52 | 
             
            								sanitized_link = sanitize_link(split_url_at_hash(link))
         | 
| 53 53 | 
             
            								if(sanitized_link)
         | 
| @@ -83,15 +83,23 @@ class Arachnid | |
| 83 83 |  | 
| 84 84 | 
             
            		begin
         | 
| 85 85 | 
             
            			parsed_domain = Domainatrix.parse(url)
         | 
| 86 | 
            -
             | 
| 86 | 
            +
             | 
| 87 | 
            +
            			if(parsed_domain.subdomain != "")
         | 
| 88 | 
            +
            				parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
         | 
| 89 | 
            +
            			else
         | 
| 90 | 
            +
            				parsed_domain.domain + '.' + parsed_domain.public_suffix
         | 
| 91 | 
            +
            			end
         | 
| 87 92 | 
             
            		rescue NoMethodError, Addressable::URI::InvalidURIError => e
         | 
| 88 93 | 
             
            			puts "URL Parsing Exception (#{url}): #{e}" if @debug == true
         | 
| 89 94 | 
             
            			return nil
         | 
| 90 95 | 
             
            		end
         | 
| 91 96 | 
             
            	end
         | 
| 92 97 |  | 
| 93 | 
            -
            	def internal_link?(url)
         | 
| 94 | 
            -
             | 
| 98 | 
            +
            	def internal_link?(url, effective_url)
         | 
| 99 | 
            +
             | 
| 100 | 
            +
            		absolute_url = make_absolute(url, effective_url)
         | 
| 101 | 
            +
             | 
| 102 | 
            +
            		parsed_url = parse_domain(absolute_url)
         | 
| 95 103 | 
             
            		if(@domain == parsed_url)
         | 
| 96 104 | 
             
            			return true
         | 
| 97 105 | 
             
            		else
         | 
| @@ -117,6 +125,7 @@ class Arachnid | |
| 117 125 | 
             
            	end
         | 
| 118 126 |  | 
| 119 127 | 
             
            	def no_image_in_url?(url)
         | 
| 128 | 
            +
            		return true if url.to_s.length == 0
         | 
| 120 129 | 
             
            		return true unless @exclude_urls_with_images
         | 
| 121 130 |  | 
| 122 131 | 
             
            		extensions = ['.jpg', '.gif', '.png', '.jpeg']
         | 
    
        metadata
    CHANGED
    
    | @@ -1,99 +1,111 @@ | |
| 1 | 
            -
            --- !ruby/object:Gem::Specification | 
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: arachnid
         | 
| 3 | 
            -
            version: !ruby/object:Gem::Version | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: 0.3.0
         | 
| 4 5 | 
             
              prerelease: 
         | 
| 5 | 
            -
              version: 0.2.2
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 | 
            -
            authors: | 
| 7 | 
            +
            authors:
         | 
| 8 8 | 
             
            - dchuk
         | 
| 9 9 | 
             
            autorequire: 
         | 
| 10 10 | 
             
            bindir: bin
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
            dependencies: 
         | 
| 16 | 
            -
            - !ruby/object:Gem::Dependency 
         | 
| 12 | 
            +
            date: 2014-01-16 00:00:00.000000000 Z
         | 
| 13 | 
            +
            dependencies:
         | 
| 14 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 17 15 | 
             
              name: typhoeus
         | 
| 18 | 
            -
               | 
| 19 | 
            -
              requirement: &id001 !ruby/object:Gem::Requirement 
         | 
| 16 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 20 17 | 
             
                none: false
         | 
| 21 | 
            -
                requirements: | 
| 22 | 
            -
                - -  | 
| 23 | 
            -
                  - !ruby/object:Gem::Version | 
| 18 | 
            +
                requirements:
         | 
| 19 | 
            +
                - - '='
         | 
| 20 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 24 21 | 
             
                    version: 0.3.2
         | 
| 25 22 | 
             
              type: :runtime
         | 
| 26 | 
            -
              version_requirements: *id001
         | 
| 27 | 
            -
            - !ruby/object:Gem::Dependency 
         | 
| 28 | 
            -
              name: bloomfilter-rb
         | 
| 29 23 | 
             
              prerelease: false
         | 
| 30 | 
            -
               | 
| 24 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 25 | 
            +
                none: false
         | 
| 26 | 
            +
                requirements:
         | 
| 27 | 
            +
                - - '='
         | 
| 28 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 29 | 
            +
                    version: 0.3.2
         | 
| 30 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 31 | 
            +
              name: bloomfilter-rb
         | 
| 32 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 31 33 | 
             
                none: false
         | 
| 32 | 
            -
                requirements: | 
| 33 | 
            -
                - -  | 
| 34 | 
            -
                  - !ruby/object:Gem::Version | 
| 34 | 
            +
                requirements:
         | 
| 35 | 
            +
                - - '='
         | 
| 36 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 35 37 | 
             
                    version: 2.1.1
         | 
| 36 38 | 
             
              type: :runtime
         | 
| 37 | 
            -
              version_requirements: *id002
         | 
| 38 | 
            -
            - !ruby/object:Gem::Dependency 
         | 
| 39 | 
            -
              name: nokogiri
         | 
| 40 39 | 
             
              prerelease: false
         | 
| 41 | 
            -
               | 
| 40 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 41 | 
            +
                none: false
         | 
| 42 | 
            +
                requirements:
         | 
| 43 | 
            +
                - - '='
         | 
| 44 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 45 | 
            +
                    version: 2.1.1
         | 
| 46 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 47 | 
            +
              name: nokogiri
         | 
| 48 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 42 49 | 
             
                none: false
         | 
| 43 | 
            -
                requirements: | 
| 44 | 
            -
                - -  | 
| 45 | 
            -
                  - !ruby/object:Gem::Version | 
| 50 | 
            +
                requirements:
         | 
| 51 | 
            +
                - - '='
         | 
| 52 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 46 53 | 
             
                    version: 1.5.0
         | 
| 47 54 | 
             
              type: :runtime
         | 
| 48 | 
            -
              version_requirements: *id003
         | 
| 49 | 
            -
            - !ruby/object:Gem::Dependency 
         | 
| 50 | 
            -
              name: domainatrix
         | 
| 51 55 | 
             
              prerelease: false
         | 
| 52 | 
            -
               | 
| 56 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 57 | 
            +
                none: false
         | 
| 58 | 
            +
                requirements:
         | 
| 59 | 
            +
                - - '='
         | 
| 60 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 61 | 
            +
                    version: 1.5.0
         | 
| 62 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 63 | 
            +
              name: domainatrix
         | 
| 64 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 53 65 | 
             
                none: false
         | 
| 54 | 
            -
                requirements: | 
| 55 | 
            -
                - -  | 
| 56 | 
            -
                  - !ruby/object:Gem::Version | 
| 66 | 
            +
                requirements:
         | 
| 67 | 
            +
                - - '='
         | 
| 68 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 57 69 | 
             
                    version: 0.0.10
         | 
| 58 70 | 
             
              type: :runtime
         | 
| 59 | 
            -
               | 
| 60 | 
            -
             | 
| 71 | 
            +
              prerelease: false
         | 
| 72 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 73 | 
            +
                none: false
         | 
| 74 | 
            +
                requirements:
         | 
| 75 | 
            +
                - - '='
         | 
| 76 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 77 | 
            +
                    version: 0.0.10
         | 
| 78 | 
            +
            description: Arachnid is a web crawler that relies on Bloom Filters to efficiently
         | 
| 79 | 
            +
              store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
         | 
| 80 | 
            +
              every page on a domain.
         | 
| 61 81 | 
             
            email: me@dchuk.com
         | 
| 62 82 | 
             
            executables: []
         | 
| 63 | 
            -
             | 
| 64 83 | 
             
            extensions: []
         | 
| 65 | 
            -
             | 
| 66 84 | 
             
            extra_rdoc_files: []
         | 
| 67 | 
            -
             | 
| 68 | 
            -
            files: 
         | 
| 85 | 
            +
            files:
         | 
| 69 86 | 
             
            - lib/arachnid.rb
         | 
| 70 | 
            -
            has_rdoc: true
         | 
| 71 87 | 
             
            homepage: https://github.com/dchuk/Arachnid
         | 
| 72 88 | 
             
            licenses: []
         | 
| 73 | 
            -
             | 
| 74 89 | 
             
            post_install_message: 
         | 
| 75 90 | 
             
            rdoc_options: []
         | 
| 76 | 
            -
             | 
| 77 | 
            -
            require_paths: 
         | 
| 91 | 
            +
            require_paths:
         | 
| 78 92 | 
             
            - lib
         | 
| 79 | 
            -
            required_ruby_version: !ruby/object:Gem::Requirement | 
| 93 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 80 94 | 
             
              none: false
         | 
| 81 | 
            -
              requirements: | 
| 82 | 
            -
              - -  | 
| 83 | 
            -
                - !ruby/object:Gem::Version | 
| 84 | 
            -
                  version:  | 
| 85 | 
            -
            required_rubygems_version: !ruby/object:Gem::Requirement | 
| 95 | 
            +
              requirements:
         | 
| 96 | 
            +
              - - ! '>='
         | 
| 97 | 
            +
                - !ruby/object:Gem::Version
         | 
| 98 | 
            +
                  version: '0'
         | 
| 99 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 86 100 | 
             
              none: false
         | 
| 87 | 
            -
              requirements: | 
| 88 | 
            -
              - -  | 
| 89 | 
            -
                - !ruby/object:Gem::Version | 
| 90 | 
            -
                  version:  | 
| 101 | 
            +
              requirements:
         | 
| 102 | 
            +
              - - ! '>='
         | 
| 103 | 
            +
                - !ruby/object:Gem::Version
         | 
| 104 | 
            +
                  version: '0'
         | 
| 91 105 | 
             
            requirements: []
         | 
| 92 | 
            -
             | 
| 93 106 | 
             
            rubyforge_project: 
         | 
| 94 | 
            -
            rubygems_version: 1. | 
| 107 | 
            +
            rubygems_version: 1.8.23
         | 
| 95 108 | 
             
            signing_key: 
         | 
| 96 109 | 
             
            specification_version: 3
         | 
| 97 110 | 
             
            summary: Extremely fast and efficient domain spider
         | 
| 98 111 | 
             
            test_files: []
         | 
| 99 | 
            -
             |