RubyGems - webinspector - Versions diffs - 0.3.1 → 0.4.0 - Mend

webinspector 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/README.md +4 -0
data/lib/web_inspector/inspector.rb +86 -9
data/lib/web_inspector/page.rb +14 -1
data/lib/web_inspector/request.rb +15 -1
data/lib/web_inspector/version.rb +1 -1
data/webinspector.gemspec +1 -0
metadata +16 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d9b9b2aa7ef567e5a1c663ddfd817b254df35089
-  data.tar.gz: 1afd5d7b87361ac1dad0b5e50f17237db5ca0471
+  metadata.gz: c9168c8258b2cc38cad1e30e12d1f42c07d2e0ce
+  data.tar.gz: 302adea791b1d4a4afd03a3fa36e5220244a9896
 SHA512:
-  metadata.gz: 32d88f52f97682b37a3024c444b6eb31fe8c7178483bfa0c1ea51070133c16e357b3440cb223623b28159cac8d73f2f93a0b4d12b62d5ac5e116676db9cade91
-  data.tar.gz: 1f543d568098f33c3d27b186b13f78bea1d40e4fd1b9cce939186d52d502edc4b7b6d23123a512cc276e226ac35fb3efddd743e57679284a1505a7a5945d3bab
+  metadata.gz: d43248b9c86fb8da996fa874a8ae3202ce9b033cc0d38c015ece9d55b65576a5e55a778929a8afa950feb3db4a51d2637dbbbf2ca6a1de5c460f433ccb1356a5
+  data.tar.gz: c5364539ff2f5701f01feff1d931bef49e27abeeeba5cbfc959c1d932a91e3e8276482cd7b7ed7b64d2f3c8ad83d2f174d91c830e462d840657ea670e490f89a

data/README.md CHANGED

@@ -62,6 +62,10 @@ page.meta['description']  # meta description
 page.meta['keywords']      # meta keywords
 ```
+## Contributors
+  * Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
+	* Sam Nissen ([@samnissen](https://github.com/samnissen))
 ## License
 The webinspector GEM is released under the MIT License.

data/lib/web_inspector/inspector.rb CHANGED

@@ -25,22 +25,99 @@ module WebInspector
     end
     def links
-      links = []
-      @page.css("a").each do |a|
-        links.push((a[:href].to_s.start_with? @url.to_s) ? a[:href] : URI.join(@url, a[:href]).to_s) if (a and a[:href])
+      get_new_links unless @links
+      return @links
+    end
+    def domain_links(user_domain, host)
+      @host ||= host
+      validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
+      raise "Invalid domain provided" unless validated_domain_uri
+      domain = validated_domain_uri.domain
+      domain_links = []
+      links.each do |l|
+        u = validate_url_domain(l)
+        next unless u && u.domain
+        domain_links.push(l) if domain == u.domain.downcase
+      end
+      return domain_links.compact
+    end
+    def domain_images(user_domain, host)
+      @host ||= host
+      validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
+      raise "Invalid domain provided" unless validated_domain_uri
+      domain = validated_domain_uri.domain
+      domain_images = []
+      images.each do |img|
+        u = validate_url_domain(img)
+        next unless u && u.domain
+        domain_images.push(img) if u.domain.downcase.end_with?(domain)
+      end
+      return domain_images.compact
+    end
+    # Normalize and validate the URLs on the page for comparison
+    def validate_url_domain(u)
+      # Enforce a few bare standards before proceeding
+      u = "#{u}"
+      u = "/" if u.empty?
+      begin
+        # Look for evidence of a host. If this is a relative link
+        # like '/contact', add the page host.
+        domained_url   = @host + u unless (u.split("/").first || "").match(/(\:|\.)/)
+        domained_url ||= u
+        # http the URL if it is missing
+        httpped_url   = "http://" + domained_url unless domained_url[0..3] == 'http'
+        httpped_url ||= domained_url
+        # Make sure the URL parses
+        uri     = URI.parse(httpped_url)
+        # Make sure the URL passes ICANN rules.
+        # The PublicSuffix object splits the domain and subdomain
+        # (unlike URI), which allows more liberal URL matching.
+        return PublicSuffix.parse(uri.host)
+      rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
+        return false
       end
-      return links
     end
     def images
-      images = []
-      @page.css("img").each do |img|
-        images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
-      end
-      return images
+      get_new_images unless @images
+      return @images
     end
     private
+    def get_new_images
+      @images = []
+      @page.css("img").each do |img|
+        @images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
+      end
+    end
+    def get_new_links
+      @links = []
+      @page.css("a").each do |a|
+        @links.push((a[:href].to_s.start_with? @url.to_s) ? a[:href] : URI.join(@url, a[:href]).to_s) if (a and a[:href])
+      end
+    end
     def snippet
       first_long_paragraph = @page.search('//p[string-length() >= 120]').first

data/lib/web_inspector/page.rb CHANGED

@@ -3,13 +3,14 @@ require 'uri'
 require 'open-uri'
 require 'open_uri_redirections'
 require 'faraday'
+require 'public_suffix'
 require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
 require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
 module WebInspector
   class Page
-    attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :images, :response
+    attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :domain_links, :domain_images, :images, :response
     def initialize(url, options = {})
       @url = url
@@ -50,6 +51,10 @@ module WebInspector
       @request.host
     end
+    def domain
+      @request.domain
+    end
     def scheme
       @request.scheme
     end
@@ -58,6 +63,14 @@ module WebInspector
       @request.port
     end
+    def domain_links(u = domain)
+      @inspector.domain_links(u, host)
+    end
+    def domain_images(u = domain)
+      @inspector.domain_images(u, host)
+    end
     def to_hash
       {
         'url'           => url,

data/lib/web_inspector/request.rb CHANGED

@@ -13,6 +13,10 @@ module WebInspector
     def host
       uri.host
     end
+    def domain
+      suffix_domain
+    end
     def scheme
       uri.scheme
@@ -23,7 +27,17 @@ module WebInspector
     end
     private
+    def suffix_domain
+      return @domain if @domain
+      begin
+        @domain = PublicSuffix.parse(host).domain
+      rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
+        @domain = ''
+      end
+    end
     def uri
       Addressable::URI.parse(@url)
     rescue Addressable::URI::InvalidURIError => e

data/lib/web_inspector/version.rb CHANGED

@@ -1,3 +1,3 @@
 module WebInspector
-  VERSION = "0.3.1"
+  VERSION = "0.4.0"
 end

data/webinspector.gemspec CHANGED

@@ -34,4 +34,5 @@ Gem::Specification.new do |spec|
   spec.add_dependency "nokogiri"
   spec.add_dependency "open_uri_redirections"
   spec.add_dependency "openurl"
+  spec.add_dependency "public_suffix"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: webinspector
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.4.0
 platform: ruby
 authors:
 - Davide Santangelo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-06-04 00:00:00.000000000 Z
+date: 2015-06-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -164,6 +164,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: public_suffix
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Ruby gem to inspect completely a web page. It scrapes a given URL, and
   returns you its meta, links, images and more.
 email: