webinspector 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9b9b2aa7ef567e5a1c663ddfd817b254df35089
4
- data.tar.gz: 1afd5d7b87361ac1dad0b5e50f17237db5ca0471
3
+ metadata.gz: c9168c8258b2cc38cad1e30e12d1f42c07d2e0ce
4
+ data.tar.gz: 302adea791b1d4a4afd03a3fa36e5220244a9896
5
5
  SHA512:
6
- metadata.gz: 32d88f52f97682b37a3024c444b6eb31fe8c7178483bfa0c1ea51070133c16e357b3440cb223623b28159cac8d73f2f93a0b4d12b62d5ac5e116676db9cade91
7
- data.tar.gz: 1f543d568098f33c3d27b186b13f78bea1d40e4fd1b9cce939186d52d502edc4b7b6d23123a512cc276e226ac35fb3efddd743e57679284a1505a7a5945d3bab
6
+ metadata.gz: d43248b9c86fb8da996fa874a8ae3202ce9b033cc0d38c015ece9d55b65576a5e55a778929a8afa950feb3db4a51d2637dbbbf2ca6a1de5c460f433ccb1356a5
7
+ data.tar.gz: c5364539ff2f5701f01feff1d931bef49e27abeeeba5cbfc959c1d932a91e3e8276482cd7b7ed7b64d2f3c8ad83d2f174d91c830e462d840657ea670e490f89a
data/README.md CHANGED
@@ -62,6 +62,10 @@ page.meta['description'] # meta description
62
62
  page.meta['keywords'] # meta keywords
63
63
  ```
64
64
 
65
+ ## Contributors
66
+
67
+ * Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
68
+ * Sam Nissen ([@samnissen](https://github.com/samnissen))
65
69
 
66
70
  ## License
67
71
  The webinspector GEM is released under the MIT License.
@@ -25,22 +25,99 @@ module WebInspector
25
25
  end
26
26
 
27
27
  def links
28
- links = []
29
- @page.css("a").each do |a|
30
- links.push((a[:href].to_s.start_with? @url.to_s) ? a[:href] : URI.join(@url, a[:href]).to_s) if (a and a[:href])
28
+ get_new_links unless @links
29
+ return @links
30
+ end
31
+
32
+ def domain_links(user_domain, host)
33
+ @host ||= host
34
+
35
+ validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
36
+ raise "Invalid domain provided" unless validated_domain_uri
37
+
38
+ domain = validated_domain_uri.domain
39
+
40
+ domain_links = []
41
+
42
+ links.each do |l|
43
+
44
+ u = validate_url_domain(l)
45
+ next unless u && u.domain
46
+
47
+ domain_links.push(l) if domain == u.domain.downcase
48
+ end
49
+
50
+ return domain_links.compact
51
+ end
52
+
53
+ def domain_images(user_domain, host)
54
+ @host ||= host
55
+
56
+ validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
57
+ raise "Invalid domain provided" unless validated_domain_uri
58
+
59
+ domain = validated_domain_uri.domain
60
+
61
+ domain_images = []
62
+
63
+ images.each do |img|
64
+ u = validate_url_domain(img)
65
+ next unless u && u.domain
66
+
67
+ domain_images.push(img) if u.domain.downcase.end_with?(domain)
68
+ end
69
+
70
+ return domain_images.compact
71
+ end
72
+
73
+ # Normalize and validate the URLs on the page for comparison
74
+ def validate_url_domain(u)
75
+ # Enforce a few bare standards before proceeding
76
+ u = "#{u}"
77
+ u = "/" if u.empty?
78
+
79
+ begin
80
+ # Look for evidence of a host. If this is a relative link
81
+ # like '/contact', add the page host.
82
+ domained_url = @host + u unless (u.split("/").first || "").match(/(\:|\.)/)
83
+ domained_url ||= u
84
+
85
+ # http the URL if it is missing
86
+ httpped_url = "http://" + domained_url unless domained_url[0..3] == 'http'
87
+ httpped_url ||= domained_url
88
+
89
+ # Make sure the URL parses
90
+ uri = URI.parse(httpped_url)
91
+
92
+ # Make sure the URL passes ICANN rules.
93
+ # The PublicSuffix object splits the domain and subdomain
94
+ # (unlike URI), which allows more liberal URL matching.
95
+ return PublicSuffix.parse(uri.host)
96
+ rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
97
+ return false
31
98
  end
32
- return links
33
99
  end
34
100
 
35
101
  def images
36
- images = []
37
- @page.css("img").each do |img|
38
- images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
39
- end
40
- return images
102
+ get_new_images unless @images
103
+ return @images
41
104
  end
42
105
 
43
106
  private
107
+
108
+ def get_new_images
109
+ @images = []
110
+ @page.css("img").each do |img|
111
+ @images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
112
+ end
113
+ end
114
+
115
+ def get_new_links
116
+ @links = []
117
+ @page.css("a").each do |a|
118
+ @links.push((a[:href].to_s.start_with? @url.to_s) ? a[:href] : URI.join(@url, a[:href]).to_s) if (a and a[:href])
119
+ end
120
+ end
44
121
 
45
122
  def snippet
46
123
  first_long_paragraph = @page.search('//p[string-length() >= 120]').first
@@ -3,13 +3,14 @@ require 'uri'
3
3
  require 'open-uri'
4
4
  require 'open_uri_redirections'
5
5
  require 'faraday'
6
+ require 'public_suffix'
6
7
 
7
8
  require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
8
9
  require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
9
10
 
10
11
  module WebInspector
11
12
  class Page
12
- attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :images, :response
13
+ attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :domain_links, :domain_images, :images, :response
13
14
 
14
15
  def initialize(url, options = {})
15
16
  @url = url
@@ -50,6 +51,10 @@ module WebInspector
50
51
  @request.host
51
52
  end
52
53
 
54
+ def domain
55
+ @request.domain
56
+ end
57
+
53
58
  def scheme
54
59
  @request.scheme
55
60
  end
@@ -58,6 +63,14 @@ module WebInspector
58
63
  @request.port
59
64
  end
60
65
 
66
+ def domain_links(u = domain)
67
+ @inspector.domain_links(u, host)
68
+ end
69
+
70
+ def domain_images(u = domain)
71
+ @inspector.domain_images(u, host)
72
+ end
73
+
61
74
  def to_hash
62
75
  {
63
76
  'url' => url,
@@ -13,6 +13,10 @@ module WebInspector
13
13
  def host
14
14
  uri.host
15
15
  end
16
+
17
+ def domain
18
+ suffix_domain
19
+ end
16
20
 
17
21
  def scheme
18
22
  uri.scheme
@@ -23,7 +27,17 @@ module WebInspector
23
27
  end
24
28
 
25
29
  private
26
-
30
+
31
+ def suffix_domain
32
+ return @domain if @domain
33
+
34
+ begin
35
+ @domain = PublicSuffix.parse(host).domain
36
+ rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
37
+ @domain = ''
38
+ end
39
+ end
40
+
27
41
  def uri
28
42
  Addressable::URI.parse(@url)
29
43
  rescue Addressable::URI::InvalidURIError => e
@@ -1,3 +1,3 @@
1
1
  module WebInspector
2
- VERSION = "0.3.1"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -34,4 +34,5 @@ Gem::Specification.new do |spec|
34
34
  spec.add_dependency "nokogiri"
35
35
  spec.add_dependency "open_uri_redirections"
36
36
  spec.add_dependency "openurl"
37
+ spec.add_dependency "public_suffix"
37
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webinspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Santangelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-06-04 00:00:00.000000000 Z
11
+ date: 2015-06-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -164,6 +164,20 @@ dependencies:
164
164
  - - ">="
165
165
  - !ruby/object:Gem::Version
166
166
  version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: public_suffix
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
167
181
  description: Ruby gem to inspect completely a web page. It scrapes a given URL, and
168
182
  returns you its meta, links, images and more.
169
183
  email: