webinspector 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9b9b2aa7ef567e5a1c663ddfd817b254df35089
4
- data.tar.gz: 1afd5d7b87361ac1dad0b5e50f17237db5ca0471
3
+ metadata.gz: c9168c8258b2cc38cad1e30e12d1f42c07d2e0ce
4
+ data.tar.gz: 302adea791b1d4a4afd03a3fa36e5220244a9896
5
5
  SHA512:
6
- metadata.gz: 32d88f52f97682b37a3024c444b6eb31fe8c7178483bfa0c1ea51070133c16e357b3440cb223623b28159cac8d73f2f93a0b4d12b62d5ac5e116676db9cade91
7
- data.tar.gz: 1f543d568098f33c3d27b186b13f78bea1d40e4fd1b9cce939186d52d502edc4b7b6d23123a512cc276e226ac35fb3efddd743e57679284a1505a7a5945d3bab
6
+ metadata.gz: d43248b9c86fb8da996fa874a8ae3202ce9b033cc0d38c015ece9d55b65576a5e55a778929a8afa950feb3db4a51d2637dbbbf2ca6a1de5c460f433ccb1356a5
7
+ data.tar.gz: c5364539ff2f5701f01feff1d931bef49e27abeeeba5cbfc959c1d932a91e3e8276482cd7b7ed7b64d2f3c8ad83d2f174d91c830e462d840657ea670e490f89a
data/README.md CHANGED
@@ -62,6 +62,10 @@ page.meta['description'] # meta description
62
62
  page.meta['keywords'] # meta keywords
63
63
  ```
64
64
 
65
+ ## Contributors
66
+
67
+ * Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
68
+ * Sam Nissen ([@samnissen](https://github.com/samnissen))
65
69
 
66
70
  ## License
67
71
  The webinspector GEM is released under the MIT License.
@@ -25,22 +25,99 @@ module WebInspector
25
25
  end
26
26
 
27
27
  def links
28
- links = []
29
- @page.css("a").each do |a|
30
- links.push((a[:href].to_s.start_with? @url.to_s) ? a[:href] : URI.join(@url, a[:href]).to_s) if (a and a[:href])
28
+ get_new_links unless @links
29
+ return @links
30
+ end
31
+
32
+ def domain_links(user_domain, host)
33
+ @host ||= host
34
+
35
+ validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
36
+ raise "Invalid domain provided" unless validated_domain_uri
37
+
38
+ domain = validated_domain_uri.domain
39
+
40
+ domain_links = []
41
+
42
+ links.each do |l|
43
+
44
+ u = validate_url_domain(l)
45
+ next unless u && u.domain
46
+
47
+ domain_links.push(l) if domain == u.domain.downcase
48
+ end
49
+
50
+ return domain_links.compact
51
+ end
52
+
53
+ def domain_images(user_domain, host)
54
+ @host ||= host
55
+
56
+ validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
57
+ raise "Invalid domain provided" unless validated_domain_uri
58
+
59
+ domain = validated_domain_uri.domain
60
+
61
+ domain_images = []
62
+
63
+ images.each do |img|
64
+ u = validate_url_domain(img)
65
+ next unless u && u.domain
66
+
67
+ domain_images.push(img) if u.domain.downcase.end_with?(domain)
68
+ end
69
+
70
+ return domain_images.compact
71
+ end
72
+
73
+ # Normalize and validate the URLs on the page for comparison
74
+ def validate_url_domain(u)
75
+ # Enforce a few bare standards before proceeding
76
+ u = "#{u}"
77
+ u = "/" if u.empty?
78
+
79
+ begin
80
+ # Look for evidence of a host. If this is a relative link
81
+ # like '/contact', add the page host.
82
+ domained_url = @host + u unless (u.split("/").first || "").match(/(\:|\.)/)
83
+ domained_url ||= u
84
+
85
+ # http the URL if it is missing
86
+ httpped_url = "http://" + domained_url unless domained_url[0..3] == 'http'
87
+ httpped_url ||= domained_url
88
+
89
+ # Make sure the URL parses
90
+ uri = URI.parse(httpped_url)
91
+
92
+ # Make sure the URL passes ICANN rules.
93
+ # The PublicSuffix object splits the domain and subdomain
94
+ # (unlike URI), which allows more liberal URL matching.
95
+ return PublicSuffix.parse(uri.host)
96
+ rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
97
+ return false
31
98
  end
32
- return links
33
99
  end
34
100
 
35
101
  def images
36
- images = []
37
- @page.css("img").each do |img|
38
- images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
39
- end
40
- return images
102
+ get_new_images unless @images
103
+ return @images
41
104
  end
42
105
 
43
106
  private
107
+
108
+ def get_new_images
109
+ @images = []
110
+ @page.css("img").each do |img|
111
+ @images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
112
+ end
113
+ end
114
+
115
+ def get_new_links
116
+ @links = []
117
+ @page.css("a").each do |a|
118
+ @links.push((a[:href].to_s.start_with? @url.to_s) ? a[:href] : URI.join(@url, a[:href]).to_s) if (a and a[:href])
119
+ end
120
+ end
44
121
 
45
122
  def snippet
46
123
  first_long_paragraph = @page.search('//p[string-length() >= 120]').first
@@ -3,13 +3,14 @@ require 'uri'
3
3
  require 'open-uri'
4
4
  require 'open_uri_redirections'
5
5
  require 'faraday'
6
+ require 'public_suffix'
6
7
 
7
8
  require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
8
9
  require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
9
10
 
10
11
  module WebInspector
11
12
  class Page
12
- attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :images, :response
13
+ attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :domain_links, :domain_images, :images, :response
13
14
 
14
15
  def initialize(url, options = {})
15
16
  @url = url
@@ -50,6 +51,10 @@ module WebInspector
50
51
  @request.host
51
52
  end
52
53
 
54
+ def domain
55
+ @request.domain
56
+ end
57
+
53
58
  def scheme
54
59
  @request.scheme
55
60
  end
@@ -58,6 +63,14 @@ module WebInspector
58
63
  @request.port
59
64
  end
60
65
 
66
+ def domain_links(u = domain)
67
+ @inspector.domain_links(u, host)
68
+ end
69
+
70
+ def domain_images(u = domain)
71
+ @inspector.domain_images(u, host)
72
+ end
73
+
61
74
  def to_hash
62
75
  {
63
76
  'url' => url,
@@ -13,6 +13,10 @@ module WebInspector
13
13
  def host
14
14
  uri.host
15
15
  end
16
+
17
+ def domain
18
+ suffix_domain
19
+ end
16
20
 
17
21
  def scheme
18
22
  uri.scheme
@@ -23,7 +27,17 @@ module WebInspector
23
27
  end
24
28
 
25
29
  private
26
-
30
+
31
+ def suffix_domain
32
+ return @domain if @domain
33
+
34
+ begin
35
+ @domain = PublicSuffix.parse(host).domain
36
+ rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
37
+ @domain = ''
38
+ end
39
+ end
40
+
27
41
  def uri
28
42
  Addressable::URI.parse(@url)
29
43
  rescue Addressable::URI::InvalidURIError => e
@@ -1,3 +1,3 @@
1
1
  module WebInspector
2
- VERSION = "0.3.1"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -34,4 +34,5 @@ Gem::Specification.new do |spec|
34
34
  spec.add_dependency "nokogiri"
35
35
  spec.add_dependency "open_uri_redirections"
36
36
  spec.add_dependency "openurl"
37
+ spec.add_dependency "public_suffix"
37
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webinspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Santangelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-06-04 00:00:00.000000000 Z
11
+ date: 2015-06-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -164,6 +164,20 @@ dependencies:
164
164
  - - ">="
165
165
  - !ruby/object:Gem::Version
166
166
  version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: public_suffix
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
167
181
  description: Ruby gem to inspect completely a web page. It scrapes a given URL, and
168
182
  returns you its meta, links, images and more.
169
183
  email: