webinspector 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/lib/web_inspector/inspector.rb +86 -9
- data/lib/web_inspector/page.rb +14 -1
- data/lib/web_inspector/request.rb +15 -1
- data/lib/web_inspector/version.rb +1 -1
- data/webinspector.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c9168c8258b2cc38cad1e30e12d1f42c07d2e0ce
|
4
|
+
data.tar.gz: 302adea791b1d4a4afd03a3fa36e5220244a9896
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d43248b9c86fb8da996fa874a8ae3202ce9b033cc0d38c015ece9d55b65576a5e55a778929a8afa950feb3db4a51d2637dbbbf2ca6a1de5c460f433ccb1356a5
|
7
|
+
data.tar.gz: c5364539ff2f5701f01feff1d931bef49e27abeeeba5cbfc959c1d932a91e3e8276482cd7b7ed7b64d2f3c8ad83d2f174d91c830e462d840657ea670e490f89a
|
data/README.md
CHANGED
@@ -62,6 +62,10 @@ page.meta['description'] # meta description
|
|
62
62
|
page.meta['keywords'] # meta keywords
|
63
63
|
```
|
64
64
|
|
65
|
+
## Contributors
|
66
|
+
|
67
|
+
* Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
|
68
|
+
* Sam Nissen ([@samnissen](https://github.com/samnissen))
|
65
69
|
|
66
70
|
## License
|
67
71
|
The webinspector GEM is released under the MIT License.
|
data/lib/web_inspector/inspector.rb
CHANGED
@@ -25,22 +25,99 @@ module WebInspector
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def links
|
28
|
-
links = []
|
29
|
-
@page.css("a").each do |a|
|
30
|
-
|
28
|
+
get_new_links unless @links
|
29
|
+
return @links
|
30
|
+
end
|
31
|
+
|
32
|
+
def domain_links(user_domain, host)
|
33
|
+
@host ||= host
|
34
|
+
|
35
|
+
validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
|
36
|
+
raise "Invalid domain provided" unless validated_domain_uri
|
37
|
+
|
38
|
+
domain = validated_domain_uri.domain
|
39
|
+
|
40
|
+
domain_links = []
|
41
|
+
|
42
|
+
links.each do |l|
|
43
|
+
|
44
|
+
u = validate_url_domain(l)
|
45
|
+
next unless u && u.domain
|
46
|
+
|
47
|
+
domain_links.push(l) if domain == u.domain.downcase
|
48
|
+
end
|
49
|
+
|
50
|
+
return domain_links.compact
|
51
|
+
end
|
52
|
+
|
53
|
+
def domain_images(user_domain, host)
|
54
|
+
@host ||= host
|
55
|
+
|
56
|
+
validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
|
57
|
+
raise "Invalid domain provided" unless validated_domain_uri
|
58
|
+
|
59
|
+
domain = validated_domain_uri.domain
|
60
|
+
|
61
|
+
domain_images = []
|
62
|
+
|
63
|
+
images.each do |img|
|
64
|
+
u = validate_url_domain(img)
|
65
|
+
next unless u && u.domain
|
66
|
+
|
67
|
+
domain_images.push(img) if u.domain.downcase.end_with?(domain)
|
68
|
+
end
|
69
|
+
|
70
|
+
return domain_images.compact
|
71
|
+
end
|
72
|
+
|
73
|
+
# Normalize and validate the URLs on the page for comparison
|
74
|
+
def validate_url_domain(u)
|
75
|
+
# Enforce a few bare standards before proceeding
|
76
|
+
u = "#{u}"
|
77
|
+
u = "/" if u.empty?
|
78
|
+
|
79
|
+
begin
|
80
|
+
# Look for evidence of a host. If this is a relative link
|
81
|
+
# like '/contact', add the page host.
|
82
|
+
domained_url = @host + u unless (u.split("/").first || "").match(/(\:|\.)/)
|
83
|
+
domained_url ||= u
|
84
|
+
|
85
|
+
# http the URL if it is missing
|
86
|
+
httpped_url = "http://" + domained_url unless domained_url[0..3] == 'http'
|
87
|
+
httpped_url ||= domained_url
|
88
|
+
|
89
|
+
# Make sure the URL parses
|
90
|
+
uri = URI.parse(httpped_url)
|
91
|
+
|
92
|
+
# Make sure the URL passes ICANN rules.
|
93
|
+
# The PublicSuffix object splits the domain and subdomain
|
94
|
+
# (unlike URI), which allows more liberal URL matching.
|
95
|
+
return PublicSuffix.parse(uri.host)
|
96
|
+
rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
|
97
|
+
return false
|
31
98
|
end
|
32
|
-
return links
|
33
99
|
end
|
34
100
|
|
35
101
|
def images
|
36
|
-
images = []
|
37
|
-
@page.css("img").each do |img|
|
38
|
-
images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
|
39
|
-
end
|
40
|
-
return images
|
102
|
+
get_new_images unless @images
|
103
|
+
return @images
|
41
104
|
end
|
42
105
|
|
43
106
|
private
|
107
|
+
|
108
|
+
def get_new_images
|
109
|
+
@images = []
|
110
|
+
@page.css("img").each do |img|
|
111
|
+
@images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
def get_new_links
|
116
|
+
@links = []
|
117
|
+
@page.css("a").each do |a|
|
118
|
+
@links.push((a[:href].to_s.start_with? @url.to_s) ? a[:href] : URI.join(@url, a[:href]).to_s) if (a and a[:href])
|
119
|
+
end
|
120
|
+
end
|
44
121
|
|
45
122
|
def snippet
|
46
123
|
first_long_paragraph = @page.search('//p[string-length() >= 120]').first
|
data/lib/web_inspector/page.rb
CHANGED
@@ -3,13 +3,14 @@ require 'uri'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'open_uri_redirections'
|
5
5
|
require 'faraday'
|
6
|
+
require 'public_suffix'
|
6
7
|
|
7
8
|
require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
|
8
9
|
require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
|
9
10
|
|
10
11
|
module WebInspector
|
11
12
|
class Page
|
12
|
-
attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :images, :response
|
13
|
+
attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :domain_links, :domain_images, :images, :response
|
13
14
|
|
14
15
|
def initialize(url, options = {})
|
15
16
|
@url = url
|
@@ -50,6 +51,10 @@ module WebInspector
|
|
50
51
|
@request.host
|
51
52
|
end
|
52
53
|
|
54
|
+
def domain
|
55
|
+
@request.domain
|
56
|
+
end
|
57
|
+
|
53
58
|
def scheme
|
54
59
|
@request.scheme
|
55
60
|
end
|
@@ -58,6 +63,14 @@ module WebInspector
|
|
58
63
|
@request.port
|
59
64
|
end
|
60
65
|
|
66
|
+
def domain_links(u = domain)
|
67
|
+
@inspector.domain_links(u, host)
|
68
|
+
end
|
69
|
+
|
70
|
+
def domain_images(u = domain)
|
71
|
+
@inspector.domain_images(u, host)
|
72
|
+
end
|
73
|
+
|
61
74
|
def to_hash
|
62
75
|
{
|
63
76
|
'url' => url,
|
data/lib/web_inspector/request.rb
CHANGED
@@ -13,6 +13,10 @@ module WebInspector
|
|
13
13
|
def host
|
14
14
|
uri.host
|
15
15
|
end
|
16
|
+
|
17
|
+
def domain
|
18
|
+
suffix_domain
|
19
|
+
end
|
16
20
|
|
17
21
|
def scheme
|
18
22
|
uri.scheme
|
@@ -23,7 +27,17 @@ module WebInspector
|
|
23
27
|
end
|
24
28
|
|
25
29
|
private
|
26
|
-
|
30
|
+
|
31
|
+
def suffix_domain
|
32
|
+
return @domain if @domain
|
33
|
+
|
34
|
+
begin
|
35
|
+
@domain = PublicSuffix.parse(host).domain
|
36
|
+
rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
|
37
|
+
@domain = ''
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
27
41
|
def uri
|
28
42
|
Addressable::URI.parse(@url)
|
29
43
|
rescue Addressable::URI::InvalidURIError => e
|
data/webinspector.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webinspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Davide Santangelo
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -164,6 +164,20 @@ dependencies:
|
|
164
164
|
- - ">="
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: public_suffix
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '0'
|
167
181
|
description: Ruby gem to inspect completely a web page. It scrapes a given URL, and
|
168
182
|
returns you its meta, links, images and more.
|
169
183
|
email:
|