webinspector 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/lib/web_inspector/inspector.rb +86 -9
- data/lib/web_inspector/page.rb +14 -1
- data/lib/web_inspector/request.rb +15 -1
- data/lib/web_inspector/version.rb +1 -1
- data/webinspector.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c9168c8258b2cc38cad1e30e12d1f42c07d2e0ce
|
4
|
+
data.tar.gz: 302adea791b1d4a4afd03a3fa36e5220244a9896
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d43248b9c86fb8da996fa874a8ae3202ce9b033cc0d38c015ece9d55b65576a5e55a778929a8afa950feb3db4a51d2637dbbbf2ca6a1de5c460f433ccb1356a5
|
7
|
+
data.tar.gz: c5364539ff2f5701f01feff1d931bef49e27abeeeba5cbfc959c1d932a91e3e8276482cd7b7ed7b64d2f3c8ad83d2f174d91c830e462d840657ea670e490f89a
|
data/README.md
CHANGED
@@ -62,6 +62,10 @@ page.meta['description'] # meta description
|
|
62
62
|
page.meta['keywords'] # meta keywords
|
63
63
|
```
|
64
64
|
|
65
|
+
## Contributors
|
66
|
+
|
67
|
+
* Steven Shelby ([@stevenshelby](https://github.com/stevenshelby))
|
68
|
+
* Sam Nissen ([@samnissen](https://github.com/samnissen))
|
65
69
|
|
66
70
|
## License
|
67
71
|
The webinspector GEM is released under the MIT License.
|
data/lib/web_inspector/inspector.rb
CHANGED
@@ -25,22 +25,99 @@ module WebInspector
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def links
|
28
|
-
|
29
|
-
@page.css("a").each do |a|
|
30
|
-
|
28
|
+
get_new_links unless @links
|
29
|
+
return @links
|
30
|
+
end
|
31
|
+
|
32
|
+
def domain_links(user_domain, host)
|
33
|
+
@host ||= host
|
34
|
+
|
35
|
+
validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
|
36
|
+
raise "Invalid domain provided" unless validated_domain_uri
|
37
|
+
|
38
|
+
domain = validated_domain_uri.domain
|
39
|
+
|
40
|
+
domain_links = []
|
41
|
+
|
42
|
+
links.each do |l|
|
43
|
+
|
44
|
+
u = validate_url_domain(l)
|
45
|
+
next unless u && u.domain
|
46
|
+
|
47
|
+
domain_links.push(l) if domain == u.domain.downcase
|
48
|
+
end
|
49
|
+
|
50
|
+
return domain_links.compact
|
51
|
+
end
|
52
|
+
|
53
|
+
def domain_images(user_domain, host)
|
54
|
+
@host ||= host
|
55
|
+
|
56
|
+
validated_domain_uri = validate_url_domain("http://#{user_domain.downcase.gsub(/\s+/, '')}")
|
57
|
+
raise "Invalid domain provided" unless validated_domain_uri
|
58
|
+
|
59
|
+
domain = validated_domain_uri.domain
|
60
|
+
|
61
|
+
domain_images = []
|
62
|
+
|
63
|
+
images.each do |img|
|
64
|
+
u = validate_url_domain(img)
|
65
|
+
next unless u && u.domain
|
66
|
+
|
67
|
+
domain_images.push(img) if u.domain.downcase.end_with?(domain)
|
68
|
+
end
|
69
|
+
|
70
|
+
return domain_images.compact
|
71
|
+
end
|
72
|
+
|
73
|
+
# Normalize and validate the URLs on the page for comparison
|
74
|
+
def validate_url_domain(u)
|
75
|
+
# Enforce a few bare standards before proceeding
|
76
|
+
u = "#{u}"
|
77
|
+
u = "/" if u.empty?
|
78
|
+
|
79
|
+
begin
|
80
|
+
# Look for evidence of a host. If this is a relative link
|
81
|
+
# like '/contact', add the page host.
|
82
|
+
domained_url = @host + u unless (u.split("/").first || "").match(/(\:|\.)/)
|
83
|
+
domained_url ||= u
|
84
|
+
|
85
|
+
# http the URL if it is missing
|
86
|
+
httpped_url = "http://" + domained_url unless domained_url[0..3] == 'http'
|
87
|
+
httpped_url ||= domained_url
|
88
|
+
|
89
|
+
# Make sure the URL parses
|
90
|
+
uri = URI.parse(httpped_url)
|
91
|
+
|
92
|
+
# Make sure the URL passes ICANN rules.
|
93
|
+
# The PublicSuffix object splits the domain and subdomain
|
94
|
+
# (unlike URI), which allows more liberal URL matching.
|
95
|
+
return PublicSuffix.parse(uri.host)
|
96
|
+
rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
|
97
|
+
return false
|
31
98
|
end
|
32
|
-
return links
|
33
99
|
end
|
34
100
|
|
35
101
|
def images
|
36
|
-
|
37
|
-
@page.css("img").each do |img|
|
38
|
-
images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
|
39
|
-
end
|
40
|
-
return images
|
102
|
+
get_new_images unless @images
|
103
|
+
return @images
|
41
104
|
end
|
42
105
|
|
43
106
|
private
|
107
|
+
|
108
|
+
def get_new_images
|
109
|
+
@images = []
|
110
|
+
@page.css("img").each do |img|
|
111
|
+
@images.push((img[:src].to_s.start_with? @url.to_s) ? img[:src] : URI.join(url, img[:src]).to_s) if (img and img[:src])
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
def get_new_links
|
116
|
+
@links = []
|
117
|
+
@page.css("a").each do |a|
|
118
|
+
@links.push((a[:href].to_s.start_with? @url.to_s) ? a[:href] : URI.join(@url, a[:href]).to_s) if (a and a[:href])
|
119
|
+
end
|
120
|
+
end
|
44
121
|
|
45
122
|
def snippet
|
46
123
|
first_long_paragraph = @page.search('//p[string-length() >= 120]').first
|
data/lib/web_inspector/page.rb
CHANGED
@@ -3,13 +3,14 @@ require 'uri'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'open_uri_redirections'
|
5
5
|
require 'faraday'
|
6
|
+
require 'public_suffix'
|
6
7
|
|
7
8
|
require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
|
8
9
|
require File.expand_path(File.join(File.dirname(__FILE__), 'request'))
|
9
10
|
|
10
11
|
module WebInspector
|
11
12
|
class Page
|
12
|
-
attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :images, :response
|
13
|
+
attr_reader :url, :scheme, :host, :port, :title, :description, :body, :meta, :links, :domain_links, :domain_images, :images, :response
|
13
14
|
|
14
15
|
def initialize(url, options = {})
|
15
16
|
@url = url
|
@@ -50,6 +51,10 @@ module WebInspector
|
|
50
51
|
@request.host
|
51
52
|
end
|
52
53
|
|
54
|
+
def domain
|
55
|
+
@request.domain
|
56
|
+
end
|
57
|
+
|
53
58
|
def scheme
|
54
59
|
@request.scheme
|
55
60
|
end
|
@@ -58,6 +63,14 @@ module WebInspector
|
|
58
63
|
@request.port
|
59
64
|
end
|
60
65
|
|
66
|
+
def domain_links(u = domain)
|
67
|
+
@inspector.domain_links(u, host)
|
68
|
+
end
|
69
|
+
|
70
|
+
def domain_images(u = domain)
|
71
|
+
@inspector.domain_images(u, host)
|
72
|
+
end
|
73
|
+
|
61
74
|
def to_hash
|
62
75
|
{
|
63
76
|
'url' => url,
|
data/lib/web_inspector/request.rb
CHANGED
@@ -13,6 +13,10 @@ module WebInspector
|
|
13
13
|
def host
|
14
14
|
uri.host
|
15
15
|
end
|
16
|
+
|
17
|
+
def domain
|
18
|
+
suffix_domain
|
19
|
+
end
|
16
20
|
|
17
21
|
def scheme
|
18
22
|
uri.scheme
|
@@ -23,7 +27,17 @@ module WebInspector
|
|
23
27
|
end
|
24
28
|
|
25
29
|
private
|
26
|
-
|
30
|
+
|
31
|
+
def suffix_domain
|
32
|
+
return @domain if @domain
|
33
|
+
|
34
|
+
begin
|
35
|
+
@domain = PublicSuffix.parse(host).domain
|
36
|
+
rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
|
37
|
+
@domain = ''
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
27
41
|
def uri
|
28
42
|
Addressable::URI.parse(@url)
|
29
43
|
rescue Addressable::URI::InvalidURIError => e
|
data/webinspector.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webinspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Davide Santangelo
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -164,6 +164,20 @@ dependencies:
|
|
164
164
|
- - ">="
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: public_suffix
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '0'
|
167
181
|
description: Ruby gem to inspect completely a web page. It scrapes a given URL, and
|
168
182
|
returns you its meta, links, images and more.
|
169
183
|
email:
|