metainspector 1.16.1 → 1.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +17 -11
- data/lib/meta_inspector.rb +10 -3
- data/lib/meta_inspector/deprecations.rb +19 -0
- data/lib/meta_inspector/document.rb +81 -0
- data/lib/meta_inspector/exception_log.rb +29 -0
- data/lib/meta_inspector/exceptionable.rb +11 -0
- data/lib/meta_inspector/parser.rb +178 -0
- data/lib/meta_inspector/request.rb +55 -0
- data/lib/meta_inspector/url.rb +76 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +97 -0
- data/spec/exception_log_spec.rb +59 -0
- data/spec/meta_inspector_spec.rb +9 -0
- data/spec/parser_spec.rb +374 -0
- data/spec/redirections_spec.rb +20 -3
- data/spec/request_spec.rb +64 -0
- data/spec/url_spec.rb +74 -0
- metadata +18 -7
- data/lib/meta_inspector/scraper.rb +0 -283
- data/spec/metainspector_spec.rb +0 -547
@@ -0,0 +1,76 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'addressable/uri'
|
4
|
+
|
5
|
+
module MetaInspector
|
6
|
+
class URL
|
7
|
+
attr_reader :url
|
8
|
+
|
9
|
+
include MetaInspector::Exceptionable
|
10
|
+
|
11
|
+
# Builds a URL object for initial_url.
#
# initial_url - the url to wrap; it is assigned through the url= writer
# options     - Hash merged over #defaults; its :exception_log entry is
#               stored in @exception_log
def initialize(initial_url, options = {})
  settings = defaults.merge(options)

  @exception_log = settings[:exception_log]
  self.url = initial_url
end
|
17
|
+
|
18
|
+
# Returns the scheme of the current url, or nil when #parsed could not parse it.
def scheme
  uri = parsed(url) # parse once and reuse the result for the check and the read
  uri ? uri.scheme : nil
end
|
21
|
+
|
22
|
+
# Returns the host of the current url, or nil when #parsed could not parse it.
def host
  uri = parsed(url) # parse once and reuse the result for the check and the read
  uri ? uri.host : nil
end
|
25
|
+
|
26
|
+
# Returns the root of the current url: its scheme, "://", its host and a
# trailing slash. A nil scheme or host contributes an empty string.
def root_url
  [scheme, '://', host, '/'].join
end
|
29
|
+
|
30
|
+
# Stores new_url in @url after passing it through #with_default_scheme and
# then #normalized.
def url=(new_url)
  with_scheme = with_default_scheme(new_url)
  @url = normalized(with_scheme)
end
|
33
|
+
|
34
|
+
# Converts a protocol-relative url to its full form, depending on the scheme of the page that contains it.
#
# url    - the url to convert, e.g. "//example.com/logo.png"
# scheme - the scheme of the containing page, e.g. "https"
#
# Returns "<scheme>://<rest>" when url starts with "//"; any other value is
# returned unchanged.
def self.unrelativize(url, scheme)
  # \A anchors at the start of the whole string. ^ would also match a "//"
  # that merely starts a later line, and url[2..-1] would then strip two
  # unrelated leading characters.
  url =~ %r{\A//} ? "#{scheme}://#{url[2..-1]}" : url
end
|
38
|
+
|
39
|
+
# Converts a relative url like "/users" to an absolute one like "http://example.com/users",
# respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
#
# url           - the (possibly relative) url to convert
# base_url      - the url a relative one is joined to
# exception_log - optional object responding to <<; when given, it receives
#                 the error raised for a url that cannot be processed
#
# Returns the absolute url as a String, or nil when a URI::InvalidURIError or
# Addressable::URI::InvalidURIError is raised while processing it.
def self.absolutify(url, base_url, exception_log = nil)
  # \A (rather than ^) so only a scheme at the very start of the string counts.
  if url =~ /\A\w*:/i
    MetaInspector::URL.new(url).url
  else
    Addressable::URI.join(base_url, url).normalize.to_s
  end
rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
  # This is a class-level method: @exception_log here would be a variable of
  # the class object itself, which nothing in this class assigns, so using it
  # raised NoMethodError on nil. Log only when a log was handed in.
  exception_log << e if exception_log
  nil
end
|
51
|
+
|
52
|
+
private
|
53
|
+
|
54
|
+
# Default options: a new MetaInspector::ExceptionLog under :exception_log.
def defaults
  fresh_log = MetaInspector::ExceptionLog.new
  { :exception_log => fresh_log }
end
|
57
|
+
|
58
|
+
# Adds 'http' as default scheme, if there is none.
#
# Returns url prefixed with 'http://' when it parses but has no scheme;
# otherwise (a scheme is present, or #parsed returned nil) url is returned
# untouched.
def with_default_scheme(url)
  uri = parsed(url) # parse once instead of once per use
  uri && uri.scheme.nil? ? 'http://' + url : url
end
|
62
|
+
|
63
|
+
# Normalize url to deal with characters that should be encoded, add trailing slash, convert to downcase...
# The work is delegated to Addressable::URI#normalize; the result is returned as a String.
def normalized(url)
  uri = Addressable::URI.parse(url)
  uri.normalize.to_s
end
|
67
|
+
|
68
|
+
# Parses url with URI.parse.
#
# Returns the parsed URI. When URI::InvalidURIError or
# URI::InvalidComponentError is raised, the error is appended to
# @exception_log and nil is returned instead.
def parsed(url)
  begin
    URI.parse(url)
  rescue URI::InvalidURIError, URI::InvalidComponentError => error
    @exception_log << error
    nil
  end
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
+
|
5
|
+
describe MetaInspector::Document do
|
6
|
+
describe 'passing the contents of the document as html' do
|
7
|
+
before(:each) do
|
8
|
+
@m = MetaInspector::Document.new('http://cnn.com/', :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should get correct links when the url html is passed as an option" do
|
12
|
+
@m.links.should == ["http://cnn.com/hello"]
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should get the title" do
|
16
|
+
@m.title.should == "Hello From Passed Html"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should return a String as to_s" do
|
21
|
+
MetaInspector::Document.new('http://pagerankalert.com').to_s.class.should == String
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should return a Hash with all the values set" do
|
25
|
+
@m = MetaInspector::Document.new('http://pagerankalert.com')
|
26
|
+
@m.to_hash.should == {
|
27
|
+
"url" =>"http://pagerankalert.com/",
|
28
|
+
"title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
|
29
|
+
"links" => ["http://pagerankalert.com/",
|
30
|
+
"http://pagerankalert.com/es?language=es",
|
31
|
+
"http://pagerankalert.com/users/sign_up",
|
32
|
+
"http://pagerankalert.com/users/sign_in",
|
33
|
+
"mailto:pagerankalert@gmail.com",
|
34
|
+
"http://pagerankalert.posterous.com/",
|
35
|
+
"http://twitter.com/pagerankalert",
|
36
|
+
"http://twitter.com/share"],
|
37
|
+
"internal_links" => ["http://pagerankalert.com/",
|
38
|
+
"http://pagerankalert.com/es?language=es",
|
39
|
+
"http://pagerankalert.com/users/sign_up",
|
40
|
+
"http://pagerankalert.com/users/sign_in"],
|
41
|
+
"external_links" => ["mailto:pagerankalert@gmail.com",
|
42
|
+
"http://pagerankalert.posterous.com/",
|
43
|
+
"http://twitter.com/pagerankalert",
|
44
|
+
"http://twitter.com/share"],
|
45
|
+
"images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
|
46
|
+
"charset" => "utf-8",
|
47
|
+
"feed" => "http://feeds.feedburner.com/PageRankAlert",
|
48
|
+
"content_type" =>"text/html",
|
49
|
+
"meta" => {
|
50
|
+
"name" => {
|
51
|
+
"description"=> "Track your PageRank(TM) changes and receive alerts by email",
|
52
|
+
"keywords" => "pagerank, seo, optimization, google",
|
53
|
+
"robots" => "all,follow",
|
54
|
+
"csrf_param" => "authenticity_token",
|
55
|
+
"csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="
|
56
|
+
},
|
57
|
+
"property"=>{}
|
58
|
+
}
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
# Examples for non-HTML responses, with and without the :html_content_only
# option. Descriptions name the option exactly as it is passed in the code.
describe 'exception handling' do
  it "should parse images when html_content_only is not specified" do
    image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png')
    image_url.description # return value unused; the call is made before checking ok?

    image_url.should be_ok
  end

  it "should parse images when html_content_only is false" do
    image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', :html_content_only => false)
    image_url.description # return value unused; the call is made before checking ok?

    image_url.should be_ok
  end

  it "should handle errors when content is image/png and html_content_only is true" do
    image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', :html_content_only => true)

    expect {
      image_url.title
    }.to change { image_url.exceptions.size }

    image_url.exceptions.first.message.should == "The url provided contains image/png content instead of text/html content"
  end

  it "should handle errors when content is not text/html and html_content_only is true" do
    tar_url = MetaInspector::Document.new('http://pagerankalert.com/file.tar.gz', :html_content_only => true)

    expect {
      tar_url.title
    }.to change { tar_url.exceptions.size }

    tar_url.exceptions.first.message.should == "The url provided contains application/x-gzip content instead of text/html content"
  end
end
|
97
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
+
|
5
|
+
describe MetaInspector::ExceptionLog do
|
6
|
+
let(:logger) { MetaInspector::ExceptionLog.new }
|
7
|
+
|
8
|
+
describe "storing exceptions" do
|
9
|
+
it "should store exceptions" do
|
10
|
+
expect {
|
11
|
+
logger << StandardError.new("an error message")
|
12
|
+
}.to change { logger.exceptions.length }.from(0).to(1)
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should return stored exceptions" do
|
16
|
+
first = StandardError.new("first message")
|
17
|
+
second = StandardError.new("second message")
|
18
|
+
|
19
|
+
logger << first
|
20
|
+
logger << second
|
21
|
+
|
22
|
+
logger.exceptions.should == [first, second]
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
describe "ok?" do
|
27
|
+
it "should be true if no exceptions stored" do
|
28
|
+
logger.should be_ok
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should be false if some exception stored" do
|
32
|
+
logger << StandardError.new("some message")
|
33
|
+
logger.should_not be_ok
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "warn_level" do
|
38
|
+
it "should be quiet by default" do
|
39
|
+
MetaInspector::ExceptionLog.new.warn_level.should be_nil
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should warn about the error if warn_level is :warn" do
|
43
|
+
verbose_logger = MetaInspector::ExceptionLog.new(warn_level: :warn)
|
44
|
+
exception = StandardError.new("an error message")
|
45
|
+
|
46
|
+
verbose_logger.should_receive(:warn).with(exception)
|
47
|
+
verbose_logger << exception
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should raise exceptions when warn_level is :raise" do
|
51
|
+
raiser_logger = MetaInspector::ExceptionLog.new(warn_level: :raise)
|
52
|
+
exception = StandardError.new("this should be raised")
|
53
|
+
|
54
|
+
expect {
|
55
|
+
raiser_logger << exception
|
56
|
+
}.to raise_exception(StandardError, "this should be raised")
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,374 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
+
|
5
|
+
describe MetaInspector::Parser do
|
6
|
+
describe 'Doing a basic scrape' do
|
7
|
+
|
8
|
+
before(:each) do
|
9
|
+
@m = MetaInspector::Parser.new(doc 'http://pagerankalert.com')
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should get the title" do
|
13
|
+
@m.title.should == 'PageRankAlert.com :: Track your PageRank changes & receive alerts'
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should not find an image" do
|
17
|
+
@m.image.should == nil
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "get image" do
|
21
|
+
it "should find the og image" do
|
22
|
+
@m = MetaInspector::Parser.new(doc 'http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
23
|
+
@m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
24
|
+
@m.meta_og_image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should find image on youtube" do
|
28
|
+
MetaInspector::Parser.new(doc 'http://www.youtube.com/watch?v=iaGSSrp49uc').image.should == "http://i2.ytimg.com/vi/iaGSSrp49uc/mqdefault.jpg"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe "get images" do
|
33
|
+
it "should find all page images" do
|
34
|
+
@m.images.should == ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"]
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should find images on twitter" do
|
38
|
+
m = MetaInspector::Parser.new(doc 'https://twitter.com/markupvalidator')
|
39
|
+
m.images.length.should == 6
|
40
|
+
m.images.join("; ").should == "https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png; https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png; https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png; https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg; https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png; https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should ignore malformed image tags" do
|
45
|
+
# There is an image tag without a source. The scraper should not fatal.
|
46
|
+
@m = MetaInspector::Parser.new(doc "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups")
|
47
|
+
@m.images.size.should == 11
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should have a Nokogiri::HTML::Document as parsed" do
|
51
|
+
@m.parsed.class.should == Nokogiri::HTML::Document
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should return the document as a string" do
|
55
|
+
@m.to_s.class.should == String
|
56
|
+
end
|
57
|
+
|
58
|
+
describe "Feed" do
|
59
|
+
it "should get rss feed" do
|
60
|
+
@m = MetaInspector::Parser.new(doc 'http://www.iteh.at')
|
61
|
+
@m.feed.should == 'http://www.iteh.at/de/rss/'
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should get atom feed" do
|
65
|
+
@m = MetaInspector::Parser.new(doc 'http://www.tea-tron.com/jbravo/blog/')
|
66
|
+
@m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
|
67
|
+
end
|
68
|
+
|
69
|
+
it "should return nil if no feed found" do
|
70
|
+
@m = MetaInspector::Parser.new(doc 'http://www.alazan.com')
|
71
|
+
@m.feed.should == nil
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
describe "get description" do
|
76
|
+
it "should find description on youtube" do
|
77
|
+
MetaInspector::Parser.new(doc 'http://www.youtube.com/watch?v=iaGSSrp49uc').description.should == ""
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
describe 'Page with missing meta description' do
|
83
|
+
it "should find a secondary description" do
|
84
|
+
@m = MetaInspector::Parser.new(doc 'http://theonion-no-description.com')
|
85
|
+
@m.description.should == "SAN FRANCISCO—In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday, an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe 'Links' do
|
90
|
+
before(:each) do
|
91
|
+
@m = MetaInspector::Parser.new(doc 'http://pagerankalert.com')
|
92
|
+
end
|
93
|
+
|
94
|
+
it "should get the links" do
|
95
|
+
@m.links.should == [ "http://pagerankalert.com/",
|
96
|
+
"http://pagerankalert.com/es?language=es",
|
97
|
+
"http://pagerankalert.com/users/sign_up",
|
98
|
+
"http://pagerankalert.com/users/sign_in",
|
99
|
+
"mailto:pagerankalert@gmail.com",
|
100
|
+
"http://pagerankalert.posterous.com/",
|
101
|
+
"http://twitter.com/pagerankalert",
|
102
|
+
"http://twitter.com/share" ]
|
103
|
+
end
|
104
|
+
|
105
|
+
it "should get correct absolute links for internal pages" do
|
106
|
+
@m.internal_links.should == [ "http://pagerankalert.com/",
|
107
|
+
"http://pagerankalert.com/es?language=es",
|
108
|
+
"http://pagerankalert.com/users/sign_up",
|
109
|
+
"http://pagerankalert.com/users/sign_in" ]
|
110
|
+
end
|
111
|
+
|
112
|
+
it "should get correct absolute links for external pages" do
|
113
|
+
@m.external_links.should == [ "mailto:pagerankalert@gmail.com",
|
114
|
+
"http://pagerankalert.posterous.com/",
|
115
|
+
"http://twitter.com/pagerankalert",
|
116
|
+
"http://twitter.com/share" ]
|
117
|
+
end
|
118
|
+
|
119
|
+
it "should get correct absolute links, correcting relative links from URL not ending with slash" do
|
120
|
+
m = MetaInspector::Parser.new(doc 'http://alazan.com/websolution.asp')
|
121
|
+
m.links.should == [ "http://alazan.com/index.asp",
|
122
|
+
"http://alazan.com/faqs.asp" ]
|
123
|
+
end
|
124
|
+
|
125
|
+
it "should return empty array if no links found" do
|
126
|
+
m = MetaInspector::Parser.new(doc 'http://example.com/empty')
|
127
|
+
m.links.should == []
|
128
|
+
end
|
129
|
+
|
130
|
+
describe "links with international characters" do
|
131
|
+
it "should get correct absolute links, encoding the URLs as needed" do
|
132
|
+
m = MetaInspector::Parser.new(doc 'http://international.com')
|
133
|
+
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
134
|
+
"http://international.com/roman%C3%A9e",
|
135
|
+
"http://international.com/faqs#cami%C3%B3n",
|
136
|
+
"http://international.com/search?q=cami%C3%B3n",
|
137
|
+
"http://international.com/search?q=espa%C3%B1a#top",
|
138
|
+
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
|
139
|
+
"http://example.com/espa%C3%B1a.asp",
|
140
|
+
"http://example.com/roman%C3%A9e",
|
141
|
+
"http://example.com/faqs#cami%C3%B3n",
|
142
|
+
"http://example.com/search?q=cami%C3%B3n",
|
143
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
144
|
+
end
|
145
|
+
|
146
|
+
describe "internal links" do
|
147
|
+
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
148
|
+
m = MetaInspector::Parser.new(doc 'http://international.com')
|
149
|
+
m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
|
150
|
+
"http://international.com/roman%C3%A9e",
|
151
|
+
"http://international.com/faqs#cami%C3%B3n",
|
152
|
+
"http://international.com/search?q=cami%C3%B3n",
|
153
|
+
"http://international.com/search?q=espa%C3%B1a#top",
|
154
|
+
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
|
155
|
+
end
|
156
|
+
|
157
|
+
it "should not crash when processing malformed hrefs" do
|
158
|
+
m = MetaInspector::Parser.new(doc 'http://example.com/malformed_href')
|
159
|
+
expect {
|
160
|
+
m.internal_links.should == [ "http://example.com/faqs" ]
|
161
|
+
m.should be_ok
|
162
|
+
}.to_not raise_error
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
describe "external links" do
|
167
|
+
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
168
|
+
m = MetaInspector::Parser.new(doc 'http://international.com')
|
169
|
+
m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
|
170
|
+
"http://example.com/roman%C3%A9e",
|
171
|
+
"http://example.com/faqs#cami%C3%B3n",
|
172
|
+
"http://example.com/search?q=cami%C3%B3n",
|
173
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
174
|
+
end
|
175
|
+
|
176
|
+
it "should not crash when processing malformed hrefs" do
|
177
|
+
m = MetaInspector::Parser.new(doc 'http://example.com/malformed_href')
|
178
|
+
expect {
|
179
|
+
m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
|
180
|
+
"javascript:alert('ok');", "javascript://", "mailto:email(at)example.com"]
|
181
|
+
m.should be_ok
|
182
|
+
}.to_not raise_error
|
183
|
+
end
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
it "should not crash with links that have weird href values" do
|
188
|
+
m = MetaInspector::Parser.new(doc 'http://example.com/invalid_href')
|
189
|
+
m.links.should == ["%3Cp%3Eftp://ftp.cdrom.com", "skype:joeuser?call", "telnet://telnet.cdrom.com"]
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
describe 'Relative links' do
|
194
|
+
describe 'From a root URL' do
|
195
|
+
before(:each) do
|
196
|
+
@m = MetaInspector::Parser.new(doc 'http://relative.com/')
|
197
|
+
end
|
198
|
+
|
199
|
+
it 'should get the relative links' do
|
200
|
+
@m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
describe 'From a document' do
|
205
|
+
before(:each) do
|
206
|
+
@m = MetaInspector::Parser.new(doc 'http://relative.com/company')
|
207
|
+
end
|
208
|
+
|
209
|
+
it 'should get the relative links' do
|
210
|
+
@m.internal_links.should == ['http://relative.com/about', 'http://relative.com/sitemap']
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
describe 'From a directory' do
|
215
|
+
before(:each) do
|
216
|
+
@m = MetaInspector::Parser.new(doc 'http://relative.com/company/')
|
217
|
+
end
|
218
|
+
|
219
|
+
it 'should get the relative links' do
|
220
|
+
@m.internal_links.should == ['http://relative.com/company/about', 'http://relative.com/sitemap']
|
221
|
+
end
|
222
|
+
end
|
223
|
+
end
|
224
|
+
|
225
|
+
describe 'Relative links with base' do
|
226
|
+
it 'should get the relative links from a document' do
|
227
|
+
m = MetaInspector::Parser.new(doc 'http://relativewithbase.com/company/page2')
|
228
|
+
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
229
|
+
end
|
230
|
+
|
231
|
+
it 'should get the relative links from a directory' do
|
232
|
+
m = MetaInspector::Parser.new(doc 'http://relativewithbase.com/company/page2/')
|
233
|
+
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
describe 'Non-HTTP links' do
|
238
|
+
before(:each) do
|
239
|
+
@m = MetaInspector::Parser.new(doc 'http://example.com/nonhttp')
|
240
|
+
end
|
241
|
+
|
242
|
+
it "should get the links" do
|
243
|
+
@m.links.sort.should == [
|
244
|
+
"ftp://ftp.cdrom.com/",
|
245
|
+
"javascript:alert('hey');",
|
246
|
+
"mailto:user@example.com",
|
247
|
+
"skype:joeuser?call",
|
248
|
+
"telnet://telnet.cdrom.com"
|
249
|
+
]
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
describe 'Protocol-relative URLs' do
|
254
|
+
before(:each) do
|
255
|
+
@m_http = MetaInspector::Parser.new(doc 'http://protocol-relative.com')
|
256
|
+
@m_https = MetaInspector::Parser.new(doc 'https://protocol-relative.com')
|
257
|
+
end
|
258
|
+
|
259
|
+
it "should convert protocol-relative links to http" do
|
260
|
+
@m_http.links.should include('http://protocol-relative.com/contact')
|
261
|
+
@m_http.links.should include('http://yahoo.com/')
|
262
|
+
end
|
263
|
+
|
264
|
+
it "should convert protocol-relative links to https" do
|
265
|
+
@m_https.links.should include('https://protocol-relative.com/contact')
|
266
|
+
@m_https.links.should include('https://yahoo.com/')
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
describe 'Getting meta tags by ghost methods' do
|
271
|
+
before(:each) do
|
272
|
+
@m = MetaInspector::Parser.new(doc 'http://pagerankalert.com')
|
273
|
+
end
|
274
|
+
|
275
|
+
it "should get the robots meta tag" do
|
276
|
+
@m.meta_robots.should == 'all,follow'
|
277
|
+
end
|
278
|
+
|
279
|
+
# Same expectation as the lowercase meta_robots example, reached through a
# mixed-case ghost method name; the description says so to keep the two
# examples distinguishable in the spec output.
it "should get the robots meta tag regardless of the ghost method's case" do
  @m.meta_RoBoTs.should == 'all,follow'
end
|
282
|
+
|
283
|
+
it "should get the description meta tag" do
|
284
|
+
@m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email'
|
285
|
+
end
|
286
|
+
|
287
|
+
it "should get the keywords meta tag" do
|
288
|
+
@m.meta_keywords.should == "pagerank, seo, optimization, google"
|
289
|
+
end
|
290
|
+
|
291
|
+
it "should get the content-language meta tag" do
|
292
|
+
pending "mocks"
|
293
|
+
@m.meta_content_language.should == "en"
|
294
|
+
end
|
295
|
+
|
296
|
+
it "should get the Csrf_pAram meta tag" do
|
297
|
+
@m.meta_Csrf_pAram.should == "authenticity_token"
|
298
|
+
end
|
299
|
+
|
300
|
+
it "should return nil for nonfound meta_tags" do
|
301
|
+
@m.meta_lollypop.should == nil
|
302
|
+
end
|
303
|
+
|
304
|
+
it "should get the generator meta tag" do
|
305
|
+
@m = MetaInspector::Parser.new(doc 'http://www.inkthemes.com/')
|
306
|
+
@m.meta_generator.should == 'WordPress 3.4.2'
|
307
|
+
end
|
308
|
+
|
309
|
+
it "should find a meta_og_title" do
|
310
|
+
@m = MetaInspector::Parser.new(doc 'http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
311
|
+
@m.meta_og_title.should == "Apple Claims New iPhone Only Visible To Most Loyal Of Customers"
|
312
|
+
end
|
313
|
+
|
314
|
+
it "should not find a meta_og_something" do
|
315
|
+
@m = MetaInspector::Parser.new(doc 'http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
316
|
+
@m.meta_og_something.should == nil
|
317
|
+
end
|
318
|
+
|
319
|
+
it "should find a meta_twitter_site" do
|
320
|
+
@m = MetaInspector::Parser.new(doc 'http://www.youtube.com/watch?v=iaGSSrp49uc')
|
321
|
+
@m.meta_twitter_site.should == "@youtube"
|
322
|
+
end
|
323
|
+
|
324
|
+
it "should find a meta_twitter_player_width" do
|
325
|
+
@m = MetaInspector::Parser.new(doc 'http://www.youtube.com/watch?v=iaGSSrp49uc')
|
326
|
+
@m.meta_twitter_player_width.should == "1920"
|
327
|
+
end
|
328
|
+
|
329
|
+
it "should not find a meta_twitter_dummy" do
|
330
|
+
@m = MetaInspector::Parser.new(doc 'http://www.youtube.com/watch?v=iaGSSrp49uc')
|
331
|
+
@m.meta_twitter_dummy.should == nil
|
332
|
+
end
|
333
|
+
|
334
|
+
it "should find a meta_og_video_width" do
|
335
|
+
@m = MetaInspector::Parser.new(doc 'http://www.youtube.com/watch?v=iaGSSrp49uc')
|
336
|
+
@m.meta_og_video_width.should == "1920"
|
337
|
+
end
|
338
|
+
end
|
339
|
+
|
340
|
+
describe 'Charset detection' do
|
341
|
+
it "should get the charset from <meta charset />" do
|
342
|
+
@m = MetaInspector::Parser.new(doc 'http://charset001.com')
|
343
|
+
@m.charset.should == "utf-8"
|
344
|
+
end
|
345
|
+
|
346
|
+
it "should get the charset from meta content type" do
|
347
|
+
@m = MetaInspector::Parser.new(doc 'http://charset002.com')
|
348
|
+
@m.charset.should == "windows-1252"
|
349
|
+
end
|
350
|
+
|
351
|
+
it "should get nil if no declared charset is found" do
|
352
|
+
@m = MetaInspector::Parser.new(doc 'http://charset000.com')
|
353
|
+
@m.charset.should == nil
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
describe 'to_hash' do
|
358
|
+
it "should return a hash with all the values set" do
|
359
|
+
@m = MetaInspector::Parser.new(doc 'http://pagerankalert.com')
|
360
|
+
@m.to_hash.should == { "meta" => { "name" => { "description" => "Track your PageRank(TM) changes and receive alerts by email",
|
361
|
+
"keywords" => "pagerank, seo, optimization, google",
|
362
|
+
"robots" => "all,follow",
|
363
|
+
"csrf_param" => "authenticity_token",
|
364
|
+
"csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="},
|
365
|
+
"property"=>{}}}
|
366
|
+
end
|
367
|
+
end
|
368
|
+
|
369
|
+
private
|
370
|
+
|
371
|
+
# Spec helper: returns a new MetaInspector::Document for url, passing options
# straight through. The examples hand its result to MetaInspector::Parser.new.
def doc(url, options = {})
  document_class = MetaInspector::Document
  document_class.new(url, options)
end
|
374
|
+
end
|