metainspector 3.3.0 → 4.0.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Guardfile +5 -0
- data/README.md +26 -8
- data/lib/meta_inspector/document.rb +4 -8
- data/lib/meta_inspector/exception_log.rb +0 -2
- data/lib/meta_inspector/exceptionable.rb +0 -2
- data/lib/meta_inspector/parser.rb +17 -162
- data/lib/meta_inspector/parsers/base.rb +30 -0
- data/lib/meta_inspector/parsers/images.rb +45 -0
- data/lib/meta_inspector/parsers/links.rb +69 -0
- data/lib/meta_inspector/parsers/meta_tags.rb +72 -0
- data/lib/meta_inspector/parsers/texts.rb +27 -0
- data/lib/meta_inspector/request.rb +0 -2
- data/lib/meta_inspector/url.rb +0 -2
- data/lib/meta_inspector/version.rb +1 -3
- data/lib/meta_inspector.rb +5 -2
- data/lib/metainspector.rb +0 -2
- data/meta_inspector.gemspec +2 -1
- data/spec/document_spec.rb +16 -26
- data/spec/exception_log_spec.rb +1 -3
- data/spec/fixtures/example.response +17 -0
- data/spec/meta_inspector/images_spec.rb +111 -0
- data/spec/meta_inspector/links_spec.rb +203 -0
- data/spec/{meta_inspector_spec.rb → meta_inspector/meta_inspector_spec.rb} +1 -3
- data/spec/meta_inspector/meta_tags_spec.rb +108 -0
- data/spec/meta_inspector/redirections_spec.rb +48 -0
- data/spec/meta_inspector/texts_spec.rb +22 -0
- data/spec/parser_spec.rb +7 -393
- data/spec/request_spec.rb +1 -3
- data/spec/spec_helper.rb +0 -2
- data/spec/url_spec.rb +1 -3
- metadata +44 -6
- data/spec/redirections_spec.rb +0 -47
data/spec/document_spec.rb
CHANGED
@@ -1,19 +1,15 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), "/spec_helper")
|
1
|
+
require 'spec_helper'
|
4
2
|
|
5
3
|
describe MetaInspector::Document do
|
6
4
|
describe 'passing the contents of the document as html' do
|
7
|
-
|
8
|
-
@m = MetaInspector::Document.new('http://cnn.com/', :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
9
|
-
end
|
5
|
+
let(:doc) { MetaInspector::Document.new('http://cnn.com/', :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>") }
|
10
6
|
|
11
7
|
it "should get correct links when the url html is passed as an option" do
|
12
|
-
|
8
|
+
doc.links.internal.should == ["http://cnn.com/hello"]
|
13
9
|
end
|
14
10
|
|
15
11
|
it "should get the title" do
|
16
|
-
|
12
|
+
doc.title.should == "Hello From Passed Html"
|
17
13
|
end
|
18
14
|
end
|
19
15
|
|
@@ -22,27 +18,21 @@ describe MetaInspector::Document do
|
|
22
18
|
end
|
23
19
|
|
24
20
|
it "should return a Hash with all the values set" do
|
25
|
-
|
26
|
-
|
21
|
+
doc = MetaInspector::Document.new('http://pagerankalert.com')
|
22
|
+
doc.to_hash.should == {
|
27
23
|
"url" => "http://pagerankalert.com/",
|
28
24
|
"title" => "PageRankAlert.com :: Track your PageRank changes & receive alerts",
|
29
25
|
"favicon" => "http://pagerankalert.com/src/favicon.ico",
|
30
|
-
"links" =>
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
"http://pagerankalert.com/users/sign_up",
|
41
|
-
"http://pagerankalert.com/users/sign_in"],
|
42
|
-
"external_links" => ["mailto:pagerankalert@gmail.com",
|
43
|
-
"http://pagerankalert.posterous.com/",
|
44
|
-
"http://twitter.com/pagerankalert",
|
45
|
-
"http://twitter.com/share"],
|
26
|
+
"links" => {
|
27
|
+
'internal' => ["http://pagerankalert.com/",
|
28
|
+
"http://pagerankalert.com/es?language=es",
|
29
|
+
"http://pagerankalert.com/users/sign_up",
|
30
|
+
"http://pagerankalert.com/users/sign_in"],
|
31
|
+
'external' => ["http://pagerankalert.posterous.com/",
|
32
|
+
"http://twitter.com/pagerankalert",
|
33
|
+
"http://twitter.com/share"],
|
34
|
+
'non_http' => ["mailto:pagerankalert@gmail.com"]
|
35
|
+
},
|
46
36
|
"images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
|
47
37
|
"charset" => "utf-8",
|
48
38
|
"feed" => "http://feeds.feedburner.com/PageRankAlert",
|
data/spec/exception_log_spec.rb
CHANGED
data/spec/fixtures/example.response
CHANGED
@@ -22,5 +22,22 @@ Via: 1.1 varnish
|
|
22
22
|
<rect x="10" y="10" width="200" height="50" style="fill:none; stroke:blue; stroke-width:1px"/>
|
23
23
|
</g>
|
24
24
|
</svg>
|
25
|
+
|
26
|
+
<!-- Internal relative links -->
|
27
|
+
<a href="/">Root</a>
|
28
|
+
<a href="/faqs">FAQs</a>
|
29
|
+
<a href="contact">Contact</a>
|
30
|
+
|
31
|
+
<!-- Internal absolute links -->
|
32
|
+
<a href="http://example.com/team.html">Team</a>
|
33
|
+
|
34
|
+
<!-- External links -->
|
35
|
+
<a href="https://twitter.com">Twitter</a>
|
36
|
+
<a href="https://github.com">Github</a>
|
37
|
+
|
38
|
+
<!-- Non-HTTP links -->
|
39
|
+
<a href="mailto:hello@example.com">email</a>
|
40
|
+
<a href="javascript:alert('hi');">hello</a>
|
41
|
+
<a href="ftp://ftp.example.com">FTP</a>
|
25
42
|
</body>
|
26
43
|
</html>
|
data/spec/meta_inspector/images_spec.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe MetaInspector do
|
4
|
+
|
5
|
+
describe "#images" do
|
6
|
+
describe "returns an Enumerable" do
|
7
|
+
let(:page) { MetaInspector.new('https://twitter.com/markupvalidator') }
|
8
|
+
|
9
|
+
it "has a length" do
|
10
|
+
page.images.length.should == 6
|
11
|
+
end
|
12
|
+
|
13
|
+
it "has a size" do
|
14
|
+
page.images.size.should == 6
|
15
|
+
end
|
16
|
+
|
17
|
+
it "can be iterated" do
|
18
|
+
c = []
|
19
|
+
page.images.each {|i| c << i}
|
20
|
+
c.length.should == 6
|
21
|
+
end
|
22
|
+
|
23
|
+
it "can be sorted" do
|
24
|
+
page.images.sort
|
25
|
+
.should == ["https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png",
|
26
|
+
"https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif",
|
27
|
+
"https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg",
|
28
|
+
"https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png",
|
29
|
+
"https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png",
|
30
|
+
"https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png"]
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should find all page images" do
|
35
|
+
page = MetaInspector.new('http://pagerankalert.com')
|
36
|
+
|
37
|
+
page.images.to_a.should == ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"]
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should find images on twitter" do
|
41
|
+
page = MetaInspector.new('https://twitter.com/markupvalidator')
|
42
|
+
|
43
|
+
page.images.length.should == 6
|
44
|
+
page.images.to_a.should == ["https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_reasonably_small.png",
|
45
|
+
"https://twimg0-a.akamaihd.net/profile_images/2380086215/fcu46ozay5f5al9kdfvq_normal.png",
|
46
|
+
"https://twimg0-a.akamaihd.net/profile_images/2293774732/v0pgo4xpdd9rou2xq5h0_normal.png",
|
47
|
+
"https://twimg0-a.akamaihd.net/profile_images/1538528659/jaime_nov_08_normal.jpg",
|
48
|
+
"https://si0.twimg.com/sticky/default_profile_images/default_profile_6_mini.png",
|
49
|
+
"https://twimg0-a.akamaihd.net/a/1342841381/images/bigger_spinner.gif"]
|
50
|
+
end
|
51
|
+
|
52
|
+
it "should ignore malformed image tags" do
|
53
|
+
# There is an image tag without a source. The scraper should not fatal.
|
54
|
+
page = MetaInspector.new("http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups")
|
55
|
+
|
56
|
+
page.images.size.should == 11
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#image" do
|
61
|
+
it "should find the og image" do
|
62
|
+
page = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
63
|
+
|
64
|
+
page.images.best.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
65
|
+
end
|
66
|
+
|
67
|
+
it "should find image on youtube" do
|
68
|
+
page = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
69
|
+
|
70
|
+
page.images.best.should == "http://i2.ytimg.com/vi/iaGSSrp49uc/mqdefault.jpg"
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should find image when og:image and twitter:image metatags are missing" do
|
74
|
+
page = MetaInspector.new('http://www.alazan.com')
|
75
|
+
|
76
|
+
page.images.best.should == "http://www.alazan.com/imagenes/logo.jpg"
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
describe '#favicon' do
|
81
|
+
it "should get favicon link when marked as icon" do
|
82
|
+
page = MetaInspector.new('http://pagerankalert.com/')
|
83
|
+
|
84
|
+
page.images.favicon.should == 'http://pagerankalert.com/src/favicon.ico'
|
85
|
+
end
|
86
|
+
|
87
|
+
it "should get favicon link when marked as shortcut" do
|
88
|
+
page = MetaInspector.new('http://pagerankalert-shortcut.com/')
|
89
|
+
|
90
|
+
page.images.favicon.should == 'http://pagerankalert-shortcut.com/src/favicon.ico'
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should get favicon link when marked as shorcut and icon" do
|
94
|
+
page = MetaInspector.new('http://pagerankalert-shortcut-and-icon.com/')
|
95
|
+
|
96
|
+
page.images.favicon.should == 'http://pagerankalert-shortcut-and-icon.com/src/favicon.ico'
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should get favicon link when there is also a touch icon" do
|
100
|
+
page = MetaInspector.new('http://pagerankalert-touch-icon.com/')
|
101
|
+
|
102
|
+
page.images.favicon.should == 'http://pagerankalert-touch-icon.com/src/favicon.ico'
|
103
|
+
end
|
104
|
+
|
105
|
+
it "should get favicon link of nil" do
|
106
|
+
page = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
107
|
+
|
108
|
+
page.images.favicon.should == nil
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
data/spec/meta_inspector/links_spec.rb
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe MetaInspector do
|
4
|
+
let(:page) { MetaInspector.new('http://example.com') }
|
5
|
+
|
6
|
+
describe '#links' do
|
7
|
+
it 'returns the internal links' do
|
8
|
+
page.links.internal.should == [ "http://example.com/",
|
9
|
+
"http://example.com/faqs",
|
10
|
+
"http://example.com/contact",
|
11
|
+
"http://example.com/team.html" ]
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'returns the external links' do
|
15
|
+
page.links.external.should == [ "https://twitter.com/",
|
16
|
+
"https://github.com/" ]
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'returns the non-HTTP links' do
|
20
|
+
page.links.non_http.should == [ "mailto:hello@example.com",
|
21
|
+
"javascript:alert('hi');",
|
22
|
+
"ftp://ftp.example.com/" ]
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
describe 'Links' do
|
27
|
+
before(:each) do
|
28
|
+
@m = MetaInspector.new('http://pagerankalert.com')
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should get correct absolute links for internal pages" do
|
32
|
+
@m.links.internal.should == [ "http://pagerankalert.com/",
|
33
|
+
"http://pagerankalert.com/es?language=es",
|
34
|
+
"http://pagerankalert.com/users/sign_up",
|
35
|
+
"http://pagerankalert.com/users/sign_in" ]
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should get correct absolute links for external pages" do
|
39
|
+
@m.links.external.should == [ "http://pagerankalert.posterous.com/",
|
40
|
+
"http://twitter.com/pagerankalert",
|
41
|
+
"http://twitter.com/share" ]
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should get correct absolute links, correcting relative links from URL not ending with slash" do
|
45
|
+
m = MetaInspector.new('http://alazan.com/websolution.asp')
|
46
|
+
|
47
|
+
m.links.internal.should == [ "http://alazan.com/index.asp",
|
48
|
+
"http://alazan.com/faqs.asp" ]
|
49
|
+
end
|
50
|
+
|
51
|
+
describe "links with international characters" do
|
52
|
+
it "should get correct absolute links, encoding the URLs as needed" do
|
53
|
+
m = MetaInspector.new('http://international.com')
|
54
|
+
|
55
|
+
m.links.internal.should == [ "http://international.com/espa%C3%B1a.asp",
|
56
|
+
"http://international.com/roman%C3%A9e",
|
57
|
+
"http://international.com/faqs#cami%C3%B3n",
|
58
|
+
"http://international.com/search?q=cami%C3%B3n",
|
59
|
+
"http://international.com/search?q=espa%C3%B1a#top",
|
60
|
+
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
|
61
|
+
|
62
|
+
m.links.external.should == [ "http://example.com/espa%C3%B1a.asp",
|
63
|
+
"http://example.com/roman%C3%A9e",
|
64
|
+
"http://example.com/faqs#cami%C3%B3n",
|
65
|
+
"http://example.com/search?q=cami%C3%B3n",
|
66
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
67
|
+
end
|
68
|
+
|
69
|
+
describe "internal links" do
|
70
|
+
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
71
|
+
m = MetaInspector.new('http://international.com')
|
72
|
+
m.links.internal.should == [ "http://international.com/espa%C3%B1a.asp",
|
73
|
+
"http://international.com/roman%C3%A9e",
|
74
|
+
"http://international.com/faqs#cami%C3%B3n",
|
75
|
+
"http://international.com/search?q=cami%C3%B3n",
|
76
|
+
"http://international.com/search?q=espa%C3%B1a#top",
|
77
|
+
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should not crash when processing malformed hrefs" do
|
81
|
+
m = MetaInspector.new('http://example.com/malformed_href')
|
82
|
+
m.links.internal.should == [ "http://example.com/faqs" ]
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
describe "external links" do
|
87
|
+
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
88
|
+
m = MetaInspector.new('http://international.com')
|
89
|
+
m.links.external.should == [ "http://example.com/espa%C3%B1a.asp",
|
90
|
+
"http://example.com/roman%C3%A9e",
|
91
|
+
"http://example.com/faqs#cami%C3%B3n",
|
92
|
+
"http://example.com/search?q=cami%C3%B3n",
|
93
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
94
|
+
end
|
95
|
+
|
96
|
+
it "should not crash when processing malformed hrefs" do
|
97
|
+
m = MetaInspector.new('http://example.com/malformed_href')
|
98
|
+
m.links.non_http.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com", "javascript:alert('ok');",
|
99
|
+
"javascript://", "mailto:email(at)example.com"]
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
it "should not crash with links that have weird href values" do
|
105
|
+
m = MetaInspector.new('http://example.com/invalid_href')
|
106
|
+
m.links.non_http.should == ["%3Cp%3Eftp://ftp.cdrom.com", "skype:joeuser?call", "telnet://telnet.cdrom.com"]
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
describe 'Relative links' do
|
111
|
+
describe 'From a root URL' do
|
112
|
+
before(:each) do
|
113
|
+
@m = MetaInspector.new('http://relative.com/')
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'should get the relative links' do
|
117
|
+
@m.links.internal.should == ['http://relative.com/about', 'http://relative.com/sitemap']
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
describe 'From a document' do
|
122
|
+
before(:each) do
|
123
|
+
@m = MetaInspector.new('http://relative.com/company')
|
124
|
+
end
|
125
|
+
|
126
|
+
it 'should get the relative links' do
|
127
|
+
@m.links.internal.should == ['http://relative.com/about', 'http://relative.com/sitemap']
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
describe 'From a directory' do
|
132
|
+
before(:each) do
|
133
|
+
@m = MetaInspector.new('http://relative.com/company/')
|
134
|
+
end
|
135
|
+
|
136
|
+
it 'should get the relative links' do
|
137
|
+
@m.links.internal.should == ['http://relative.com/company/about', 'http://relative.com/sitemap']
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
describe 'Relative links with base' do
|
143
|
+
it 'should get the relative links from a document' do
|
144
|
+
m = MetaInspector.new('http://relativewithbase.com/company/page2')
|
145
|
+
m.links.internal.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
146
|
+
end
|
147
|
+
|
148
|
+
it 'should get the relative links from a directory' do
|
149
|
+
m = MetaInspector.new('http://relativewithbase.com/company/page2/')
|
150
|
+
m.links.internal.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
describe 'Non-HTTP links' do
|
155
|
+
before(:each) do
|
156
|
+
@m = MetaInspector.new('http://example.com/nonhttp')
|
157
|
+
end
|
158
|
+
|
159
|
+
it "should get the links" do
|
160
|
+
@m.links.non_http.sort.should == [
|
161
|
+
"ftp://ftp.cdrom.com/",
|
162
|
+
"javascript:alert('hey');",
|
163
|
+
"mailto:user@example.com",
|
164
|
+
"skype:joeuser?call",
|
165
|
+
"telnet://telnet.cdrom.com"
|
166
|
+
]
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
describe 'Protocol-relative URLs' do
|
171
|
+
before(:each) do
|
172
|
+
@m_http = MetaInspector.new('http://protocol-relative.com')
|
173
|
+
@m_https = MetaInspector.new('https://protocol-relative.com')
|
174
|
+
end
|
175
|
+
|
176
|
+
it "should convert protocol-relative links to http" do
|
177
|
+
@m_http.links.internal.should include('http://protocol-relative.com/contact')
|
178
|
+
@m_http.links.external.should include('http://yahoo.com/')
|
179
|
+
end
|
180
|
+
|
181
|
+
it "should convert protocol-relative links to https" do
|
182
|
+
@m_https.links.internal.should include('https://protocol-relative.com/contact')
|
183
|
+
@m_https.links.external.should include('https://yahoo.com/')
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
describe "Feed" do
|
188
|
+
it "should get rss feed" do
|
189
|
+
@m = MetaInspector.new('http://www.iteh.at')
|
190
|
+
@m.feed.should == 'http://www.iteh.at/de/rss/'
|
191
|
+
end
|
192
|
+
|
193
|
+
it "should get atom feed" do
|
194
|
+
@m = MetaInspector.new('http://www.tea-tron.com/jbravo/blog/')
|
195
|
+
@m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
|
196
|
+
end
|
197
|
+
|
198
|
+
it "should return nil if no feed found" do
|
199
|
+
@m = MetaInspector.new('http://www.alazan.com')
|
200
|
+
@m.feed.should == nil
|
201
|
+
end
|
202
|
+
end
|
203
|
+
end
|
data/spec/meta_inspector/meta_tags_spec.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe MetaInspector do
|
4
|
+
|
5
|
+
describe "meta tags" do
|
6
|
+
let(:page) { MetaInspector.new('http://example.com/meta-tags') }
|
7
|
+
|
8
|
+
it "#meta_tags" do
|
9
|
+
page.meta_tags.should == {
|
10
|
+
'name' => {
|
11
|
+
'keywords' => ['one, two, three'],
|
12
|
+
'description' => ['the description'],
|
13
|
+
'author' => ['Joe Sample'],
|
14
|
+
'robots' => ['index,follow'],
|
15
|
+
'revisit' => ['15 days'],
|
16
|
+
'dc.date.issued' => ['2011-09-15']
|
17
|
+
},
|
18
|
+
|
19
|
+
'http-equiv' => {
|
20
|
+
'content-type' => ['text/html; charset=UTF-8'],
|
21
|
+
'content-style-type' => ['text/css']
|
22
|
+
},
|
23
|
+
|
24
|
+
'property' => {
|
25
|
+
'og:title' => ['An OG title'],
|
26
|
+
'og:type' => ['website'],
|
27
|
+
'og:url' => ['http://example.com/meta-tags'],
|
28
|
+
'og:image' => ['http://example.com/rock.jpg',
|
29
|
+
'http://example.com/rock2.jpg',
|
30
|
+
'http://example.com/rock3.jpg'],
|
31
|
+
'og:image:width' => ['300'],
|
32
|
+
'og:image:height' => ['300', '1000']
|
33
|
+
},
|
34
|
+
|
35
|
+
'charset' => ['UTF-8']
|
36
|
+
}
|
37
|
+
end
|
38
|
+
|
39
|
+
it "#meta_tag" do
|
40
|
+
page.meta_tag.should == {
|
41
|
+
'name' => {
|
42
|
+
'keywords' => 'one, two, three',
|
43
|
+
'description' => 'the description',
|
44
|
+
'author' => 'Joe Sample',
|
45
|
+
'robots' => 'index,follow',
|
46
|
+
'revisit' => '15 days',
|
47
|
+
'dc.date.issued' => '2011-09-15'
|
48
|
+
},
|
49
|
+
|
50
|
+
'http-equiv' => {
|
51
|
+
'content-type' => 'text/html; charset=UTF-8',
|
52
|
+
'content-style-type' => 'text/css'
|
53
|
+
},
|
54
|
+
|
55
|
+
'property' => {
|
56
|
+
'og:title' => 'An OG title',
|
57
|
+
'og:type' => 'website',
|
58
|
+
'og:url' => 'http://example.com/meta-tags',
|
59
|
+
'og:image' => 'http://example.com/rock.jpg',
|
60
|
+
'og:image:width' => '300',
|
61
|
+
'og:image:height' => '300'
|
62
|
+
},
|
63
|
+
|
64
|
+
'charset' => 'UTF-8'
|
65
|
+
}
|
66
|
+
end
|
67
|
+
|
68
|
+
it "#meta" do
|
69
|
+
page.meta.should == {
|
70
|
+
'keywords' => 'one, two, three',
|
71
|
+
'description' => 'the description',
|
72
|
+
'author' => 'Joe Sample',
|
73
|
+
'robots' => 'index,follow',
|
74
|
+
'revisit' => '15 days',
|
75
|
+
'dc.date.issued' => '2011-09-15',
|
76
|
+
'content-type' => 'text/html; charset=UTF-8',
|
77
|
+
'content-style-type' => 'text/css',
|
78
|
+
'og:title' => 'An OG title',
|
79
|
+
'og:type' => 'website',
|
80
|
+
'og:url' => 'http://example.com/meta-tags',
|
81
|
+
'og:image' => 'http://example.com/rock.jpg',
|
82
|
+
'og:image:width' => '300',
|
83
|
+
'og:image:height' => '300',
|
84
|
+
'charset' => 'UTF-8'
|
85
|
+
}
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe 'Charset detection' do
|
90
|
+
it "should get the charset from <meta charset />" do
|
91
|
+
page = MetaInspector.new('http://charset001.com')
|
92
|
+
|
93
|
+
page.charset.should == "utf-8"
|
94
|
+
end
|
95
|
+
|
96
|
+
it "should get the charset from meta content type" do
|
97
|
+
page = MetaInspector.new('http://charset002.com')
|
98
|
+
|
99
|
+
page.charset.should == "windows-1252"
|
100
|
+
end
|
101
|
+
|
102
|
+
it "should get nil if no declared charset is found" do
|
103
|
+
page = MetaInspector.new('http://charset000.com')
|
104
|
+
|
105
|
+
page.charset.should == nil
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
data/spec/meta_inspector/redirections_spec.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe MetaInspector do
|
4
|
+
describe "redirections" do
|
5
|
+
let(:logger) { MetaInspector::ExceptionLog.new }
|
6
|
+
|
7
|
+
context "when redirections are turned off" do
|
8
|
+
it "disallows redirections" do
|
9
|
+
page = MetaInspector.new("http://facebook.com", :allow_redirections => false)
|
10
|
+
|
11
|
+
page.url.should == "http://facebook.com/"
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
context "when redirections are on (default)" do
|
16
|
+
it "allows follows redirections" do
|
17
|
+
logger.should_not receive(:<<)
|
18
|
+
|
19
|
+
page = MetaInspector.new("http://facebook.com", exception_log: logger)
|
20
|
+
|
21
|
+
page.url.should == "https://www.facebook.com/"
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
context "when there are cookies required for proper redirection" do
|
26
|
+
before(:all) { WebMock.enable! }
|
27
|
+
after(:all) { WebMock.disable! }
|
28
|
+
|
29
|
+
it "allows follows redirections while sending the cookies" do
|
30
|
+
stub_request(:get, "http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/")
|
31
|
+
.to_return(:status => 302,
|
32
|
+
:headers => {
|
33
|
+
"Location" => "http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1",
|
34
|
+
"Set-Cookie" => "EMETA_COOKIE_CHECK=1; path=/; domain=clarionledger.com"
|
35
|
+
})
|
36
|
+
|
37
|
+
stub_request(:get, "http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1")
|
38
|
+
.with(:headers => {"Cookie" => "EMETA_COOKIE_CHECK=1"})
|
39
|
+
|
40
|
+
logger.should_not receive(:<<)
|
41
|
+
|
42
|
+
page = MetaInspector.new("http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/", exception_log: logger)
|
43
|
+
|
44
|
+
page.url.should == "http://blogs.clarionledger.com/dechols/2014/03/24/digital-medicine/?nclick_check=1"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/spec/meta_inspector/texts_spec.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe MetaInspector do
|
4
|
+
it "should get the title from the head section" do
|
5
|
+
page = MetaInspector.new('http://example.com')
|
6
|
+
|
7
|
+
page.title.should == 'An example page'
|
8
|
+
end
|
9
|
+
|
10
|
+
describe '#description' do
|
11
|
+
it "should find description from meta description" do
|
12
|
+
page = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
13
|
+
|
14
|
+
page.description.should == "This is Youtube"
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should find a secondary description if no meta description" do
|
18
|
+
page = MetaInspector.new('http://theonion-no-description.com')
|
19
|
+
page.description.should == "SAN FRANCISCO—In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday, an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|