metainspector 1.9.3 → 1.9.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +4 -4
- data/lib/meta_inspector/scraper.rb +35 -15
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/w3clove_faqs.response +266 -0
- data/spec/metainspector_spec.rb +52 -34
- metadata +5 -4
data/README.rdoc
CHANGED
@@ -32,15 +32,15 @@ Then you can see the scraped data like this:
|
|
32
32
|
|
33
33
|
page.url # URL of the page
|
34
34
|
page.scheme # Scheme of the page (http, https)
|
35
|
+
page.host # Hostname of the page (like, w3clove.com, without the scheme)
|
36
|
+
page.root_url # Root url (scheme + host, like http://w3clove.com/)
|
35
37
|
page.title # title of the page, as string
|
36
|
-
page.links # array of strings, with every link found on the page
|
37
|
-
page.absolute_links # array of all the links converted to absolute urls
|
38
|
+
page.links # array of strings, with every link found on the page as an absolute URL
|
38
39
|
page.meta_description # meta description, as string
|
39
40
|
page.description # returns the meta description, or the first long paragraph if no meta description is found
|
40
41
|
page.meta_keywords # meta keywords, as string
|
41
42
|
page.image # Most relevant image, if defined with og:image
|
42
|
-
page.images # array of strings, with every img found on the page
|
43
|
-
page.absolute_images # array of all the images converted to absolute urls
|
43
|
+
page.images # array of strings, with every img found on the page as an absolute URL
|
44
44
|
page.feed # Get rss or atom links in meta data fields as array
|
45
45
|
page.meta_og_title # opengraph title
|
46
46
|
page.meta_og_image # opengraph image
|
@@ -9,13 +9,15 @@ require 'timeout'
|
|
9
9
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
|
-
attr_reader :url, :scheme, :errors
|
12
|
+
attr_reader :url, :scheme, :host, :root_url, :errors
|
13
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
14
|
# If no scheme given, set it to http:// by default
|
15
15
|
|
16
16
|
def initialize(url, timeout = 20)
|
17
17
|
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
18
18
|
@scheme = URI.parse(url).scheme || 'http'
|
19
|
+
@host = URI.parse(url).host
|
20
|
+
@root_url = "#{@scheme}://#{@host}/"
|
19
21
|
@timeout = timeout
|
20
22
|
@data = Hashie::Rash.new('url' => @url)
|
21
23
|
@errors = []
|
@@ -33,26 +35,24 @@ module MetaInspector
|
|
33
35
|
meta_description.nil? ? secondary_description : meta_description
|
34
36
|
end
|
35
37
|
|
36
|
-
#
|
38
|
+
# Links found on the page, as absolute URLs
|
37
39
|
def links
|
38
|
-
@data.links ||=
|
39
|
-
.map {|link| link.attributes["href"] \
|
40
|
-
.to_s.strip}.uniq rescue nil
|
40
|
+
@data.links ||= parsed_links.map { |l| absolutify_url(unrelativize_url(l)) }
|
41
41
|
end
|
42
42
|
|
43
|
-
def
|
44
|
-
|
45
|
-
|
46
|
-
.map{ |i| i.attributes['src'].value }.uniq
|
43
|
+
def absolute_links
|
44
|
+
warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
|
45
|
+
links
|
47
46
|
end
|
48
47
|
|
49
|
-
#
|
50
|
-
def
|
51
|
-
@data.
|
48
|
+
# Images found on the page, as absolute URLs
|
49
|
+
def images
|
50
|
+
@data.images ||= parsed_images.map{ |i| absolutify_url(i) }
|
52
51
|
end
|
53
52
|
|
54
53
|
def absolute_images
|
55
|
-
|
54
|
+
warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
|
55
|
+
images
|
56
56
|
end
|
57
57
|
|
58
58
|
# Returns the parsed document meta rss links
|
@@ -81,7 +81,7 @@ module MetaInspector
|
|
81
81
|
# Returns all parsed data as a nested Hash
|
82
82
|
def to_hash
|
83
83
|
# TODO: find a better option to populate the data to the Hash
|
84
|
-
image;feed;links;charset;
|
84
|
+
image;images;feed;links;charset;title;meta_keywords
|
85
85
|
@data.to_hash
|
86
86
|
end
|
87
87
|
|
@@ -146,6 +146,18 @@ module MetaInspector
|
|
146
146
|
|
147
147
|
private
|
148
148
|
|
149
|
+
def parsed_links
|
150
|
+
@parsed_links ||= parsed_document.search("//a") \
|
151
|
+
.map {|link| link.attributes["href"] \
|
152
|
+
.to_s.strip}.uniq rescue nil
|
153
|
+
end
|
154
|
+
|
155
|
+
def parsed_images
|
156
|
+
@parsed_images ||= parsed_document.search('//img') \
|
157
|
+
.reject{|i| i.attributes['src'].blank? } \
|
158
|
+
.map{ |i| i.attributes['src'].value }.uniq
|
159
|
+
end
|
160
|
+
|
149
161
|
# Stores the error for later inspection
|
150
162
|
def add_fatal_error(error)
|
151
163
|
@errors << error
|
@@ -154,7 +166,15 @@ module MetaInspector
|
|
154
166
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
155
167
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
156
168
|
def absolutify_url(url)
|
157
|
-
url =~ /^\w*\:/i
|
169
|
+
if url =~ /^\w*\:/i
|
170
|
+
url
|
171
|
+
else
|
172
|
+
if url[0] == "/"
|
173
|
+
File.join(@root_url, url)
|
174
|
+
else
|
175
|
+
File.join(@url, url)
|
176
|
+
end
|
177
|
+
end
|
158
178
|
end
|
159
179
|
|
160
180
|
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
@@ -0,0 +1,266 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx
|
3
|
+
Date: Mon, 23 Jul 2012 08:44:12 GMT
|
4
|
+
Content-Type: text/html; charset=utf-8
|
5
|
+
Connection: keep-alive
|
6
|
+
Status: 200 OK
|
7
|
+
X-Ua-Compatible: IE=Edge,chrome=1
|
8
|
+
Etag: "c4f3d4aaf12acce6a909714618e08934"
|
9
|
+
Cache-Control: max-age=0, private, must-revalidate
|
10
|
+
Set-Cookie: _w3clovesite_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFRkkiJTJiMWU0NzVkNjJjNDliMDRlZGI3MjI5OTVlN2U4MjU5BjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMTlWUmVSMEVlTWNuV0t4cTFuNHUvQVozZCttMjhxRTEvWFhYYW5hOXRFdUk9BjsARg%3D%3D--7e9d3e900c9531363297f469f8baa3e3ed31336a; path=/; HttpOnly
|
11
|
+
X-Request-Id: 33ca78a4044d244e673d273a59fa4ebc
|
12
|
+
X-Runtime: 0.017688
|
13
|
+
X-Rack-Cache: miss
|
14
|
+
Content-Length: 12923
|
15
|
+
X-Varnish: 647613022
|
16
|
+
Age: 0
|
17
|
+
Via: 1.1 varnish
|
18
|
+
|
19
|
+
<!DOCTYPE html>
|
20
|
+
<html>
|
21
|
+
|
22
|
+
<head><script type="text/javascript">var NREUMQ=NREUMQ||[];NREUMQ.push(["mark","firstbyte",new Date().getTime()]);</script>
|
23
|
+
<meta charset="UTF-8" />
|
24
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
|
25
|
+
<title>Whole site HTML validator | W3CLove</title>
|
26
|
+
<link rel="shortcut icon" type="image/x-icon" href="/favicon.ico" />
|
27
|
+
<meta name="description" content="Site-wide markup validation tool. Validate the markup of your whole site with just one click." />
|
28
|
+
<meta name="keywords" content="html, markup, validation, validator, tool, w3c, development, standards, free" />
|
29
|
+
<link href="http://cdn-images.mailchimp.com/embedcode/slim-081711.css" rel="stylesheet" type="text/css">
|
30
|
+
<link href="http://fonts.googleapis.com/css?family=Terminal+Dosis:400,600" rel="stylesheet" type="text/css" />
|
31
|
+
<link href="/assets/application-9da2f67bc1bc6e19a801cb7685a0b497.css" media="screen" rel="stylesheet" type="text/css" />
|
32
|
+
<meta content="authenticity_token" name="csrf-param" />
|
33
|
+
<meta content="9VReR0EeMcnWKxq1n4u/AZ3d+m28qE1/XXXana9tEuI=" name="csrf-token" />
|
34
|
+
<script src="/assets/application-4e8aa1a929a0aeab6bdf339edecbeaa6.js" type="text/javascript"></script>
|
35
|
+
<script src="/assets/pages-7270767b2a9e9fff880aa5de378ca791.js" type="text/javascript"></script>
|
36
|
+
<script src="https://apis.google.com/js/plusone.js" type="text/javascript"></script>
|
37
|
+
|
38
|
+
<script type="text/javascript">
|
39
|
+
|
40
|
+
var _gaq = _gaq || [];
|
41
|
+
_gaq.push(['_setAccount', 'UA-122379-37']);
|
42
|
+
_gaq.push(['_trackPageview']);
|
43
|
+
|
44
|
+
(function() {
|
45
|
+
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
46
|
+
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
47
|
+
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
48
|
+
})();
|
49
|
+
|
50
|
+
</script>
|
51
|
+
</head>
|
52
|
+
|
53
|
+
<body>
|
54
|
+
|
55
|
+
<div id="flash_message"><div><span id="flash_message_text"></span><a class="close-message" href="#" onclick="closeMsgBar();">X</a></div></div>
|
56
|
+
|
57
|
+
|
58
|
+
|
59
|
+
<div class="row top-bar">
|
60
|
+
<div class="content">
|
61
|
+
<header>
|
62
|
+
<div class="header">
|
63
|
+
<h1><a href="/" title="W3CLove">W3CLove</a></h1>
|
64
|
+
<nav>
|
65
|
+
<a href="#" class="nav_button"></a>
|
66
|
+
<ul class="nav">
|
67
|
+
<li><a href="/faqs" class='active'>FAQs</a></li>
|
68
|
+
<li><a href="/plans-and-pricing" >Plans and pricing</a></li>
|
69
|
+
<li><a href="/contact" >Contact</a></li>
|
70
|
+
<li><a href="/charts/errors" >Top 100 Errors</a></li>
|
71
|
+
</ul>
|
72
|
+
</nav>
|
73
|
+
|
74
|
+
|
75
|
+
<div id="sign_in">
|
76
|
+
<a href="/credits" class="label warning" id="credits_count">10 Credits</a>
|
77
|
+
|
78
|
+
<a href="/signin" class="label success">Sign in</a>
|
79
|
+
</div>
|
80
|
+
</div>
|
81
|
+
</header>
|
82
|
+
</div>
|
83
|
+
</div>
|
84
|
+
|
85
|
+
<div id="faqs" class="row hero">
|
86
|
+
|
87
|
+
<div class="content">
|
88
|
+
|
89
|
+
<h2>FAQs</h2>
|
90
|
+
<h3>Frequently Asked Questions</h3>
|
91
|
+
|
92
|
+
</div>
|
93
|
+
|
94
|
+
</div>
|
95
|
+
|
96
|
+
|
97
|
+
<div class="row description">
|
98
|
+
|
99
|
+
<div class="content">
|
100
|
+
|
101
|
+
<h4 id="what_is_w3clove">What is W3CLove?</h4>
|
102
|
+
<p>
|
103
|
+
W3CLove is a site-wide markup validation tool. It allows you to check the validity of the markup of several pages
|
104
|
+
from your website, and gives you a summary of the most common errors and warnings, with a single click.
|
105
|
+
</p>
|
106
|
+
|
107
|
+
<h4 id="why_not_just_use_the_official_w3c_validator">Why not just use the official W3C validator?</h4>
|
108
|
+
<p>
|
109
|
+
The <a href="http://validator.w3.org">official W3C validator</a> does not yet provide a way to submit several URLs at once. So, if you want to check your whole
|
110
|
+
website, you need to submit each of your URLs individually, which is a slow process. W3CLove provides a simpler, faster way to
|
111
|
+
submit several pages at once.
|
112
|
+
</p>
|
113
|
+
|
114
|
+
<h4 id="how_can_i_submit_my_site_for_validation">How can I submit a site for validation?</h4>
|
115
|
+
<p>
|
116
|
+
To submit a site, just enter its URL on the <a href="/">front page</a>, and click the "Validate" button. The W3CLove spider will crawl the site
|
117
|
+
in search for internal links, validate each of them, and then compile all errors and warnings in one summary.
|
118
|
+
</p>
|
119
|
+
|
120
|
+
<h4 id="how_can_i_specify_the_exact_urls_i_want_to_validate">How can I specify the exact URLs I want to validate?</h4>
|
121
|
+
<p>
|
122
|
+
The W3CLove spider will crawl the provided URL in search for internal links, but you can also provide an <a href="http://www.sitemaps.org/">XML sitemap</a> with the exact URLs
|
123
|
+
you need to validate.
|
124
|
+
</p>
|
125
|
+
|
126
|
+
<h4 id="is_there_a_limit_on_the_number_of_urls">Is there a limit on the number of URLs to validate?</h4>
|
127
|
+
<p>
|
128
|
+
Yes, there is a limit of 250 URLs per each sitemap submitted. This should be enough for most sites to get a good idea of the
|
129
|
+
validation status of the site, and saves processing time for both W3CLove and the W3C validator.
|
130
|
+
</p>
|
131
|
+
|
132
|
+
<h4 id="how_can_i_resubmit_a_site">How can I resubmit a site?</h4>
|
133
|
+
<p>
|
134
|
+
Just click on the "Re-check" buttons. You can recheck the whole sitemap or individual pages.
|
135
|
+
</p>
|
136
|
+
|
137
|
+
<h4 id="can_i_store_my_sitemaps_list">Can I store my sitemaps list?</h4>
|
138
|
+
<p>
|
139
|
+
Yes, W3CLove lets you store for free a list of the sitemaps you're interested in validating. Just sign in with your Twitter, Facebook or Google account and every sitemap you validate will appear on your sitemaps list.
|
140
|
+
</p>
|
141
|
+
|
142
|
+
<h4 id="how_do_credits_work">How do credits work?</h4>
|
143
|
+
<p>
|
144
|
+
When you sign up for the first time at W3CLove, you're given 100 initial credits so you can try the service for free.
|
145
|
+
</p>
|
146
|
+
<p>
|
147
|
+
For every single web page validation that you make using our service, you're charged 1 credit. So, for example, if you start with 100 credits and you validate a site that has 30 web pages, you end up with 70 credits.
|
148
|
+
</p>
|
149
|
+
<p>
|
150
|
+
Once you spend all your credits, you can't make more validations until you recharge them.
|
151
|
+
</p>
|
152
|
+
|
153
|
+
<h4 id="how_can_i_recharge_my_credits">How can I recharge my credits?</h4>
|
154
|
+
<p>
|
155
|
+
The easiest way to recharge your credits is through a monthly subscription.
|
156
|
+
</p>
|
157
|
+
<p>
|
158
|
+
This way, your credits will be recharged every month up to the monthly limit of your chosen plan.
|
159
|
+
</p>
|
160
|
+
<p>
|
161
|
+
Check out the <a href="/plans-and-pricing">Plans and pricing</a> page to see what plan is best for you. If you're not sure about how many validations you need, you can buy packs of validations.
|
162
|
+
</p>
|
163
|
+
|
164
|
+
<h4 id="how_can_i_sign_in_with_another_account">I've signed in from one account, how can I sign in with a different one?</h4>
|
165
|
+
<p>
|
166
|
+
For your convenience, you're first shown 3 ways to sign in: Twitter, Facebook and Google. When you use one of those, W3CLove will remember your preference and offer just this one.
|
167
|
+
</p>
|
168
|
+
<p>
|
169
|
+
If you'd like to change this preference, just sign in again with your preferred account:<br/>
|
170
|
+
<a href='#' onclick='window.location="/auth/twitter"; return false;' style='color:white;'><span class='label success'>twitter</span></a>, <a href='#' onclick='window.location="/auth/facebook"; return false;' style='color:white;'><span class='label success'>facebook</span></a> or <a href='#' onclick='window.location="/auth/google_oauth2"; return false;' style='color:white;'><span class='label success'>google</span></a>.
|
171
|
+
</p>
|
172
|
+
|
173
|
+
<h4 id="who_is_behind_all_this">Who is behind all this?</h4>
|
174
|
+
<p>
|
175
|
+
W3CLove is a personal project maintained by <a href="http://jaimeiniesta.com/">Jaime Iniesta</a>, an independent web developer who loves working with Ruby on Rails. That's me. :)
|
176
|
+
</p>
|
177
|
+
|
178
|
+
<h4 id="how_did_this_project_start">How did this project start?</h4>
|
179
|
+
<p>
|
180
|
+
During March 2011 I took the Ruby Core Skills course at the <a href="http://mendicantuniversity.org/">Mendicant University</a>, an intense three week course that takes you through several important topic areas every Ruby developer should be comfortable on. You can <a href="http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby">read more</a> about it at my blog.
|
181
|
+
</p>
|
182
|
+
<p>
|
183
|
+
With the help of <a href="http://majesticseacreature.com/">Gregory Brown</a> and the rest of the <a href="http://school.mendicantuniversity.org/alumni/2011">Mendicant University Alumni</a>, I built the <a href="https://github.com/jaimeiniesta/w3clove">w3clove</a> gem that allows you to do site-wide markup validation from the command line.
|
184
|
+
</p>
|
185
|
+
<p>
|
186
|
+
Afterwards, I built this <a href="http://w3clove.com">W3CLove.com</a> site to make it easier for everyone to do site-wide markup validation, with a nicer HTML interface, storing the results for later, rechecking, etc.
|
187
|
+
</p>
|
188
|
+
<p>
|
189
|
+
I want to express my gratitude to all the Mendicant University community, all of them are still helping me making W3CLove a better tool for everyone. Thank you!
|
190
|
+
</p>
|
191
|
+
|
192
|
+
<h4 id="is_this_free">Is this free?</h4>
|
193
|
+
<p>
|
194
|
+
No, this is a paid service, but you can try it for free.
|
195
|
+
</p>
|
196
|
+
|
197
|
+
<h4 id="is_there_an_open_source_version">Is there an open source version?</h4>
|
198
|
+
<p>
|
199
|
+
Yes! There's a free, standalone version that you can install on your computer. It's packed as a Ruby gem and it's open source, so you can examine the code and contribute to it if you wish.
|
200
|
+
</p>
|
201
|
+
<p>
|
202
|
+
You can find the <a href="https://github.com/jaimeiniesta/w3clove">w3clove gem at Github</a>.
|
203
|
+
</p>
|
204
|
+
|
205
|
+
<h4 id="is_there_an_api">Is there an API?</h4>
|
206
|
+
<p>
|
207
|
+
Yes! I've started building an API. It's not finished yet, but you can already validate sitemaps and pages with it. Read more about it at the <a href="/api_v1_reference">API V1 Reference</a> page.
|
208
|
+
</p>
|
209
|
+
|
210
|
+
</div>
|
211
|
+
</div>
|
212
|
+
|
213
|
+
|
214
|
+
<div class="row footer">
|
215
|
+
<div class="content">
|
216
|
+
<footer>
|
217
|
+
<p>
|
218
|
+
<!-- Begin MailChimp Signup Form -->
|
219
|
+
<div id="mc_embed_signup">
|
220
|
+
<form action="http://w3clove.us4.list-manage.com/subscribe/post?u=6af3ab69c286561d0f0f25671&id=04a0dab609" method="post" id="mc-embedded-subscribe-form" name="mc-embedded-subscribe-form" class="validate shadowins">
|
221
|
+
<label for="mce-EMAIL">Subscribe to our newsletter:</label>
|
222
|
+
<input type="email" value="your email" name="EMAIL" class="email" id="mce-EMAIL" placeholder="your email" onfocus="this.value='';" required>
|
223
|
+
<div class="clear"><input type="submit" value="♥" name="subscribe" id="mc-embedded-subscribe" class="button btn"></div>
|
224
|
+
</form>
|
225
|
+
</div>
|
226
|
+
<!--End mc_embed_signup-->
|
227
|
+
|
228
|
+
<ul class="social_share">
|
229
|
+
<li class="twitter_follow"><a href="https://twitter.com/w3clove" class="twitter-follow-button" data-button="grey" data-text-color="#FFFFFF" data-link-color="#999999" data-show-count="false">Follow</a></li>
|
230
|
+
<li class="tweets_count"><a href="http://twitter.com/share" style="display:block;" class="twitter-share-button" data-count="horizontal" data-via="w3clove" data-lang="en">Tweet</a><script type="text/javascript" src="http://platform.twitter.com/widgets.js"></script></li>
|
231
|
+
<li class="gplus_count"><div class="g-plusone" data-size="medium" data-count="true"></div></li>
|
232
|
+
</ul>
|
233
|
+
</p>
|
234
|
+
<p class="clearb"><strong>W3CLove</strong> lets you <strong>validate entire sites</strong> with one click. This is an independent project, not associated with the W3C. By making use of this website you agree to the <a href="/terms_of_service">Terms of service</a>.<br /><br />Follow us on <a href="http://twitter.com/W3CLove">Twitter</a> and <a href="http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609">subscribe to our monthly newsletter</a>.</p>
|
235
|
+
</footer>
|
236
|
+
</div>
|
237
|
+
</div>
|
238
|
+
|
239
|
+
<script type="text/javascript">
|
240
|
+
var uvOptions = {};
|
241
|
+
(function() {
|
242
|
+
var uv = document.createElement('script'); uv.type = 'text/javascript'; uv.async = true;
|
243
|
+
uv.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + 'widget.uservoice.com/nhy6YD24GjgADgFX3h5z4w.js';
|
244
|
+
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(uv, s);
|
245
|
+
})();
|
246
|
+
</script>
|
247
|
+
<script type="text/javascript">
|
248
|
+
//<![CDATA[
|
249
|
+
|
250
|
+
var menuVisible = false;
|
251
|
+
$('.nav_button').on('click', showHideMenu);
|
252
|
+
|
253
|
+
//]]>
|
254
|
+
</script>
|
255
|
+
|
256
|
+
<script type="text/javascript">if (!NREUMQ.f) { NREUMQ.f=function() {
|
257
|
+
NREUMQ.push(["load",new Date().getTime()]);
|
258
|
+
var e=document.createElement("script");
|
259
|
+
e.type="text/javascript";e.async=true;e.src="https://d1ros97qkrwjf5.cloudfront.net/39/eum/rum.js";
|
260
|
+
document.body.appendChild(e);
|
261
|
+
if(NREUMQ.a)NREUMQ.a();
|
262
|
+
};
|
263
|
+
NREUMQ.a=window.onload;window.onload=NREUMQ.f;
|
264
|
+
};
|
265
|
+
NREUMQ.push(["nrfj","beacon-1.newrelic.com","96fc3f1db6",415027,"c1hbQUcNWlhQQhsNWVdfakNaDkJVUlUbFVFXUkYaRgpZQw==",0.0,14,new Date().getTime(),"","","","",""])</script></body>
|
266
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -14,6 +14,7 @@ describe MetaInspector do
|
|
14
14
|
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
15
15
|
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
16
16
|
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
17
|
+
FakeWeb.register_uri(:get, "http://w3clove.com/faqs", :response => fixture_file("w3clove_faqs.response"))
|
17
18
|
|
18
19
|
describe 'Initialization' do
|
19
20
|
it 'should accept an URL with a scheme' do
|
@@ -30,6 +31,16 @@ describe MetaInspector do
|
|
30
31
|
MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
|
31
32
|
MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
|
32
33
|
end
|
34
|
+
|
35
|
+
it "should store the host" do
|
36
|
+
MetaInspector.new('http://pagerankalert.com').host.should == 'pagerankalert.com'
|
37
|
+
MetaInspector.new('https://pagerankalert.com').host.should == 'pagerankalert.com'
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should store the root url" do
|
41
|
+
MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
|
42
|
+
MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
|
43
|
+
end
|
33
44
|
end
|
34
45
|
|
35
46
|
describe 'Doing a basic scrape' do
|
@@ -54,8 +65,7 @@ describe MetaInspector do
|
|
54
65
|
end
|
55
66
|
|
56
67
|
it "should find all page images" do
|
57
|
-
@m.
|
58
|
-
@m.images == ["/images/pagerank_alert.png?1309512337"]
|
68
|
+
@m.images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
|
59
69
|
end
|
60
70
|
|
61
71
|
it "should ignore malformed image tags" do
|
@@ -97,29 +107,41 @@ describe MetaInspector do
|
|
97
107
|
end
|
98
108
|
|
99
109
|
it "should get the links" do
|
100
|
-
@m.links.should == [
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
110
|
+
@m.links.should == [ "http://pagerankalert.com/",
|
111
|
+
"http://pagerankalert.com/es?language=es",
|
112
|
+
"http://pagerankalert.com/users/sign_up",
|
113
|
+
"http://pagerankalert.com/users/sign_in",
|
114
|
+
"mailto:pagerankalert@gmail.com",
|
115
|
+
"http://pagerankalert.posterous.com",
|
116
|
+
"http://twitter.com/pagerankalert",
|
117
|
+
"http://twitter.com/share" ]
|
118
|
+
end
|
119
|
+
|
120
|
+
it "should get correct absolute links for internal pages" do
|
121
|
+
m = MetaInspector.new('http://w3clove.com/faqs')
|
122
|
+
m.links.should == [ "http://w3clove.com/faqs/#",
|
123
|
+
"http://w3clove.com/",
|
124
|
+
"http://w3clove.com/faqs",
|
125
|
+
"http://w3clove.com/plans-and-pricing",
|
126
|
+
"http://w3clove.com/contact",
|
127
|
+
"http://w3clove.com/charts/errors",
|
128
|
+
"http://w3clove.com/credits",
|
129
|
+
"http://w3clove.com/signin",
|
130
|
+
"http://validator.w3.org",
|
131
|
+
"http://www.sitemaps.org/",
|
132
|
+
"http://jaimeiniesta.com/",
|
133
|
+
"http://mendicantuniversity.org/",
|
134
|
+
"http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
|
135
|
+
"http://majesticseacreature.com/",
|
136
|
+
"http://school.mendicantuniversity.org/alumni/2011",
|
137
|
+
"https://github.com/jaimeiniesta/w3clove",
|
138
|
+
"http://w3clove.com",
|
139
|
+
"http://w3clove.com/api_v1_reference",
|
140
|
+
"https://twitter.com/w3clove",
|
141
|
+
"http://twitter.com/share",
|
142
|
+
"http://w3clove.com/terms_of_service",
|
143
|
+
"http://twitter.com/W3CLove",
|
144
|
+
"http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
|
123
145
|
end
|
124
146
|
end
|
125
147
|
|
@@ -138,10 +160,6 @@ describe MetaInspector do
|
|
138
160
|
"telnet://telnet.cdrom.com"
|
139
161
|
]
|
140
162
|
end
|
141
|
-
|
142
|
-
it "should return the same links as absolute links do" do
|
143
|
-
@m.absolute_links.should == @m.links
|
144
|
-
end
|
145
163
|
end
|
146
164
|
|
147
165
|
describe 'Protocol-relative URLs' do
|
@@ -151,13 +169,13 @@ describe MetaInspector do
|
|
151
169
|
end
|
152
170
|
|
153
171
|
it "should convert protocol-relative links to http" do
|
154
|
-
@m_http.
|
155
|
-
@m_http.
|
172
|
+
@m_http.links.should include('http://protocol-relative.com/contact')
|
173
|
+
@m_http.links.should include('http://yahoo.com')
|
156
174
|
end
|
157
175
|
|
158
176
|
it "should convert protocol-relative links to https" do
|
159
|
-
@m_https.
|
160
|
-
@m_https.
|
177
|
+
@m_https.links.should include('https://protocol-relative.com/contact')
|
178
|
+
@m_https.links.should include('https://yahoo.com')
|
161
179
|
end
|
162
180
|
end
|
163
181
|
|
@@ -227,7 +245,7 @@ describe MetaInspector do
|
|
227
245
|
describe 'to_hash' do
|
228
246
|
it "should return a hash with all the values set" do
|
229
247
|
@m = MetaInspector.new('http://pagerankalert.com')
|
230
|
-
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "
|
248
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
231
249
|
end
|
232
250
|
end
|
233
251
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 59
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 9
|
9
|
-
-
|
10
|
-
version: 1.9.
|
9
|
+
- 4
|
10
|
+
version: 1.9.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-07-
|
18
|
+
date: 2012-07-23 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -162,6 +162,7 @@ files:
|
|
162
162
|
- spec/fixtures/tea-tron.com.response
|
163
163
|
- spec/fixtures/theonion-no-description.com.response
|
164
164
|
- spec/fixtures/theonion.com.response
|
165
|
+
- spec/fixtures/w3clove_faqs.response
|
165
166
|
- spec/fixtures/youtube.response
|
166
167
|
- spec/metainspector_spec.rb
|
167
168
|
- spec/spec_helper.rb
|