metainspector 1.9.3 → 1.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +4 -4
- data/lib/meta_inspector/scraper.rb +35 -15
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/w3clove_faqs.response +266 -0
- data/spec/metainspector_spec.rb +52 -34
- metadata +5 -4
data/README.rdoc
CHANGED
@@ -32,15 +32,15 @@ Then you can see the scraped data like this:
|
|
32
32
|
|
33
33
|
page.url # URL of the page
|
34
34
|
page.scheme # Scheme of the page (http, https)
|
35
|
+
page.host # Hostname of the page (like, w3clove.com, without the scheme)
|
36
|
+
page.root_url # Root url (scheme + host, like http://w3clove.com/)
|
35
37
|
page.title # title of the page, as string
|
36
|
-
page.links # array of strings, with every link found on the page
|
37
|
-
page.absolute_links # array of all the links converted to absolute urls
|
38
|
+
page.links # array of strings, with every link found on the page as an absolute URL
|
38
39
|
page.meta_description # meta description, as string
|
39
40
|
page.description # returns the meta description, or the first long paragraph if no meta description is found
|
40
41
|
page.meta_keywords # meta keywords, as string
|
41
42
|
page.image # Most relevant image, if defined with og:image
|
42
|
-
page.images # array of strings, with every img found on the page
|
43
|
-
page.absolute_images # array of all the images converted to absolute urls
|
43
|
+
page.images # array of strings, with every img found on the page as an absolute URL
|
44
44
|
page.feed # Get rss or atom links in meta data fields as array
|
45
45
|
page.meta_og_title # opengraph title
|
46
46
|
page.meta_og_image # opengraph image
|
@@ -9,13 +9,15 @@ require 'timeout'
|
|
9
9
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
|
-
attr_reader :url, :scheme, :errors
|
12
|
+
attr_reader :url, :scheme, :host, :root_url, :errors
|
13
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
14
|
# If no scheme given, set it to http:// by default
|
15
15
|
|
16
16
|
def initialize(url, timeout = 20)
|
17
17
|
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
18
18
|
@scheme = URI.parse(url).scheme || 'http'
|
19
|
+
@host = URI.parse(url).host
|
20
|
+
@root_url = "#{@scheme}://#{@host}/"
|
19
21
|
@timeout = timeout
|
20
22
|
@data = Hashie::Rash.new('url' => @url)
|
21
23
|
@errors = []
|
@@ -33,26 +35,24 @@ module MetaInspector
|
|
33
35
|
meta_description.nil? ? secondary_description : meta_description
|
34
36
|
end
|
35
37
|
|
36
|
-
#
|
38
|
+
# Links found on the page, as absolute URLs
|
37
39
|
def links
|
38
|
-
@data.links ||=
|
39
|
-
.map {|link| link.attributes["href"] \
|
40
|
-
.to_s.strip}.uniq rescue nil
|
40
|
+
@data.links ||= parsed_links.map { |l| absolutify_url(unrelativize_url(l)) }
|
41
41
|
end
|
42
42
|
|
43
|
-
def
|
44
|
-
|
45
|
-
|
46
|
-
.map{ |i| i.attributes['src'].value }.uniq
|
43
|
+
def absolute_links
|
44
|
+
warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
|
45
|
+
links
|
47
46
|
end
|
48
47
|
|
49
|
-
#
|
50
|
-
def
|
51
|
-
@data.
|
48
|
+
# Images found on the page, as absolute URLs
|
49
|
+
def images
|
50
|
+
@data.images ||= parsed_images.map{ |i| absolutify_url(i) }
|
52
51
|
end
|
53
52
|
|
54
53
|
def absolute_images
|
55
|
-
|
54
|
+
warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
|
55
|
+
images
|
56
56
|
end
|
57
57
|
|
58
58
|
# Returns the parsed document meta rss links
|
@@ -81,7 +81,7 @@ module MetaInspector
|
|
81
81
|
# Returns all parsed data as a nested Hash
|
82
82
|
def to_hash
|
83
83
|
# TODO: find a better option to populate the data to the Hash
|
84
|
-
image;feed;links;charset;
|
84
|
+
image;images;feed;links;charset;title;meta_keywords
|
85
85
|
@data.to_hash
|
86
86
|
end
|
87
87
|
|
@@ -146,6 +146,18 @@ module MetaInspector
|
|
146
146
|
|
147
147
|
private
|
148
148
|
|
149
|
+
def parsed_links
|
150
|
+
@parsed_links ||= parsed_document.search("//a") \
|
151
|
+
.map {|link| link.attributes["href"] \
|
152
|
+
.to_s.strip}.uniq rescue nil
|
153
|
+
end
|
154
|
+
|
155
|
+
def parsed_images
|
156
|
+
@parsed_images ||= parsed_document.search('//img') \
|
157
|
+
.reject{|i| i.attributes['src'].blank? } \
|
158
|
+
.map{ |i| i.attributes['src'].value }.uniq
|
159
|
+
end
|
160
|
+
|
149
161
|
# Stores the error for later inspection
|
150
162
|
def add_fatal_error(error)
|
151
163
|
@errors << error
|
@@ -154,7 +166,15 @@ module MetaInspector
|
|
154
166
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
155
167
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
156
168
|
def absolutify_url(url)
|
157
|
-
url =~ /^\w*\:/i
|
169
|
+
if url =~ /^\w*\:/i
|
170
|
+
url
|
171
|
+
else
|
172
|
+
if url[0] == "/"
|
173
|
+
File.join(@root_url, url)
|
174
|
+
else
|
175
|
+
File.join(@url, url)
|
176
|
+
end
|
177
|
+
end
|
158
178
|
end
|
159
179
|
|
160
180
|
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
@@ -0,0 +1,266 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx
|
3
|
+
Date: Mon, 23 Jul 2012 08:44:12 GMT
|
4
|
+
Content-Type: text/html; charset=utf-8
|
5
|
+
Connection: keep-alive
|
6
|
+
Status: 200 OK
|
7
|
+
X-Ua-Compatible: IE=Edge,chrome=1
|
8
|
+
Etag: "c4f3d4aaf12acce6a909714618e08934"
|
9
|
+
Cache-Control: max-age=0, private, must-revalidate
|
10
|
+
Set-Cookie: _w3clovesite_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFRkkiJTJiMWU0NzVkNjJjNDliMDRlZGI3MjI5OTVlN2U4MjU5BjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMTlWUmVSMEVlTWNuV0t4cTFuNHUvQVozZCttMjhxRTEvWFhYYW5hOXRFdUk9BjsARg%3D%3D--7e9d3e900c9531363297f469f8baa3e3ed31336a; path=/; HttpOnly
|
11
|
+
X-Request-Id: 33ca78a4044d244e673d273a59fa4ebc
|
12
|
+
X-Runtime: 0.017688
|
13
|
+
X-Rack-Cache: miss
|
14
|
+
Content-Length: 12923
|
15
|
+
X-Varnish: 647613022
|
16
|
+
Age: 0
|
17
|
+
Via: 1.1 varnish
|
18
|
+
|
19
|
+
<!DOCTYPE html>
|
20
|
+
<html>
|
21
|
+
|
22
|
+
<head><script type="text/javascript">var NREUMQ=NREUMQ||[];NREUMQ.push(["mark","firstbyte",new Date().getTime()]);</script>
|
23
|
+
<meta charset="UTF-8" />
|
24
|
+
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
|
25
|
+
<title>Whole site HTML validator | W3CLove</title>
|
26
|
+
<link rel="shortcut icon" type="image/x-icon" href="/favicon.ico" />
|
27
|
+
<meta name="description" content="Site-wide markup validation tool. Validate the markup of your whole site with just one click." />
|
28
|
+
<meta name="keywords" content="html, markup, validation, validator, tool, w3c, development, standards, free" />
|
29
|
+
<link href="http://cdn-images.mailchimp.com/embedcode/slim-081711.css" rel="stylesheet" type="text/css">
|
30
|
+
<link href="http://fonts.googleapis.com/css?family=Terminal+Dosis:400,600" rel="stylesheet" type="text/css" />
|
31
|
+
<link href="/assets/application-9da2f67bc1bc6e19a801cb7685a0b497.css" media="screen" rel="stylesheet" type="text/css" />
|
32
|
+
<meta content="authenticity_token" name="csrf-param" />
|
33
|
+
<meta content="9VReR0EeMcnWKxq1n4u/AZ3d+m28qE1/XXXana9tEuI=" name="csrf-token" />
|
34
|
+
<script src="/assets/application-4e8aa1a929a0aeab6bdf339edecbeaa6.js" type="text/javascript"></script>
|
35
|
+
<script src="/assets/pages-7270767b2a9e9fff880aa5de378ca791.js" type="text/javascript"></script>
|
36
|
+
<script src="https://apis.google.com/js/plusone.js" type="text/javascript"></script>
|
37
|
+
|
38
|
+
<script type="text/javascript">
|
39
|
+
|
40
|
+
var _gaq = _gaq || [];
|
41
|
+
_gaq.push(['_setAccount', 'UA-122379-37']);
|
42
|
+
_gaq.push(['_trackPageview']);
|
43
|
+
|
44
|
+
(function() {
|
45
|
+
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
46
|
+
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
47
|
+
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
48
|
+
})();
|
49
|
+
|
50
|
+
</script>
|
51
|
+
</head>
|
52
|
+
|
53
|
+
<body>
|
54
|
+
|
55
|
+
<div id="flash_message"><div><span id="flash_message_text"></span><a class="close-message" href="#" onclick="closeMsgBar();">X</a></div></div>
|
56
|
+
|
57
|
+
|
58
|
+
|
59
|
+
<div class="row top-bar">
|
60
|
+
<div class="content">
|
61
|
+
<header>
|
62
|
+
<div class="header">
|
63
|
+
<h1><a href="/" title="W3CLove">W3CLove</a></h1>
|
64
|
+
<nav>
|
65
|
+
<a href="#" class="nav_button"></a>
|
66
|
+
<ul class="nav">
|
67
|
+
<li><a href="/faqs" class='active'>FAQs</a></li>
|
68
|
+
<li><a href="/plans-and-pricing" >Plans and pricing</a></li>
|
69
|
+
<li><a href="/contact" >Contact</a></li>
|
70
|
+
<li><a href="/charts/errors" >Top 100 Errors</a></li>
|
71
|
+
</ul>
|
72
|
+
</nav>
|
73
|
+
|
74
|
+
|
75
|
+
<div id="sign_in">
|
76
|
+
<a href="/credits" class="label warning" id="credits_count">10 Credits</a>
|
77
|
+
|
78
|
+
<a href="/signin" class="label success">Sign in</a>
|
79
|
+
</div>
|
80
|
+
</div>
|
81
|
+
</header>
|
82
|
+
</div>
|
83
|
+
</div>
|
84
|
+
|
85
|
+
<div id="faqs" class="row hero">
|
86
|
+
|
87
|
+
<div class="content">
|
88
|
+
|
89
|
+
<h2>FAQs</h2>
|
90
|
+
<h3>Frequently Asked Questions</h3>
|
91
|
+
|
92
|
+
</div>
|
93
|
+
|
94
|
+
</div>
|
95
|
+
|
96
|
+
|
97
|
+
<div class="row description">
|
98
|
+
|
99
|
+
<div class="content">
|
100
|
+
|
101
|
+
<h4 id="what_is_w3clove">What is W3CLove?</h4>
|
102
|
+
<p>
|
103
|
+
W3CLove is a site-wide markup validation tool. It allows you to check the validity of the markup of several pages
|
104
|
+
from your website, and gives you a summary of the most common errors and warnings, with a single click.
|
105
|
+
</p>
|
106
|
+
|
107
|
+
<h4 id="why_not_just_use_the_official_w3c_validator">Why not just use the official W3C validator?</h4>
|
108
|
+
<p>
|
109
|
+
The <a href="http://validator.w3.org">official W3C validator</a> does not yet provide a way to submit several URLs at once. So, if you want to check your whole
|
110
|
+
website, you need to submit each of your URLs individually, which is a slow process. W3CLove provides a simpler, faster way to
|
111
|
+
submit several pages at once.
|
112
|
+
</p>
|
113
|
+
|
114
|
+
<h4 id="how_can_i_submit_my_site_for_validation">How can I submit a site for validation?</h4>
|
115
|
+
<p>
|
116
|
+
To submit a site, just enter its URL on the <a href="/">front page</a>, and click the "Validate" button. The W3CLove spider will crawl the site
|
117
|
+
in search for internal links, validate each of them, and then compile all errors and warnings in one summary.
|
118
|
+
</p>
|
119
|
+
|
120
|
+
<h4 id="how_can_i_specify_the_exact_urls_i_want_to_validate">How can I specify the exact URLs I want to validate?</h4>
|
121
|
+
<p>
|
122
|
+
The W3CLove spider will crawl the provided URL in search for internal links, but you can also provide an <a href="http://www.sitemaps.org/">XML sitemap</a> with the exact URLs
|
123
|
+
you need to validate.
|
124
|
+
</p>
|
125
|
+
|
126
|
+
<h4 id="is_there_a_limit_on_the_number_of_urls">Is there a limit on the number of URLs to validate?</h4>
|
127
|
+
<p>
|
128
|
+
Yes, there is a limit of 250 URLs per each sitemap submitted. This should be enough for most sites to get a good idea of the
|
129
|
+
validation status of the site, and saves processing time for both W3CLove and the W3C validator.
|
130
|
+
</p>
|
131
|
+
|
132
|
+
<h4 id="how_can_i_resubmit_a_site">How can I resubmit a site?</h4>
|
133
|
+
<p>
|
134
|
+
Just click on the "Re-check" buttons. You can recheck the whole sitemap or individual pages.
|
135
|
+
</p>
|
136
|
+
|
137
|
+
<h4 id="can_i_store_my_sitemaps_list">Can I store my sitemaps list?</h4>
|
138
|
+
<p>
|
139
|
+
Yes, W3CLove lets you store for free a list of the sitemaps you're interested in validating. Just sign in with your Twitter, Facebook or Google account and every sitemap you validate will appear on your sitemaps list.
|
140
|
+
</p>
|
141
|
+
|
142
|
+
<h4 id="how_do_credits_work">How do credits work?</h4>
|
143
|
+
<p>
|
144
|
+
When you sign up for the first time at W3CLove, you're given 100 initial credits so you can try the service for free.
|
145
|
+
</p>
|
146
|
+
<p>
|
147
|
+
For every single web page validation that you make using our service, you're charged 1 credit. So, for example, if you start with 100 credits and you validate a site that has 30 web pages, you end up with 70 credits.
|
148
|
+
</p>
|
149
|
+
<p>
|
150
|
+
Once you spend all your credits, you can't make more validations until you recharge them.
|
151
|
+
</p>
|
152
|
+
|
153
|
+
<h4 id="how_can_i_recharge_my_credits">How can I recharge my credits?</h4>
|
154
|
+
<p>
|
155
|
+
The easiest way to recharge your credits is through a monthly subscription.
|
156
|
+
</p>
|
157
|
+
<p>
|
158
|
+
This way, your credits will be recharged every month up to the monthly limit of your chosen plan.
|
159
|
+
</p>
|
160
|
+
<p>
|
161
|
+
Check out the <a href="/plans-and-pricing">Plans and pricing</a> page to see what plan is best for you. If you're not sure about how many validations you need, you can buy packs of validations.
|
162
|
+
</p>
|
163
|
+
|
164
|
+
<h4 id="how_can_i_sign_in_with_another_account">I've signed in from one account, how can I sign in with a different one?</h4>
|
165
|
+
<p>
|
166
|
+
For your convenience, you're first shown 3 ways to sign in: Twitter, Facebook and Google. When you use one of those, W3CLove will remember your preference and offer just this one.
|
167
|
+
</p>
|
168
|
+
<p>
|
169
|
+
If you'd like to change this preference, just sign in again with your preferred account:<br/>
|
170
|
+
<a href='#' onclick='window.location="/auth/twitter"; return false;' style='color:white;'><span class='label success'>twitter</span></a>, <a href='#' onclick='window.location="/auth/facebook"; return false;' style='color:white;'><span class='label success'>facebook</span></a> or <a href='#' onclick='window.location="/auth/google_oauth2"; return false;' style='color:white;'><span class='label success'>google</span></a>.
|
171
|
+
</p>
|
172
|
+
|
173
|
+
<h4 id="who_is_behind_all_this">Who is behind all this?</h4>
|
174
|
+
<p>
|
175
|
+
W3CLove is a personal project maintained by <a href="http://jaimeiniesta.com/">Jaime Iniesta</a>, an independent web developer who loves working with Ruby on Rails. That's me. :)
|
176
|
+
</p>
|
177
|
+
|
178
|
+
<h4 id="how_did_this_project_start">How did this project start?</h4>
|
179
|
+
<p>
|
180
|
+
During March 2011 I took the Ruby Core Skills course at the <a href="http://mendicantuniversity.org/">Mendicant University</a>, an intense three week course that takes you through several important topic areas every Ruby developer should be comfortable on. You can <a href="http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby">read more</a> about it at my blog.
|
181
|
+
</p>
|
182
|
+
<p>
|
183
|
+
With the help of <a href="http://majesticseacreature.com/">Gregory Brown</a> and the rest of the <a href="http://school.mendicantuniversity.org/alumni/2011">Mendicant University Alumni</a>, I built the <a href="https://github.com/jaimeiniesta/w3clove">w3clove</a> gem that allows you to do site-wide markup validation from the command line.
|
184
|
+
</p>
|
185
|
+
<p>
|
186
|
+
Afterwards, I built this <a href="http://w3clove.com">W3CLove.com</a> site to make it easier for everyone to do site-wide markup validation, with a nicer HTML interface, storing the results for later, rechecking, etc.
|
187
|
+
</p>
|
188
|
+
<p>
|
189
|
+
I want to express my gratitude to all the Mendicant University community, all of them are still helping me making W3CLove a better tool for everyone. Thank you!
|
190
|
+
</p>
|
191
|
+
|
192
|
+
<h4 id="is_this_free">Is this free?</h4>
|
193
|
+
<p>
|
194
|
+
No, this is a paid service, but you can try it for free.
|
195
|
+
</p>
|
196
|
+
|
197
|
+
<h4 id="is_there_an_open_source_version">Is there an open source version?</h4>
|
198
|
+
<p>
|
199
|
+
Yes! There's a free, standalone version that you can install on your computer. It's packed as a Ruby gem and it's open source, so you can examine the code and contribute to it if you wish.
|
200
|
+
</p>
|
201
|
+
<p>
|
202
|
+
You can find the <a href="https://github.com/jaimeiniesta/w3clove">w3clove gem at Github</a>.
|
203
|
+
</p>
|
204
|
+
|
205
|
+
<h4 id="is_there_an_api">Is there an API?</h4>
|
206
|
+
<p>
|
207
|
+
Yes! I've started building an API. It's not finished yet, but you can already validate sitemaps and pages with it. Read more about it at the <a href="/api_v1_reference">API V1 Reference</a> page.
|
208
|
+
</p>
|
209
|
+
|
210
|
+
</div>
|
211
|
+
</div>
|
212
|
+
|
213
|
+
|
214
|
+
<div class="row footer">
|
215
|
+
<div class="content">
|
216
|
+
<footer>
|
217
|
+
<p>
|
218
|
+
<!-- Begin MailChimp Signup Form -->
|
219
|
+
<div id="mc_embed_signup">
|
220
|
+
<form action="http://w3clove.us4.list-manage.com/subscribe/post?u=6af3ab69c286561d0f0f25671&id=04a0dab609" method="post" id="mc-embedded-subscribe-form" name="mc-embedded-subscribe-form" class="validate shadowins">
|
221
|
+
<label for="mce-EMAIL">Subscribe to our newsletter:</label>
|
222
|
+
<input type="email" value="your email" name="EMAIL" class="email" id="mce-EMAIL" placeholder="your email" onfocus="this.value='';" required>
|
223
|
+
<div class="clear"><input type="submit" value="♥" name="subscribe" id="mc-embedded-subscribe" class="button btn"></div>
|
224
|
+
</form>
|
225
|
+
</div>
|
226
|
+
<!--End mc_embed_signup-->
|
227
|
+
|
228
|
+
<ul class="social_share">
|
229
|
+
<li class="twitter_follow"><a href="https://twitter.com/w3clove" class="twitter-follow-button" data-button="grey" data-text-color="#FFFFFF" data-link-color="#999999" data-show-count="false">Follow</a></li>
|
230
|
+
<li class="tweets_count"><a href="http://twitter.com/share" style="display:block;" class="twitter-share-button" data-count="horizontal" data-via="w3clove" data-lang="en">Tweet</a><script type="text/javascript" src="http://platform.twitter.com/widgets.js"></script></li>
|
231
|
+
<li class="gplus_count"><div class="g-plusone" data-size="medium" data-count="true"></div></li>
|
232
|
+
</ul>
|
233
|
+
</p>
|
234
|
+
<p class="clearb"><strong>W3CLove</strong> lets you <strong>validate entire sites</strong> with one click. This is an independent project, not associated with the W3C. By making use of this website you agree to the <a href="/terms_of_service">Terms of service</a>.<br /><br />Follow us on <a href="http://twitter.com/W3CLove">Twitter</a> and <a href="http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609">subscribe to our monthly newsletter</a>.</p>
|
235
|
+
</footer>
|
236
|
+
</div>
|
237
|
+
</div>
|
238
|
+
|
239
|
+
<script type="text/javascript">
|
240
|
+
var uvOptions = {};
|
241
|
+
(function() {
|
242
|
+
var uv = document.createElement('script'); uv.type = 'text/javascript'; uv.async = true;
|
243
|
+
uv.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + 'widget.uservoice.com/nhy6YD24GjgADgFX3h5z4w.js';
|
244
|
+
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(uv, s);
|
245
|
+
})();
|
246
|
+
</script>
|
247
|
+
<script type="text/javascript">
|
248
|
+
//<![CDATA[
|
249
|
+
|
250
|
+
var menuVisible = false;
|
251
|
+
$('.nav_button').on('click', showHideMenu);
|
252
|
+
|
253
|
+
//]]>
|
254
|
+
</script>
|
255
|
+
|
256
|
+
<script type="text/javascript">if (!NREUMQ.f) { NREUMQ.f=function() {
|
257
|
+
NREUMQ.push(["load",new Date().getTime()]);
|
258
|
+
var e=document.createElement("script");
|
259
|
+
e.type="text/javascript";e.async=true;e.src="https://d1ros97qkrwjf5.cloudfront.net/39/eum/rum.js";
|
260
|
+
document.body.appendChild(e);
|
261
|
+
if(NREUMQ.a)NREUMQ.a();
|
262
|
+
};
|
263
|
+
NREUMQ.a=window.onload;window.onload=NREUMQ.f;
|
264
|
+
};
|
265
|
+
NREUMQ.push(["nrfj","beacon-1.newrelic.com","96fc3f1db6",415027,"c1hbQUcNWlhQQhsNWVdfakNaDkJVUlUbFVFXUkYaRgpZQw==",0.0,14,new Date().getTime(),"","","","",""])</script></body>
|
266
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -14,6 +14,7 @@ describe MetaInspector do
|
|
14
14
|
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
15
15
|
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
16
16
|
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
17
|
+
FakeWeb.register_uri(:get, "http://w3clove.com/faqs", :response => fixture_file("w3clove_faqs.response"))
|
17
18
|
|
18
19
|
describe 'Initialization' do
|
19
20
|
it 'should accept an URL with a scheme' do
|
@@ -30,6 +31,16 @@ describe MetaInspector do
|
|
30
31
|
MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
|
31
32
|
MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
|
32
33
|
end
|
34
|
+
|
35
|
+
it "should store the host" do
|
36
|
+
MetaInspector.new('http://pagerankalert.com').host.should == 'pagerankalert.com'
|
37
|
+
MetaInspector.new('https://pagerankalert.com').host.should == 'pagerankalert.com'
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should store the root url" do
|
41
|
+
MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
|
42
|
+
MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
|
43
|
+
end
|
33
44
|
end
|
34
45
|
|
35
46
|
describe 'Doing a basic scrape' do
|
@@ -54,8 +65,7 @@ describe MetaInspector do
|
|
54
65
|
end
|
55
66
|
|
56
67
|
it "should find all page images" do
|
57
|
-
@m.
|
58
|
-
@m.images == ["/images/pagerank_alert.png?1309512337"]
|
68
|
+
@m.images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
|
59
69
|
end
|
60
70
|
|
61
71
|
it "should ignore malformed image tags" do
|
@@ -97,29 +107,41 @@ describe MetaInspector do
|
|
97
107
|
end
|
98
108
|
|
99
109
|
it "should get the links" do
|
100
|
-
@m.links.should == [
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
110
|
+
@m.links.should == [ "http://pagerankalert.com/",
|
111
|
+
"http://pagerankalert.com/es?language=es",
|
112
|
+
"http://pagerankalert.com/users/sign_up",
|
113
|
+
"http://pagerankalert.com/users/sign_in",
|
114
|
+
"mailto:pagerankalert@gmail.com",
|
115
|
+
"http://pagerankalert.posterous.com",
|
116
|
+
"http://twitter.com/pagerankalert",
|
117
|
+
"http://twitter.com/share" ]
|
118
|
+
end
|
119
|
+
|
120
|
+
it "should get correct absolute links for internal pages" do
|
121
|
+
m = MetaInspector.new('http://w3clove.com/faqs')
|
122
|
+
m.links.should == [ "http://w3clove.com/faqs/#",
|
123
|
+
"http://w3clove.com/",
|
124
|
+
"http://w3clove.com/faqs",
|
125
|
+
"http://w3clove.com/plans-and-pricing",
|
126
|
+
"http://w3clove.com/contact",
|
127
|
+
"http://w3clove.com/charts/errors",
|
128
|
+
"http://w3clove.com/credits",
|
129
|
+
"http://w3clove.com/signin",
|
130
|
+
"http://validator.w3.org",
|
131
|
+
"http://www.sitemaps.org/",
|
132
|
+
"http://jaimeiniesta.com/",
|
133
|
+
"http://mendicantuniversity.org/",
|
134
|
+
"http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
|
135
|
+
"http://majesticseacreature.com/",
|
136
|
+
"http://school.mendicantuniversity.org/alumni/2011",
|
137
|
+
"https://github.com/jaimeiniesta/w3clove",
|
138
|
+
"http://w3clove.com",
|
139
|
+
"http://w3clove.com/api_v1_reference",
|
140
|
+
"https://twitter.com/w3clove",
|
141
|
+
"http://twitter.com/share",
|
142
|
+
"http://w3clove.com/terms_of_service",
|
143
|
+
"http://twitter.com/W3CLove",
|
144
|
+
"http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
|
123
145
|
end
|
124
146
|
end
|
125
147
|
|
@@ -138,10 +160,6 @@ describe MetaInspector do
|
|
138
160
|
"telnet://telnet.cdrom.com"
|
139
161
|
]
|
140
162
|
end
|
141
|
-
|
142
|
-
it "should return the same links as absolute links do" do
|
143
|
-
@m.absolute_links.should == @m.links
|
144
|
-
end
|
145
163
|
end
|
146
164
|
|
147
165
|
describe 'Protocol-relative URLs' do
|
@@ -151,13 +169,13 @@ describe MetaInspector do
|
|
151
169
|
end
|
152
170
|
|
153
171
|
it "should convert protocol-relative links to http" do
|
154
|
-
@m_http.
|
155
|
-
@m_http.
|
172
|
+
@m_http.links.should include('http://protocol-relative.com/contact')
|
173
|
+
@m_http.links.should include('http://yahoo.com')
|
156
174
|
end
|
157
175
|
|
158
176
|
it "should convert protocol-relative links to https" do
|
159
|
-
@m_https.
|
160
|
-
@m_https.
|
177
|
+
@m_https.links.should include('https://protocol-relative.com/contact')
|
178
|
+
@m_https.links.should include('https://yahoo.com')
|
161
179
|
end
|
162
180
|
end
|
163
181
|
|
@@ -227,7 +245,7 @@ describe MetaInspector do
|
|
227
245
|
describe 'to_hash' do
|
228
246
|
it "should return a hash with all the values set" do
|
229
247
|
@m = MetaInspector.new('http://pagerankalert.com')
|
230
|
-
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "
|
248
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
231
249
|
end
|
232
250
|
end
|
233
251
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 59
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 9
|
9
|
-
-
|
10
|
-
version: 1.9.
|
9
|
+
- 4
|
10
|
+
version: 1.9.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-07-
|
18
|
+
date: 2012-07-23 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -162,6 +162,7 @@ files:
|
|
162
162
|
- spec/fixtures/tea-tron.com.response
|
163
163
|
- spec/fixtures/theonion-no-description.com.response
|
164
164
|
- spec/fixtures/theonion.com.response
|
165
|
+
- spec/fixtures/w3clove_faqs.response
|
165
166
|
- spec/fixtures/youtube.response
|
166
167
|
- spec/metainspector_spec.rb
|
167
168
|
- spec/spec_helper.rb
|