metainspector 1.9.3 → 1.9.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -32,15 +32,15 @@ Then you can see the scraped data like this:
32
32
 
33
33
  page.url # URL of the page
34
34
  page.scheme # Scheme of the page (http, https)
35
+ page.host # Hostname of the page (like, w3clove.com, without the scheme)
36
+ page.root_url # Root url (scheme + host, like http://w3clove.com/)
35
37
  page.title # title of the page, as string
36
- page.links # array of strings, with every link found on the page
37
- page.absolute_links # array of all the links converted to absolute urls
38
+ page.links # array of strings, with every link found on the page as an absolute URL
38
39
  page.meta_description # meta description, as string
39
40
  page.description # returns the meta description, or the first long paragraph if no meta description is found
40
41
  page.meta_keywords # meta keywords, as string
41
42
  page.image # Most relevant image, if defined with og:image
42
- page.images # array of strings, with every img found on the page
43
- page.absolute_images # array of all the images converted to absolute urls
43
+ page.images # array of strings, with every img found on the page as an absolute URL
44
44
  page.feed # Get rss or atom links in meta data fields as array
45
45
  page.meta_og_title # opengraph title
46
46
  page.meta_og_image # opengraph image
@@ -9,13 +9,15 @@ require 'timeout'
9
9
  # MetaInspector provides an easy way to scrape web pages and get its elements
10
10
  module MetaInspector
11
11
  class Scraper
12
- attr_reader :url, :scheme, :errors
12
+ attr_reader :url, :scheme, :host, :root_url, :errors
13
13
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
14
  # If no scheme given, set it to http:// by default
15
15
 
16
16
  def initialize(url, timeout = 20)
17
17
  @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
18
18
  @scheme = URI.parse(url).scheme || 'http'
19
+ @host = URI.parse(url).host
20
+ @root_url = "#{@scheme}://#{@host}/"
19
21
  @timeout = timeout
20
22
  @data = Hashie::Rash.new('url' => @url)
21
23
  @errors = []
@@ -33,26 +35,24 @@ module MetaInspector
33
35
  meta_description.nil? ? secondary_description : meta_description
34
36
  end
35
37
 
36
- # Returns the parsed document links
38
+ # Links found on the page, as absolute URLs
37
39
  def links
38
- @data.links ||= parsed_document.search("//a") \
39
- .map {|link| link.attributes["href"] \
40
- .to_s.strip}.uniq rescue nil
40
+ @data.links ||= parsed_links.map { |l| absolutify_url(unrelativize_url(l)) }
41
41
  end
42
42
 
43
- def images
44
- @data.images ||= parsed_document.search('//img') \
45
- .reject{|i| i.attributes['src'].blank? } \
46
- .map{ |i| i.attributes['src'].value }.uniq
43
+ def absolute_links
44
+ warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
45
+ links
47
46
  end
48
47
 
49
- # Returns the links converted to absolute urls
50
- def absolute_links
51
- @data.absolute_links ||= links.map { |l| absolutify_url(unrelativize_url(l)) }
48
+ # Images found on the page, as absolute URLs
49
+ def images
50
+ @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
52
51
  end
53
52
 
54
53
  def absolute_images
55
- @data.absolute_images ||= images.map{ |i| absolutify_url(i) }
54
+ warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
55
+ images
56
56
  end
57
57
 
58
58
  # Returns the parsed document meta rss links
@@ -81,7 +81,7 @@ module MetaInspector
81
81
  # Returns all parsed data as a nested Hash
82
82
  def to_hash
83
83
  # TODO: find a better option to populate the data to the Hash
84
- image;feed;links;charset;absolute_links;title;meta_keywords
84
+ image;images;feed;links;charset;title;meta_keywords
85
85
  @data.to_hash
86
86
  end
87
87
 
@@ -146,6 +146,18 @@ module MetaInspector
146
146
 
147
147
  private
148
148
 
149
+ def parsed_links
150
+ @parsed_links ||= parsed_document.search("//a") \
151
+ .map {|link| link.attributes["href"] \
152
+ .to_s.strip}.uniq rescue nil
153
+ end
154
+
155
+ def parsed_images
156
+ @parsed_images ||= parsed_document.search('//img') \
157
+ .reject{|i| i.attributes['src'].blank? } \
158
+ .map{ |i| i.attributes['src'].value }.uniq
159
+ end
160
+
149
161
  # Stores the error for later inspection
150
162
  def add_fatal_error(error)
151
163
  @errors << error
@@ -154,7 +166,15 @@ module MetaInspector
154
166
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
155
167
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
156
168
  def absolutify_url(url)
157
- url =~ /^\w*\:/i ? url : File.join(@url,url)
169
+ if url =~ /^\w*\:/i
170
+ url
171
+ else
172
+ if url[0] == "/"
173
+ File.join(@root_url, url)
174
+ else
175
+ File.join(@url, url)
176
+ end
177
+ end
158
178
  end
159
179
 
160
180
  # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.9.3"
4
+ VERSION = "1.9.4"
5
5
  end
@@ -0,0 +1,266 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx
3
+ Date: Mon, 23 Jul 2012 08:44:12 GMT
4
+ Content-Type: text/html; charset=utf-8
5
+ Connection: keep-alive
6
+ Status: 200 OK
7
+ X-Ua-Compatible: IE=Edge,chrome=1
8
+ Etag: "c4f3d4aaf12acce6a909714618e08934"
9
+ Cache-Control: max-age=0, private, must-revalidate
10
+ Set-Cookie: _w3clovesite_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFRkkiJTJiMWU0NzVkNjJjNDliMDRlZGI3MjI5OTVlN2U4MjU5BjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMTlWUmVSMEVlTWNuV0t4cTFuNHUvQVozZCttMjhxRTEvWFhYYW5hOXRFdUk9BjsARg%3D%3D--7e9d3e900c9531363297f469f8baa3e3ed31336a; path=/; HttpOnly
11
+ X-Request-Id: 33ca78a4044d244e673d273a59fa4ebc
12
+ X-Runtime: 0.017688
13
+ X-Rack-Cache: miss
14
+ Content-Length: 12923
15
+ X-Varnish: 647613022
16
+ Age: 0
17
+ Via: 1.1 varnish
18
+
19
+ <!DOCTYPE html>
20
+ <html>
21
+
22
+ <head><script type="text/javascript">var NREUMQ=NREUMQ||[];NREUMQ.push(["mark","firstbyte",new Date().getTime()]);</script>
23
+ <meta charset="UTF-8" />
24
+ <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
25
+ <title>Whole site HTML validator | W3CLove</title>
26
+ <link rel="shortcut icon" type="image/x-icon" href="/favicon.ico" />
27
+ <meta name="description" content="Site-wide markup validation tool. Validate the markup of your whole site with just one click." />
28
+ <meta name="keywords" content="html, markup, validation, validator, tool, w3c, development, standards, free" />
29
+ <link href="http://cdn-images.mailchimp.com/embedcode/slim-081711.css" rel="stylesheet" type="text/css">
30
+ <link href="http://fonts.googleapis.com/css?family=Terminal+Dosis:400,600" rel="stylesheet" type="text/css" />
31
+ <link href="/assets/application-9da2f67bc1bc6e19a801cb7685a0b497.css" media="screen" rel="stylesheet" type="text/css" />
32
+ <meta content="authenticity_token" name="csrf-param" />
33
+ <meta content="9VReR0EeMcnWKxq1n4u/AZ3d+m28qE1/XXXana9tEuI=" name="csrf-token" />
34
+ <script src="/assets/application-4e8aa1a929a0aeab6bdf339edecbeaa6.js" type="text/javascript"></script>
35
+ <script src="/assets/pages-7270767b2a9e9fff880aa5de378ca791.js" type="text/javascript"></script>
36
+ <script src="https://apis.google.com/js/plusone.js" type="text/javascript"></script>
37
+
38
+ <script type="text/javascript">
39
+
40
+ var _gaq = _gaq || [];
41
+ _gaq.push(['_setAccount', 'UA-122379-37']);
42
+ _gaq.push(['_trackPageview']);
43
+
44
+ (function() {
45
+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
46
+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
47
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
48
+ })();
49
+
50
+ </script>
51
+ </head>
52
+
53
+ <body>
54
+
55
+ <div id="flash_message"><div><span id="flash_message_text"></span><a class="close-message" href="#" onclick="closeMsgBar();">X</a></div></div>
56
+
57
+
58
+
59
+ <div class="row top-bar">
60
+ <div class="content">
61
+ <header>
62
+ <div class="header">
63
+ <h1><a href="/" title="W3CLove">W3CLove</a></h1>
64
+ <nav>
65
+ <a href="#" class="nav_button"></a>
66
+ <ul class="nav">
67
+ <li><a href="/faqs" class='active'>FAQs</a></li>
68
+ <li><a href="/plans-and-pricing" >Plans and pricing</a></li>
69
+ <li><a href="/contact" >Contact</a></li>
70
+ <li><a href="/charts/errors" >Top 100 Errors</a></li>
71
+ </ul>
72
+ </nav>
73
+
74
+
75
+ <div id="sign_in">
76
+ <a href="/credits" class="label warning" id="credits_count">10 Credits</a>
77
+
78
+ <a href="/signin" class="label success">Sign in</a>
79
+ </div>
80
+ </div>
81
+ </header>
82
+ </div>
83
+ </div>
84
+
85
+ <div id="faqs" class="row hero">
86
+
87
+ <div class="content">
88
+
89
+ <h2>FAQs</h2>
90
+ <h3>Frequently Asked Questions</h3>
91
+
92
+ </div>
93
+
94
+ </div>
95
+
96
+
97
+ <div class="row description">
98
+
99
+ <div class="content">
100
+
101
+ <h4 id="what_is_w3clove">What is W3CLove?</h4>
102
+ <p>
103
+ W3CLove is a site-wide markup validation tool. It allows you to check the validity of the markup of several pages
104
+ from your website, and gives you a summary of the most common errors and warnings, with a single click.
105
+ </p>
106
+
107
+ <h4 id="why_not_just_use_the_official_w3c_validator">Why not just use the official W3C validator?</h4>
108
+ <p>
109
+ The <a href="http://validator.w3.org">official W3C validator</a> does not yet provide a way to submit several URLs at once. So, if you want to check your whole
110
+ website, you need to submit each of your URLs individually, which is a slow process. W3CLove provides a simpler, faster way to
111
+ submit several pages at once.
112
+ </p>
113
+
114
+ <h4 id="how_can_i_submit_my_site_for_validation">How can I submit a site for validation?</h4>
115
+ <p>
116
+ To submit a site, just enter its URL on the <a href="/">front page</a>, and click the "Validate" button. The W3CLove spider will crawl the site
117
+ in search for internal links, validate each of them, and then compile all errors and warnings in one summary.
118
+ </p>
119
+
120
+ <h4 id="how_can_i_specify_the_exact_urls_i_want_to_validate">How can I specify the exact URLs I want to validate?</h4>
121
+ <p>
122
+ The W3CLove spider will crawl the provided URL in search for internal links, but you can also provide an <a href="http://www.sitemaps.org/">XML sitemap</a> with the exact URLs
123
+ you need to validate.
124
+ </p>
125
+
126
+ <h4 id="is_there_a_limit_on_the_number_of_urls">Is there a limit on the number of URLs to validate?</h4>
127
+ <p>
128
+ Yes, there is a limit of 250 URLs per each sitemap submitted. This should be enough for most sites to get a good idea of the
129
+ validation status of the site, and saves processing time for both W3CLove and the W3C validator.
130
+ </p>
131
+
132
+ <h4 id="how_can_i_resubmit_a_site">How can I resubmit a site?</h4>
133
+ <p>
134
+ Just click on the "Re-check" buttons. You can recheck the whole sitemap or individual pages.
135
+ </p>
136
+
137
+ <h4 id="can_i_store_my_sitemaps_list">Can I store my sitemaps list?</h4>
138
+ <p>
139
+ Yes, W3CLove lets you store for free a list of the sitemaps you're interested in validating. Just sign in with your Twitter, Facebook or Google account and every sitemap you validate will appear on your sitemaps list.
140
+ </p>
141
+
142
+ <h4 id="how_do_credits_work">How do credits work?</h4>
143
+ <p>
144
+ When you sign up for the first time at W3CLove, you're given 100 initial credits so you can try the service for free.
145
+ </p>
146
+ <p>
147
+ For every single web page validation that you make using our service, you're charged 1 credit. So, for example, if you start with 100 credits and you validate a site that has 30 web pages, you end up with 70 credits.
148
+ </p>
149
+ <p>
150
+ Once you spend all your credits, you can't make more validations until you recharge them.
151
+ </p>
152
+
153
+ <h4 id="how_can_i_recharge_my_credits">How can I recharge my credits?</h4>
154
+ <p>
155
+ The easiest way to recharge your credits is through a monthly subscription.
156
+ </p>
157
+ <p>
158
+ This way, your credits will be recharged every month up to the monthly limit of your chosen plan.
159
+ </p>
160
+ <p>
161
+ Check out the <a href="/plans-and-pricing">Plans and pricing</a> page to see what plan is best for you. If you're not sure about how many validations you need, you can buy packs of validations.
162
+ </p>
163
+
164
+ <h4 id="how_can_i_sign_in_with_another_account">I've signed in from one account, how can I sign in with a different one?</h4>
165
+ <p>
166
+ For your convenience, you're first shown 3 ways to sign in: Twitter, Facebook and Google. When you use one of those, W3CLove will remember your preference and offer just this one.
167
+ </p>
168
+ <p>
169
+ If you'd like to change this preference, just sign in again with your preferred account:<br/>
170
+ <a href='#' onclick='window.location="/auth/twitter"; return false;' style='color:white;'><span class='label success'>twitter</span></a>, <a href='#' onclick='window.location="/auth/facebook"; return false;' style='color:white;'><span class='label success'>facebook</span></a> or <a href='#' onclick='window.location="/auth/google_oauth2"; return false;' style='color:white;'><span class='label success'>google</span></a>.
171
+ </p>
172
+
173
+ <h4 id="who_is_behind_all_this">Who is behind all this?</h4>
174
+ <p>
175
+ W3CLove is a personal project maintained by <a href="http://jaimeiniesta.com/">Jaime Iniesta</a>, an independent web developer who loves working with Ruby on Rails. That's me. :)
176
+ </p>
177
+
178
+ <h4 id="how_did_this_project_start">How did this project start?</h4>
179
+ <p>
180
+ During March 2011 I took the Ruby Core Skills course at the <a href="http://mendicantuniversity.org/">Mendicant University</a>, an intense three week course that takes you through several important topic areas every Ruby developer should be comfortable on. You can <a href="http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby">read more</a> about it at my blog.
181
+ </p>
182
+ <p>
183
+ With the help of <a href="http://majesticseacreature.com/">Gregory Brown</a> and the rest of the <a href="http://school.mendicantuniversity.org/alumni/2011">Mendicant University Alumni</a>, I built the <a href="https://github.com/jaimeiniesta/w3clove">w3clove</a> gem that allows you to do site-wide markup validation from the command line.
184
+ </p>
185
+ <p>
186
+ Afterwards, I built this <a href="http://w3clove.com">W3CLove.com</a> site to make it easier for everyone to do site-wide markup validation, with a nicer HTML interface, storing the results for later, rechecking, etc.
187
+ </p>
188
+ <p>
189
+ I want to express my gratitude to all the Mendicant University community, all of them are still helping me making W3CLove a better tool for everyone. Thank you!
190
+ </p>
191
+
192
+ <h4 id="is_this_free">Is this free?</h4>
193
+ <p>
194
+ No, this is a paid service, but you can try it for free.
195
+ </p>
196
+
197
+ <h4 id="is_there_an_open_source_version">Is there an open source version?</h4>
198
+ <p>
199
+ Yes! There's a free, standalone version that you can install on your computer. It's packed as a Ruby gem and it's open source, so you can examine the code and contribute to it if you wish.
200
+ </p>
201
+ <p>
202
+ You can find the <a href="https://github.com/jaimeiniesta/w3clove">w3clove gem at Github</a>.
203
+ </p>
204
+
205
+ <h4 id="is_there_an_api">Is there an API?</h4>
206
+ <p>
207
+ Yes! I've started building an API. It's not finished yet, but you can already validate sitemaps and pages with it. Read more about it at the <a href="/api_v1_reference">API V1 Reference</a> page.
208
+ </p>
209
+
210
+ </div>
211
+ </div>
212
+
213
+
214
+ <div class="row footer">
215
+ <div class="content">
216
+ <footer>
217
+ <p>
218
+ <!-- Begin MailChimp Signup Form -->
219
+ <div id="mc_embed_signup">
220
+ <form action="http://w3clove.us4.list-manage.com/subscribe/post?u=6af3ab69c286561d0f0f25671&amp;id=04a0dab609" method="post" id="mc-embedded-subscribe-form" name="mc-embedded-subscribe-form" class="validate shadowins">
221
+ <label for="mce-EMAIL">Subscribe to our newsletter:</label>
222
+ <input type="email" value="your email" name="EMAIL" class="email" id="mce-EMAIL" placeholder="your email" onfocus="this.value='';" required>
223
+ <div class="clear"><input type="submit" value="♥" name="subscribe" id="mc-embedded-subscribe" class="button btn"></div>
224
+ </form>
225
+ </div>
226
+ <!--End mc_embed_signup-->
227
+
228
+ <ul class="social_share">
229
+ <li class="twitter_follow"><a href="https://twitter.com/w3clove" class="twitter-follow-button" data-button="grey" data-text-color="#FFFFFF" data-link-color="#999999" data-show-count="false">Follow</a></li>
230
+ <li class="tweets_count"><a href="http://twitter.com/share" style="display:block;" class="twitter-share-button" data-count="horizontal" data-via="w3clove" data-lang="en">Tweet</a><script type="text/javascript" src="http://platform.twitter.com/widgets.js"></script></li>
231
+ <li class="gplus_count"><div class="g-plusone" data-size="medium" data-count="true"></div></li>
232
+ </ul>
233
+ </p>
234
+ <p class="clearb"><strong>W3CLove</strong> lets you <strong>validate entire sites</strong> with one click. This is an independent project, not associated with the W3C. By making use of this website you agree to the <a href="/terms_of_service">Terms of service</a>.<br /><br />Follow us on <a href="http://twitter.com/W3CLove">Twitter</a> and <a href="http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609">subscribe to our monthly newsletter</a>.</p>
235
+ </footer>
236
+ </div>
237
+ </div>
238
+
239
+ <script type="text/javascript">
240
+ var uvOptions = {};
241
+ (function() {
242
+ var uv = document.createElement('script'); uv.type = 'text/javascript'; uv.async = true;
243
+ uv.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + 'widget.uservoice.com/nhy6YD24GjgADgFX3h5z4w.js';
244
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(uv, s);
245
+ })();
246
+ </script>
247
+ <script type="text/javascript">
248
+ //<![CDATA[
249
+
250
+ var menuVisible = false;
251
+ $('.nav_button').on('click', showHideMenu);
252
+
253
+ //]]>
254
+ </script>
255
+
256
+ <script type="text/javascript">if (!NREUMQ.f) { NREUMQ.f=function() {
257
+ NREUMQ.push(["load",new Date().getTime()]);
258
+ var e=document.createElement("script");
259
+ e.type="text/javascript";e.async=true;e.src="https://d1ros97qkrwjf5.cloudfront.net/39/eum/rum.js";
260
+ document.body.appendChild(e);
261
+ if(NREUMQ.a)NREUMQ.a();
262
+ };
263
+ NREUMQ.a=window.onload;window.onload=NREUMQ.f;
264
+ };
265
+ NREUMQ.push(["nrfj","beacon-1.newrelic.com","96fc3f1db6",415027,"c1hbQUcNWlhQQhsNWVdfakNaDkJVUlUbFVFXUkYaRgpZQw==",0.0,14,new Date().getTime(),"","","","",""])</script></body>
266
+ </html>
@@ -14,6 +14,7 @@ describe MetaInspector do
14
14
  FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
15
15
  FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
16
16
  FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
17
+ FakeWeb.register_uri(:get, "http://w3clove.com/faqs", :response => fixture_file("w3clove_faqs.response"))
17
18
 
18
19
  describe 'Initialization' do
19
20
  it 'should accept an URL with a scheme' do
@@ -30,6 +31,16 @@ describe MetaInspector do
30
31
  MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
31
32
  MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
32
33
  end
34
+
35
+ it "should store the host" do
36
+ MetaInspector.new('http://pagerankalert.com').host.should == 'pagerankalert.com'
37
+ MetaInspector.new('https://pagerankalert.com').host.should == 'pagerankalert.com'
38
+ end
39
+
40
+ it "should store the root url" do
41
+ MetaInspector.new('http://pagerankalert.com').root_url.should == 'http://pagerankalert.com/'
42
+ MetaInspector.new('https://pagerankalert.com').root_url.should == 'https://pagerankalert.com/'
43
+ end
33
44
  end
34
45
 
35
46
  describe 'Doing a basic scrape' do
@@ -54,8 +65,7 @@ describe MetaInspector do
54
65
  end
55
66
 
56
67
  it "should find all page images" do
57
- @m.absolute_images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
58
- @m.images == ["/images/pagerank_alert.png?1309512337"]
68
+ @m.images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
59
69
  end
60
70
 
61
71
  it "should ignore malformed image tags" do
@@ -97,29 +107,41 @@ describe MetaInspector do
97
107
  end
98
108
 
99
109
  it "should get the links" do
100
- @m.links.should == [
101
- "/",
102
- "/es?language=es",
103
- "/users/sign_up",
104
- "/users/sign_in",
105
- "mailto:pagerankalert@gmail.com",
106
- "http://pagerankalert.posterous.com",
107
- "http://twitter.com/pagerankalert",
108
- "http://twitter.com/share"
109
- ]
110
- end
111
-
112
- it "should convert links to absolute urls" do
113
- @m.absolute_links.should == [
114
- "http://pagerankalert.com/",
115
- "http://pagerankalert.com/es?language=es",
116
- "http://pagerankalert.com/users/sign_up",
117
- "http://pagerankalert.com/users/sign_in",
118
- "mailto:pagerankalert@gmail.com",
119
- "http://pagerankalert.posterous.com",
120
- "http://twitter.com/pagerankalert",
121
- "http://twitter.com/share"
122
- ]
110
+ @m.links.should == [ "http://pagerankalert.com/",
111
+ "http://pagerankalert.com/es?language=es",
112
+ "http://pagerankalert.com/users/sign_up",
113
+ "http://pagerankalert.com/users/sign_in",
114
+ "mailto:pagerankalert@gmail.com",
115
+ "http://pagerankalert.posterous.com",
116
+ "http://twitter.com/pagerankalert",
117
+ "http://twitter.com/share" ]
118
+ end
119
+
120
+ it "should get correct absolute links for internal pages" do
121
+ m = MetaInspector.new('http://w3clove.com/faqs')
122
+ m.links.should == [ "http://w3clove.com/faqs/#",
123
+ "http://w3clove.com/",
124
+ "http://w3clove.com/faqs",
125
+ "http://w3clove.com/plans-and-pricing",
126
+ "http://w3clove.com/contact",
127
+ "http://w3clove.com/charts/errors",
128
+ "http://w3clove.com/credits",
129
+ "http://w3clove.com/signin",
130
+ "http://validator.w3.org",
131
+ "http://www.sitemaps.org/",
132
+ "http://jaimeiniesta.com/",
133
+ "http://mendicantuniversity.org/",
134
+ "http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
135
+ "http://majesticseacreature.com/",
136
+ "http://school.mendicantuniversity.org/alumni/2011",
137
+ "https://github.com/jaimeiniesta/w3clove",
138
+ "http://w3clove.com",
139
+ "http://w3clove.com/api_v1_reference",
140
+ "https://twitter.com/w3clove",
141
+ "http://twitter.com/share",
142
+ "http://w3clove.com/terms_of_service",
143
+ "http://twitter.com/W3CLove",
144
+ "http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
123
145
  end
124
146
  end
125
147
 
@@ -138,10 +160,6 @@ describe MetaInspector do
138
160
  "telnet://telnet.cdrom.com"
139
161
  ]
140
162
  end
141
-
142
- it "should return the same links as absolute links do" do
143
- @m.absolute_links.should == @m.links
144
- end
145
163
  end
146
164
 
147
165
  describe 'Protocol-relative URLs' do
@@ -151,13 +169,13 @@ describe MetaInspector do
151
169
  end
152
170
 
153
171
  it "should convert protocol-relative links to http" do
154
- @m_http.absolute_links.should include('http://protocol-relative.com/contact')
155
- @m_http.absolute_links.should include('http://yahoo.com')
172
+ @m_http.links.should include('http://protocol-relative.com/contact')
173
+ @m_http.links.should include('http://yahoo.com')
156
174
  end
157
175
 
158
176
  it "should convert protocol-relative links to https" do
159
- @m_https.absolute_links.should include('https://protocol-relative.com/contact')
160
- @m_https.absolute_links.should include('https://yahoo.com')
177
+ @m_https.links.should include('https://protocol-relative.com/contact')
178
+ @m_https.links.should include('https://yahoo.com')
161
179
  end
162
180
  end
163
181
 
@@ -227,7 +245,7 @@ describe MetaInspector do
227
245
  describe 'to_hash' do
228
246
  it "should return a hash with all the values set" do
229
247
  @m = MetaInspector.new('http://pagerankalert.com')
230
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
248
+ @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
231
249
  end
232
250
  end
233
251
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 53
4
+ hash: 59
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 9
9
- - 3
10
- version: 1.9.3
9
+ - 4
10
+ version: 1.9.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-07-22 00:00:00 Z
18
+ date: 2012-07-23 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
@@ -162,6 +162,7 @@ files:
162
162
  - spec/fixtures/tea-tron.com.response
163
163
  - spec/fixtures/theonion-no-description.com.response
164
164
  - spec/fixtures/theonion.com.response
165
+ - spec/fixtures/w3clove_faqs.response
165
166
  - spec/fixtures/youtube.response
166
167
  - spec/metainspector_spec.rb
167
168
  - spec/spec_helper.rb