metainspector 4.2.0 → 4.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bbcf96088ef49b859442dfd0244bda8b7e4870fb
4
- data.tar.gz: 3f87e155f4e1d260f6eff96867b458278a8a450b
3
+ metadata.gz: bd4bfdcaa225ae095a22dc7d853daa2046e5c764
4
+ data.tar.gz: 6d7588c9732ea0a4f2e6512c92aff721874197e6
5
5
  SHA512:
6
- metadata.gz: 005a2f07c88b2ca40bcf970ef7eabe2eb0b40d76bb1556a67a0bfbbcc170ab6be1836942dd841d97cd9a110fb25dcd8cd6e3aeb3f164cd2f3dcc020bb7708d27
7
- data.tar.gz: 26ef69520abd2564e431a22dd3e6d139263e7163a250478c8a70dd41d3fadbfb22a3828757d645192d763d7ea0e2c5298b53d874a3f999b03ff1d68451ad422b
6
+ metadata.gz: 85974aa0874dfd9e2f90c97416ceac9c9512ce4919cc8f90a2e74078d388d5e0d214db16ef13958975d110760ef88ff861cd16d0e8a654bcc00df142fa2616b9
7
+ data.tar.gz: 0274b0b000bc61fd88f08090c2ede9bed40055f7f351c522991e73289585a1948fedd4bd158a1cf07749cd6d31c8dccaac051903a684f7c341a3004471c830a8
data/README.md CHANGED
@@ -8,6 +8,13 @@ You give it an URL, and it lets you easily get its title, links, images, charset
8
8
 
9
9
  You can try MetaInspector live at this little demo: [https://metainspectordemo.herokuapp.com](https://metainspectordemo.herokuapp.com)
10
10
 
11
+ ## Changes in 4.2.1
12
+
13
+ * The Document API has been extended with one new method:
14
+
15
+ * `page.best_title` returns the longest text available from a selection of candidates: the `<title>` tags in the head and body, the `og:title` meta tag and the first `<h1>`.
16
+
17
+
11
18
  ## Changes in 4.2.0
12
19
 
13
20
  * The images API has been extended, with two new methods:
@@ -48,7 +48,8 @@ module MetaInspector
48
48
 
49
49
  delegate [:content_type, :response] => :@request
50
50
 
51
- delegate [:parsed, :title, :description, :links,
51
+ delegate [:parsed, :title, :best_title,
52
+ :description, :links,
52
53
  :images, :feed, :charset, :meta_tags,
53
54
  :meta_tag, :meta, :favicon] => :@parser
54
55
 
@@ -25,7 +25,7 @@ module MetaInspector
25
25
  delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
26
26
  delegate [:links, :feed, :base_url] => :@links_parser
27
27
  delegate :images => :@images_parser
28
- delegate [:title, :description] => :@texts_parser
28
+ delegate [:title, :best_title, :description] => :@texts_parser
29
29
 
30
30
  # Returns the whole parsed document
31
31
  def parsed
@@ -9,6 +9,24 @@ module MetaInspector
9
9
  @title ||= parsed.css('head title').inner_text rescue nil
10
10
  end
11
11
 
12
# Returns the longest title-like text found in the document, or nil when
# the document offers no candidate at all.
#
# Candidates are every <title> matched inside the head, every <title>
# matched inside the body, the og:title meta value and the first <h1>.
# Each candidate has its runs of whitespace (including line breaks)
# collapsed to a single space and is stripped BEFORE lengths are compared,
# so surrounding padding can never make a shorter title win.
def best_title
  @best_title ||= begin
    candidates = [
      parsed.css('head title'),
      parsed.css('body title'),
      meta['og:title'],
      parsed.css('h1').first
    ].flatten.compact

    texts = candidates.map do |candidate|
      # Nodes expose their text through inner_text; meta values are plain strings.
      text = candidate.respond_to?(:inner_text) ? candidate.inner_text : candidate
      text.gsub(/\s+/, ' ').strip
    end

    # max_by yields nil for an empty list, so a page without any candidate
    # returns nil instead of raising NoMethodError on nil.strip.
    texts.max_by(&:length)
  end
end
29
+
12
30
  # A description getter that first checks for a meta description
13
31
  # and if not present will guess by looking at the first paragraph
14
32
  # with more than 120 characters
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = "4.2.0"
2
+ VERSION = "4.2.1"
3
3
  end
@@ -0,0 +1,42 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 695
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head>
18
+ <title>This title
19
+
20
+ is in
21
+
22
+
23
+ the head
24
+
25
+ and has blank lines in it, making it artificially long
26
+
27
+
28
+ </title>
29
+
30
+ <meta property="og:title" content="This OG title is long, but not long enough" />
31
+
32
+
33
+
34
+ </head>
35
+ <body>
36
+ <title>This title is short</title>
37
+ <h1>This title came from the first h1 and should be the longest of them all, so should be chosen</h1>
38
+ <h1>This came from the second h1 and should be ignored</h1>
39
+ <h1>This came from the third h1 and should also be ignored</h1>
40
+ <p>A sample page with many types of meta tags</p>
41
+ </body>
42
+ </html>
@@ -0,0 +1,22 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 265
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head></head>
18
+ <body>
19
+ <title>This title came from the body, not the head</title>
20
+ <p>A sample page with many types of meta tags</p>
21
+ </body>
22
+ </html>
@@ -0,0 +1,24 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 381
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head></head>
18
+ <body>
19
+ <h1>This title came from the first h1</h1>
20
+ <h1>This came from the second h1 and should be ignored</h1>
21
+ <h1>This came from the third h1 and should also be ignored</h1>
22
+ <p>A sample page with many types of meta tags</p>
23
+ </body>
24
+ </html>
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 265
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head>
18
+ <title>This title came from the head</title>
19
+ </head>
20
+ <body>
21
+ <p>A sample page with many types of meta tags</p>
22
+ </body>
23
+ </html>
@@ -0,0 +1,26 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 265
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head>
18
+ <title> This title came from the head and has leading and trailing whitespace
19
+
20
+
21
+ </title>
22
+ </head>
23
+ <body>
24
+ <p>A sample page with many types of meta tags</p>
25
+ </body>
26
+ </html>
@@ -3,10 +3,42 @@ require 'spec_helper'
3
3
  describe MetaInspector do
4
4
  it "should get the title from the head section" do
5
5
  page = MetaInspector.new('http://example.com')
6
-
7
6
  page.title.should == 'An example page'
8
7
  end
9
8
 
9
+ describe '#best_title' do
10
+ it "should find 'head title' when that's the only thing" do
11
+ page = MetaInspector.new('http://example.com/title_in_head')
12
+ page.best_title.should == 'This title came from the head'
13
+ end
14
+
15
+ it "should find 'body title' when that's the only thing" do
16
+ page = MetaInspector.new('http://example.com/title_in_body')
17
+ page.best_title.should == 'This title came from the body, not the head'
18
+ end
19
+
20
+ it "should find 'og:title' when that's the only thing" do
21
+ page = MetaInspector.new('http://example.com/meta-tags')
22
+ page.best_title.should == 'An OG title'
23
+ end
24
+
25
+ it "should find the first <h1> when that's the only thing" do
26
+ page = MetaInspector.new('http://example.com/title_in_h1')
27
+ page.best_title.should == 'This title came from the first h1'
28
+ end
29
+
30
+ it "should choose the longest candidate from the available options" do
31
+ page = MetaInspector.new('http://example.com/title_best_choice')
32
+ page.best_title.should == 'This title came from the first h1 and should be the longest of them all, so should be chosen'
33
+ end
34
+
35
+ it "should strip leading and trailing whitespace and all line breaks" do
36
+ page = MetaInspector.new('http://example.com/title_in_head_with_whitespace')
37
+ page.best_title.should == 'This title came from the head and has leading and trailing whitespace'
38
+ end
39
+
40
+ end
41
+
10
42
  describe '#description' do
11
43
  it "should find description from meta description" do
12
44
  page = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
data/spec/spec_helper.rb CHANGED
@@ -37,6 +37,13 @@ FakeWeb.register_uri(:get, "http://example.com/largest_image_using_image_size",
37
37
  FakeWeb.register_uri(:get, "http://example.com/10x10", :response => fixture_file("10x10.jpg.response"))
38
38
  FakeWeb.register_uri(:get, "http://example.com/100x100", :response => fixture_file("100x100.jpg.response"))
39
39
 
40
+ # Used to test best_title logic
41
+ FakeWeb.register_uri(:get, "http://example.com/title_in_head", :response => fixture_file("title_in_head.response"))
42
+ FakeWeb.register_uri(:get, "http://example.com/title_in_body", :response => fixture_file("title_in_body.response"))
43
+ FakeWeb.register_uri(:get, "http://example.com/title_in_h1", :response => fixture_file("title_in_h1.response"))
44
+ FakeWeb.register_uri(:get, "http://example.com/title_best_choice", :response => fixture_file("title_best_choice.response"))
45
+ FakeWeb.register_uri(:get, "http://example.com/title_in_head_with_whitespace", :response => fixture_file("title_in_head_with_whitespace.response"))
46
+
40
47
  # These are older fixtures
41
48
  FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
42
49
  FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.0
4
+ version: 4.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-20 00:00:00.000000000 Z
11
+ date: 2015-01-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -289,6 +289,11 @@ files:
289
289
  - spec/fixtures/tea-tron.com.response
290
290
  - spec/fixtures/theonion-no-description.com.response
291
291
  - spec/fixtures/theonion.com.response
292
+ - spec/fixtures/title_best_choice.response
293
+ - spec/fixtures/title_in_body.response
294
+ - spec/fixtures/title_in_h1.response
295
+ - spec/fixtures/title_in_head.response
296
+ - spec/fixtures/title_in_head_with_whitespace.response
292
297
  - spec/fixtures/twitter_markupvalidator.response
293
298
  - spec/fixtures/unsafe_facebook.com.response
294
299
  - spec/fixtures/unsafe_https.facebook.com.response