RubyGems - metainspector - Versions diffs - 4.2.0 → 4.2.1 - Mend

metainspector 4.2.0 → 4.2.1

Files changed (14) hide show

checksums.yaml +4 -4
data/README.md +7 -0
data/lib/meta_inspector/document.rb +2 -1
data/lib/meta_inspector/parser.rb +1 -1
data/lib/meta_inspector/parsers/texts.rb +18 -0
data/lib/meta_inspector/version.rb +1 -1
data/spec/fixtures/title_best_choice.response +42 -0
data/spec/fixtures/title_in_body.response +22 -0
data/spec/fixtures/title_in_h1.response +24 -0
data/spec/fixtures/title_in_head.response +23 -0
data/spec/fixtures/title_in_head_with_whitespace.response +26 -0
data/spec/meta_inspector/texts_spec.rb +33 -1
data/spec/spec_helper.rb +7 -0
metadata +7 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: bbcf96088ef49b859442dfd0244bda8b7e4870fb
-  data.tar.gz: 3f87e155f4e1d260f6eff96867b458278a8a450b
+  metadata.gz: bd4bfdcaa225ae095a22dc7d853daa2046e5c764
+  data.tar.gz: 6d7588c9732ea0a4f2e6512c92aff721874197e6
 SHA512:
-  metadata.gz: 005a2f07c88b2ca40bcf970ef7eabe2eb0b40d76bb1556a67a0bfbbcc170ab6be1836942dd841d97cd9a110fb25dcd8cd6e3aeb3f164cd2f3dcc020bb7708d27
-  data.tar.gz: 26ef69520abd2564e431a22dd3e6d139263e7163a250478c8a70dd41d3fadbfb22a3828757d645192d763d7ea0e2c5298b53d874a3f999b03ff1d68451ad422b
+  metadata.gz: 85974aa0874dfd9e2f90c97416ceac9c9512ce4919cc8f90a2e74078d388d5e0d214db16ef13958975d110760ef88ff861cd16d0e8a654bcc00df142fa2616b9
+  data.tar.gz: 0274b0b000bc61fd88f08090c2ede9bed40055f7f351c522991e73289585a1948fedd4bd158a1cf07749cd6d31c8dccaac051903a684f7c341a3004471c830a8

data/README.md CHANGED Viewed

@@ -8,6 +8,13 @@ You give it an URL, and it lets you easily get its title, links, images, charset
 You can try MetaInspector live at this little demo: [https://metainspectordemo.herokuapp.com](https://metainspectordemo.herokuapp.com)
+## Changes in 4.2.1
+* The Document API has been extended with one new method:
+  * `page.best_title` returns the longest text available from a selection of candidates.
 ## Changes in 4.2.0
 * The images API has been extended, with two new methods:

data/lib/meta_inspector/document.rb CHANGED Viewed

@@ -48,7 +48,8 @@ module MetaInspector
     delegate [:content_type, :response]               => :@request
-    delegate [:parsed, :title, :description, :links,
+    delegate [:parsed, :title, :best_title,
+              :description, :links,
               :images, :feed, :charset, :meta_tags,
               :meta_tag, :meta, :favicon]             => :@parser

data/lib/meta_inspector/parser.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module MetaInspector
     delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
     delegate [:links, :feed, :base_url]               => :@links_parser
     delegate :images                                  => :@images_parser
-    delegate [:title, :description]                   => :@texts_parser
+    delegate [:title, :best_title, :description]      => :@texts_parser
     # Returns the whole parsed document
     def parsed

data/lib/meta_inspector/parsers/texts.rb CHANGED Viewed

@@ -9,6 +9,24 @@ module MetaInspector
         @title ||= parsed.css('head title').inner_text rescue nil
       end
+      def best_title
+        @best_title ||= begin
+          candidates = [
+              parsed.css('head title'),
+              parsed.css('body title'),
+              meta['og:title'],
+              parsed.css('h1').first
+          ]
+          candidates.flatten!
+          candidates.map! { |c| (c.respond_to? :inner_text) ? c.inner_text : c }
+          candidates.compact!
+          candidates.map! { |c| c.gsub(/\s+/, ' ') }
+          candidates.uniq!
+          candidates.sort_by! { |t| -t.length }
+          candidates.first.strip
+        end
+      end
       # A description getter that first checks for a meta description
       # and if not present will guess by looking at the first paragraph
       # with more than 120 characters

data/lib/meta_inspector/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module MetaInspector
-  VERSION = "4.2.0"
+  VERSION = "4.2.1"
 end

data/spec/fixtures/title_best_choice.response ADDED Viewed

@@ -0,0 +1,42 @@
+HTTP/1.1 200 OK
+Age: 13
+Cache-Control: max-age=120
+Content-Type: text/html
+Date: Mon, 06 Jan 2014 12:47:42 GMT
+Expires: Mon, 06 Jan 2014 12:49:28 GMT
+Server: Apache/2.2.14 (Ubuntu)
+Vary: Accept-Encoding
+Via: 1.1 varnish
+X-Powered-By: PHP/5.3.2-1ubuntu4.22
+X-Varnish: 1188792404 1188790413
+Content-Length: 695
+Connection: keep-alive
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
+  <head>
+    <title>This title
+    is in
+    the head
+    and has blank lines in it, making it artificially long
+    </title>
+    <meta property="og:title" content="This OG title is long, but not long enough" />
+  </head>
+  <body>
+    <title>This title is short</title>
+    <h1>This title came from the first h1 and should be the longest of them all, so should be chosen</h1>
+    <h1>This came from the second h1 and should be ignored</h1>
+    <h1>This came from the third h1 and should also be ignored</h1>
+    <p>A sample page with many types of meta tags</p>
+  </body>
+</html>

data/spec/fixtures/title_in_body.response ADDED Viewed

@@ -0,0 +1,22 @@
+HTTP/1.1 200 OK
+Age: 13
+Cache-Control: max-age=120
+Content-Type: text/html
+Date: Mon, 06 Jan 2014 12:47:42 GMT
+Expires: Mon, 06 Jan 2014 12:49:28 GMT
+Server: Apache/2.2.14 (Ubuntu)
+Vary: Accept-Encoding
+Via: 1.1 varnish
+X-Powered-By: PHP/5.3.2-1ubuntu4.22
+X-Varnish: 1188792404 1188790413
+Content-Length: 265
+Connection: keep-alive
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
+  <head></head>
+  <body>
+    <title>This title came from the body, not the head</title>
+    <p>A sample page with many types of meta tags</p>
+  </body>
+</html>

data/spec/fixtures/title_in_h1.response ADDED Viewed

@@ -0,0 +1,24 @@
+HTTP/1.1 200 OK
+Age: 13
+Cache-Control: max-age=120
+Content-Type: text/html
+Date: Mon, 06 Jan 2014 12:47:42 GMT
+Expires: Mon, 06 Jan 2014 12:49:28 GMT
+Server: Apache/2.2.14 (Ubuntu)
+Vary: Accept-Encoding
+Via: 1.1 varnish
+X-Powered-By: PHP/5.3.2-1ubuntu4.22
+X-Varnish: 1188792404 1188790413
+Content-Length: 381
+Connection: keep-alive
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
+  <head></head>
+  <body>
+    <h1>This title came from the first h1</h1>
+    <h1>This came from the second h1 and should be ignored</h1>
+    <h1>This came from the third h1 and should also be ignored</h1>
+    <p>A sample page with many types of meta tags</p>
+  </body>
+</html>

data/spec/fixtures/title_in_head.response ADDED Viewed

@@ -0,0 +1,23 @@
+HTTP/1.1 200 OK
+Age: 13
+Cache-Control: max-age=120
+Content-Type: text/html
+Date: Mon, 06 Jan 2014 12:47:42 GMT
+Expires: Mon, 06 Jan 2014 12:49:28 GMT
+Server: Apache/2.2.14 (Ubuntu)
+Vary: Accept-Encoding
+Via: 1.1 varnish
+X-Powered-By: PHP/5.3.2-1ubuntu4.22
+X-Varnish: 1188792404 1188790413
+Content-Length: 265
+Connection: keep-alive
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
+  <head>
+    <title>This title came from the head</title>
+  </head>
+  <body>
+    <p>A sample page with many types of meta tags</p>
+  </body>
+</html>

data/spec/fixtures/title_in_head_with_whitespace.response ADDED Viewed

@@ -0,0 +1,26 @@
+HTTP/1.1 200 OK
+Age: 13
+Cache-Control: max-age=120
+Content-Type: text/html
+Date: Mon, 06 Jan 2014 12:47:42 GMT
+Expires: Mon, 06 Jan 2014 12:49:28 GMT
+Server: Apache/2.2.14 (Ubuntu)
+Vary: Accept-Encoding
+Via: 1.1 varnish
+X-Powered-By: PHP/5.3.2-1ubuntu4.22
+X-Varnish: 1188792404 1188790413
+Content-Length: 265
+Connection: keep-alive
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
+  <head>
+    <title>       This title came from the head and has leading and trailing whitespace
+    </title>
+  </head>
+  <body>
+    <p>A sample page with many types of meta tags</p>
+  </body>
+</html>

data/spec/meta_inspector/texts_spec.rb CHANGED Viewed

@@ -3,10 +3,42 @@ require 'spec_helper'
 describe MetaInspector do
   it "should get the title from the head section" do
     page = MetaInspector.new('http://example.com')
     page.title.should == 'An example page'
   end
+  describe '#best_title' do
+    it "should find 'head title' when that's the only thing" do
+      page = MetaInspector.new('http://example.com/title_in_head')
+      page.best_title.should == 'This title came from the head'
+    end
+    it "should find 'body title' when that's the only thing" do
+      page = MetaInspector.new('http://example.com/title_in_body')
+      page.best_title.should == 'This title came from the body, not the head'
+    end
+    it "should find 'og:title' when that's the only thing" do
+      page = MetaInspector.new('http://example.com/meta-tags')
+      page.best_title.should == 'An OG title'
+    end
+    it "should find the first <h1> when that's the only thing" do
+      page = MetaInspector.new('http://example.com/title_in_h1')
+      page.best_title.should == 'This title came from the first h1'
+    end
+    it "should choose the longest candidate from the available options" do
+      page = MetaInspector.new('http://example.com/title_best_choice')
+      page.best_title.should == 'This title came from the first h1 and should be the longest of them all, so should be chosen'
+    end
+    it "should strip leading and trailing whitespace and all line breaks" do
+      page = MetaInspector.new('http://example.com/title_in_head_with_whitespace')
+      page.best_title.should == 'This title came from the head and has leading and trailing whitespace'
+    end
+  end
   describe '#description' do
     it "should find description from meta description" do
       page = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')

data/spec/spec_helper.rb CHANGED Viewed

@@ -37,6 +37,13 @@ FakeWeb.register_uri(:get, "http://example.com/largest_image_using_image_size",
 FakeWeb.register_uri(:get, "http://example.com/10x10", :response => fixture_file("10x10.jpg.response"))
 FakeWeb.register_uri(:get, "http://example.com/100x100", :response => fixture_file("100x100.jpg.response"))
+# Used to test best_title logic
+FakeWeb.register_uri(:get, "http://example.com/title_in_head", :response => fixture_file("title_in_head.response"))
+FakeWeb.register_uri(:get, "http://example.com/title_in_body", :response => fixture_file("title_in_body.response"))
+FakeWeb.register_uri(:get, "http://example.com/title_in_h1", :response => fixture_file("title_in_h1.response"))
+FakeWeb.register_uri(:get, "http://example.com/title_best_choice", :response => fixture_file("title_best_choice.response"))
+FakeWeb.register_uri(:get, "http://example.com/title_in_head_with_whitespace", :response => fixture_file("title_in_head_with_whitespace.response"))
 # These are older fixtures
 FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
 FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: metainspector
 version: !ruby/object:Gem::Version
-  version: 4.2.0
+  version: 4.2.1
 platform: ruby
 authors:
 - Jaime Iniesta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-20 00:00:00.000000000 Z
+date: 2015-01-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -289,6 +289,11 @@ files:
 - spec/fixtures/tea-tron.com.response
 - spec/fixtures/theonion-no-description.com.response
 - spec/fixtures/theonion.com.response
+- spec/fixtures/title_best_choice.response
+- spec/fixtures/title_in_body.response
+- spec/fixtures/title_in_h1.response
+- spec/fixtures/title_in_head.response
+- spec/fixtures/title_in_head_with_whitespace.response
 - spec/fixtures/twitter_markupvalidator.response
 - spec/fixtures/unsafe_facebook.com.response
 - spec/fixtures/unsafe_https.facebook.com.response