metainspector 4.2.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bbcf96088ef49b859442dfd0244bda8b7e4870fb
4
- data.tar.gz: 3f87e155f4e1d260f6eff96867b458278a8a450b
3
+ metadata.gz: bd4bfdcaa225ae095a22dc7d853daa2046e5c764
4
+ data.tar.gz: 6d7588c9732ea0a4f2e6512c92aff721874197e6
5
5
  SHA512:
6
- metadata.gz: 005a2f07c88b2ca40bcf970ef7eabe2eb0b40d76bb1556a67a0bfbbcc170ab6be1836942dd841d97cd9a110fb25dcd8cd6e3aeb3f164cd2f3dcc020bb7708d27
7
- data.tar.gz: 26ef69520abd2564e431a22dd3e6d139263e7163a250478c8a70dd41d3fadbfb22a3828757d645192d763d7ea0e2c5298b53d874a3f999b03ff1d68451ad422b
6
+ metadata.gz: 85974aa0874dfd9e2f90c97416ceac9c9512ce4919cc8f90a2e74078d388d5e0d214db16ef13958975d110760ef88ff861cd16d0e8a654bcc00df142fa2616b9
7
+ data.tar.gz: 0274b0b000bc61fd88f08090c2ede9bed40055f7f351c522991e73289585a1948fedd4bd158a1cf07749cd6d31c8dccaac051903a684f7c341a3004471c830a8
data/README.md CHANGED
@@ -8,6 +8,13 @@ You give it an URL, and it lets you easily get its title, links, images, charset
8
8
 
9
9
  You can try MetaInspector live at this little demo: [https://metainspectordemo.herokuapp.com](https://metainspectordemo.herokuapp.com)
10
10
 
11
+ ## Changes in 4.2.1
12
+
13
+ * The Document API has been extended with one new method:
14
+
15
+ * `page.best_title` returns the longest text available from a selection of candidates.
16
+
17
+
11
18
  ## Changes in 4.2.0
12
19
 
13
20
  * The images API has been extended, with two new methods:
@@ -48,7 +48,8 @@ module MetaInspector
48
48
 
49
49
  delegate [:content_type, :response] => :@request
50
50
 
51
- delegate [:parsed, :title, :description, :links,
51
+ delegate [:parsed, :title, :best_title,
52
+ :description, :links,
52
53
  :images, :feed, :charset, :meta_tags,
53
54
  :meta_tag, :meta, :favicon] => :@parser
54
55
 
@@ -25,7 +25,7 @@ module MetaInspector
25
25
  delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
26
26
  delegate [:links, :feed, :base_url] => :@links_parser
27
27
  delegate :images => :@images_parser
28
- delegate [:title, :description] => :@texts_parser
28
+ delegate [:title, :best_title, :description] => :@texts_parser
29
29
 
30
30
  # Returns the whole parsed document
31
31
  def parsed
@@ -9,6 +9,24 @@ module MetaInspector
9
9
  @title ||= parsed.css('head title').inner_text rescue nil
10
10
  end
11
11
 
12
# Returns the longest title-like text found in the document, or nil when
# none of the sources below yields anything.
#
# Sources considered: the results of css('head title') and
# css('body title'), the 'og:title' entry of +meta+, and the first result
# of css('h1'). Every candidate has its runs of whitespace (line breaks
# included) collapsed to a single space and its surrounding whitespace
# stripped *before* lengths are compared, so padding around a short title
# cannot make it outrank a genuinely longer one.
def best_title
  @best_title ||= begin
    candidates = [
      parsed.css('head title'),
      parsed.css('body title'),
      meta['og:title'],
      parsed.css('h1').first
    ].flatten.compact # compact drops the sources that were missing (nil)

    texts = candidates.map do |candidate|
      # Node-like objects expose their text through inner_text; the
      # og:title value is used as-is.
      text = candidate.respond_to?(:inner_text) ? candidate.inner_text : candidate
      text.gsub(/\s+/, ' ').strip
    end

    # max_by returns nil for an empty list, where the previous
    # `candidates.first.strip` raised NoMethodError.
    texts.max_by(&:length)
  end
end
29
+
12
30
  # A description getter that first checks for a meta description
13
31
  # and if not present will guess by looking at the first paragraph
14
32
  # with more than 120 characters
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = "4.2.0"
2
+ VERSION = "4.2.1"
3
3
  end
@@ -0,0 +1,42 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 695
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head>
18
+ <title>This title
19
+
20
+ is in
21
+
22
+
23
+ the head
24
+
25
+ and has blank lines in it, making it artificially long
26
+
27
+
28
+ </title>
29
+
30
+ <meta property="og:title" content="This OG title is long, but not long enough" />
31
+
32
+
33
+
34
+ </head>
35
+ <body>
36
+ <title>This title is short</title>
37
+ <h1>This title came from the first h1 and should be the longest of them all, so should be chosen</h1>
38
+ <h1>This came from the second h1 and should be ignored</h1>
39
+ <h1>This came from the third h1 and should also be ignored</h1>
40
+ <p>A sample page with many types of meta tags</p>
41
+ </body>
42
+ </html>
@@ -0,0 +1,22 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 265
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head></head>
18
+ <body>
19
+ <title>This title came from the body, not the head</title>
20
+ <p>A sample page with many types of meta tags</p>
21
+ </body>
22
+ </html>
@@ -0,0 +1,24 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 381
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head></head>
18
+ <body>
19
+ <h1>This title came from the first h1</h1>
20
+ <h1>This came from the second h1 and should be ignored</h1>
21
+ <h1>This came from the third h1 and should also be ignored</h1>
22
+ <p>A sample page with many types of meta tags</p>
23
+ </body>
24
+ </html>
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 265
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head>
18
+ <title>This title came from the head</title>
19
+ </head>
20
+ <body>
21
+ <p>A sample page with many types of meta tags</p>
22
+ </body>
23
+ </html>
@@ -0,0 +1,26 @@
1
+ HTTP/1.1 200 OK
2
+ Age: 13
3
+ Cache-Control: max-age=120
4
+ Content-Type: text/html
5
+ Date: Mon, 06 Jan 2014 12:47:42 GMT
6
+ Expires: Mon, 06 Jan 2014 12:49:28 GMT
7
+ Server: Apache/2.2.14 (Ubuntu)
8
+ Vary: Accept-Encoding
9
+ Via: 1.1 varnish
10
+ X-Powered-By: PHP/5.3.2-1ubuntu4.22
11
+ X-Varnish: 1188792404 1188790413
12
+ Content-Length: 265
13
+ Connection: keep-alive
14
+
15
+ <!DOCTYPE html>
16
+ <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
17
+ <head>
18
+ <title> This title came from the head and has leading and trailing whitespace
19
+
20
+
21
+ </title>
22
+ </head>
23
+ <body>
24
+ <p>A sample page with many types of meta tags</p>
25
+ </body>
26
+ </html>
@@ -3,10 +3,42 @@ require 'spec_helper'
3
3
  describe MetaInspector do
4
4
  it "should get the title from the head section" do
5
5
  page = MetaInspector.new('http://example.com')
6
-
7
6
  page.title.should == 'An example page'
8
7
  end
9
8
 
9
+ describe '#best_title' do
10
+ it "should find 'head title' when that's the only thing" do
11
+ page = MetaInspector.new('http://example.com/title_in_head')
12
+ page.best_title.should == 'This title came from the head'
13
+ end
14
+
15
+ it "should find 'body title' when that's the only thing" do
16
+ page = MetaInspector.new('http://example.com/title_in_body')
17
+ page.best_title.should == 'This title came from the body, not the head'
18
+ end
19
+
20
+ it "should find 'og:title' when that's the only thing" do
21
+ page = MetaInspector.new('http://example.com/meta-tags')
22
+ page.best_title.should == 'An OG title'
23
+ end
24
+
25
+ it "should find the first <h1> when that's the only thing" do
26
+ page = MetaInspector.new('http://example.com/title_in_h1')
27
+ page.best_title.should == 'This title came from the first h1'
28
+ end
29
+
30
+ it "should choose the longest candidate from the available options" do
31
+ page = MetaInspector.new('http://example.com/title_best_choice')
32
+ page.best_title.should == 'This title came from the first h1 and should be the longest of them all, so should be chosen'
33
+ end
34
+
35
+ it "should strip leading and trailing whitespace and all line breaks" do
36
+ page = MetaInspector.new('http://example.com/title_in_head_with_whitespace')
37
+ page.best_title.should == 'This title came from the head and has leading and trailing whitespace'
38
+ end
39
+
40
+ end
41
+
10
42
  describe '#description' do
11
43
  it "should find description from meta description" do
12
44
  page = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
data/spec/spec_helper.rb CHANGED
@@ -37,6 +37,13 @@ FakeWeb.register_uri(:get, "http://example.com/largest_image_using_image_size",
37
37
  FakeWeb.register_uri(:get, "http://example.com/10x10", :response => fixture_file("10x10.jpg.response"))
38
38
  FakeWeb.register_uri(:get, "http://example.com/100x100", :response => fixture_file("100x100.jpg.response"))
39
39
 
40
+ # Used to test best_title logic
41
+ FakeWeb.register_uri(:get, "http://example.com/title_in_head", :response => fixture_file("title_in_head.response"))
42
+ FakeWeb.register_uri(:get, "http://example.com/title_in_body", :response => fixture_file("title_in_body.response"))
43
+ FakeWeb.register_uri(:get, "http://example.com/title_in_h1", :response => fixture_file("title_in_h1.response"))
44
+ FakeWeb.register_uri(:get, "http://example.com/title_best_choice", :response => fixture_file("title_best_choice.response"))
45
+ FakeWeb.register_uri(:get, "http://example.com/title_in_head_with_whitespace", :response => fixture_file("title_in_head_with_whitespace.response"))
46
+
40
47
  # These are older fixtures
41
48
  FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
42
49
  FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.0
4
+ version: 4.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-20 00:00:00.000000000 Z
11
+ date: 2015-01-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -289,6 +289,11 @@ files:
289
289
  - spec/fixtures/tea-tron.com.response
290
290
  - spec/fixtures/theonion-no-description.com.response
291
291
  - spec/fixtures/theonion.com.response
292
+ - spec/fixtures/title_best_choice.response
293
+ - spec/fixtures/title_in_body.response
294
+ - spec/fixtures/title_in_h1.response
295
+ - spec/fixtures/title_in_head.response
296
+ - spec/fixtures/title_in_head_with_whitespace.response
292
297
  - spec/fixtures/twitter_markupvalidator.response
293
298
  - spec/fixtures/unsafe_facebook.com.response
294
299
  - spec/fixtures/unsafe_https.facebook.com.response