metainspector 4.2.0 → 4.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +7 -0
- data/lib/meta_inspector/document.rb +2 -1
- data/lib/meta_inspector/parser.rb +1 -1
- data/lib/meta_inspector/parsers/texts.rb +18 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/title_best_choice.response +42 -0
- data/spec/fixtures/title_in_body.response +22 -0
- data/spec/fixtures/title_in_h1.response +24 -0
- data/spec/fixtures/title_in_head.response +23 -0
- data/spec/fixtures/title_in_head_with_whitespace.response +26 -0
- data/spec/meta_inspector/texts_spec.rb +33 -1
- data/spec/spec_helper.rb +7 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bd4bfdcaa225ae095a22dc7d853daa2046e5c764
|
4
|
+
data.tar.gz: 6d7588c9732ea0a4f2e6512c92aff721874197e6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 85974aa0874dfd9e2f90c97416ceac9c9512ce4919cc8f90a2e74078d388d5e0d214db16ef13958975d110760ef88ff861cd16d0e8a654bcc00df142fa2616b9
|
7
|
+
data.tar.gz: 0274b0b000bc61fd88f08090c2ede9bed40055f7f351c522991e73289585a1948fedd4bd158a1cf07749cd6d31c8dccaac051903a684f7c341a3004471c830a8
|
data/README.md
CHANGED
@@ -8,6 +8,13 @@ You give it an URL, and it lets you easily get its title, links, images, charset
|
|
8
8
|
|
9
9
|
You can try MetaInspector live at this little demo: [https://metainspectordemo.herokuapp.com](https://metainspectordemo.herokuapp.com)
|
10
10
|
|
11
|
+
## Changes in 4.2.1
|
12
|
+
|
13
|
+
* The Document API has been extended with one new method:
|
14
|
+
|
15
|
+
* `page.best_title` returns the longest text available from a selection of candidates.
|
16
|
+
|
17
|
+
|
11
18
|
## Changes in 4.2.0
|
12
19
|
|
13
20
|
* The images API has been extended, with two new methods:
|
@@ -48,7 +48,8 @@ module MetaInspector
|
|
48
48
|
|
49
49
|
delegate [:content_type, :response] => :@request
|
50
50
|
|
51
|
-
delegate [:parsed, :title, :description, :links,
|
51
|
+
delegate [:parsed, :title, :best_title,
|
52
|
+
:description, :links,
|
52
53
|
:images, :feed, :charset, :meta_tags,
|
53
54
|
:meta_tag, :meta, :favicon] => :@parser
|
54
55
|
|
@@ -25,7 +25,7 @@ module MetaInspector
|
|
25
25
|
delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
|
26
26
|
delegate [:links, :feed, :base_url] => :@links_parser
|
27
27
|
delegate :images => :@images_parser
|
28
|
-
delegate [:title, :description] => :@texts_parser
|
28
|
+
delegate [:title, :best_title, :description] => :@texts_parser
|
29
29
|
|
30
30
|
# Returns the whole parsed document
|
31
31
|
def parsed
|
@@ -9,6 +9,24 @@ module MetaInspector
|
|
9
9
|
@title ||= parsed.css('head title').inner_text rescue nil
|
10
10
|
end
|
11
11
|
|
12
|
+
def best_title
|
13
|
+
@best_title ||= begin
|
14
|
+
candidates = [
|
15
|
+
parsed.css('head title'),
|
16
|
+
parsed.css('body title'),
|
17
|
+
meta['og:title'],
|
18
|
+
parsed.css('h1').first
|
19
|
+
]
|
20
|
+
candidates.flatten!
|
21
|
+
candidates.map! { |c| (c.respond_to? :inner_text) ? c.inner_text : c }
|
22
|
+
candidates.compact!
|
23
|
+
candidates.map! { |c| c.gsub(/\s+/, ' ') }
|
24
|
+
candidates.uniq!
|
25
|
+
candidates.sort_by! { |t| -t.length }
|
26
|
+
candidates.first.strip
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
12
30
|
# A description getter that first checks for a meta description
|
13
31
|
# and if not present will guess by looking at the first paragraph
|
14
32
|
# with more than 120 characters
|
@@ -0,0 +1,42 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 695
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head>
|
18
|
+
<title>This title
|
19
|
+
|
20
|
+
is in
|
21
|
+
|
22
|
+
|
23
|
+
the head
|
24
|
+
|
25
|
+
and has blank lines in it, making it artificially long
|
26
|
+
|
27
|
+
|
28
|
+
</title>
|
29
|
+
|
30
|
+
<meta property="og:title" content="This OG title is long, but not long enough" />
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
</head>
|
35
|
+
<body>
|
36
|
+
<title>This title is short</title>
|
37
|
+
<h1>This title came from the first h1 and should be the longest of them all, so should be chosen</h1>
|
38
|
+
<h1>This came from the second h1 and should be ignored</h1>
|
39
|
+
<h1>This came from the third h1 and should also be ignored</h1>
|
40
|
+
<p>A sample page with many types of meta tags</p>
|
41
|
+
</body>
|
42
|
+
</html>
|
@@ -0,0 +1,22 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 265
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head></head>
|
18
|
+
<body>
|
19
|
+
<title>This title came from the body, not the head</title>
|
20
|
+
<p>A sample page with many types of meta tags</p>
|
21
|
+
</body>
|
22
|
+
</html>
|
@@ -0,0 +1,24 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 381
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head></head>
|
18
|
+
<body>
|
19
|
+
<h1>This title came from the first h1</h1>
|
20
|
+
<h1>This came from the second h1 and should be ignored</h1>
|
21
|
+
<h1>This came from the third h1 and should also be ignored</h1>
|
22
|
+
<p>A sample page with many types of meta tags</p>
|
23
|
+
</body>
|
24
|
+
</html>
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 265
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head>
|
18
|
+
<title>This title came from the head</title>
|
19
|
+
</head>
|
20
|
+
<body>
|
21
|
+
<p>A sample page with many types of meta tags</p>
|
22
|
+
</body>
|
23
|
+
</html>
|
@@ -0,0 +1,26 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 265
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head>
|
18
|
+
<title> This title came from the head and has leading and trailing whitespace
|
19
|
+
|
20
|
+
|
21
|
+
</title>
|
22
|
+
</head>
|
23
|
+
<body>
|
24
|
+
<p>A sample page with many types of meta tags</p>
|
25
|
+
</body>
|
26
|
+
</html>
|
@@ -3,10 +3,42 @@ require 'spec_helper'
|
|
3
3
|
describe MetaInspector do
|
4
4
|
it "should get the title from the head section" do
|
5
5
|
page = MetaInspector.new('http://example.com')
|
6
|
-
|
7
6
|
page.title.should == 'An example page'
|
8
7
|
end
|
9
8
|
|
9
|
+
describe '#best_title' do
|
10
|
+
it "should find 'head title' when that's the only thing" do
|
11
|
+
page = MetaInspector.new('http://example.com/title_in_head')
|
12
|
+
page.best_title.should == 'This title came from the head'
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should find 'body title' when that's the only thing" do
|
16
|
+
page = MetaInspector.new('http://example.com/title_in_body')
|
17
|
+
page.best_title.should == 'This title came from the body, not the head'
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should find 'og:title' when that's the only thing" do
|
21
|
+
page = MetaInspector.new('http://example.com/meta-tags')
|
22
|
+
page.best_title.should == 'An OG title'
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should find the first <h1> when that's the only thing" do
|
26
|
+
page = MetaInspector.new('http://example.com/title_in_h1')
|
27
|
+
page.best_title.should == 'This title came from the first h1'
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should choose the longest candidate from the available options" do
|
31
|
+
page = MetaInspector.new('http://example.com/title_best_choice')
|
32
|
+
page.best_title.should == 'This title came from the first h1 and should be the longest of them all, so should be chosen'
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should strip leading and trailing whitespace and all line breaks" do
|
36
|
+
page = MetaInspector.new('http://example.com/title_in_head_with_whitespace')
|
37
|
+
page.best_title.should == 'This title came from the head and has leading and trailing whitespace'
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
10
42
|
describe '#description' do
|
11
43
|
it "should find description from meta description" do
|
12
44
|
page = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
data/spec/spec_helper.rb
CHANGED
@@ -37,6 +37,13 @@ FakeWeb.register_uri(:get, "http://example.com/largest_image_using_image_size",
|
|
37
37
|
FakeWeb.register_uri(:get, "http://example.com/10x10", :response => fixture_file("10x10.jpg.response"))
|
38
38
|
FakeWeb.register_uri(:get, "http://example.com/100x100", :response => fixture_file("100x100.jpg.response"))
|
39
39
|
|
40
|
+
# Used to test best_title logic
|
41
|
+
FakeWeb.register_uri(:get, "http://example.com/title_in_head", :response => fixture_file("title_in_head.response"))
|
42
|
+
FakeWeb.register_uri(:get, "http://example.com/title_in_body", :response => fixture_file("title_in_body.response"))
|
43
|
+
FakeWeb.register_uri(:get, "http://example.com/title_in_h1", :response => fixture_file("title_in_h1.response"))
|
44
|
+
FakeWeb.register_uri(:get, "http://example.com/title_best_choice", :response => fixture_file("title_best_choice.response"))
|
45
|
+
FakeWeb.register_uri(:get, "http://example.com/title_in_head_with_whitespace", :response => fixture_file("title_in_head_with_whitespace.response"))
|
46
|
+
|
40
47
|
# These are older fixtures
|
41
48
|
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
42
49
|
FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.2.0
|
4
|
+
version: 4.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -289,6 +289,11 @@ files:
|
|
289
289
|
- spec/fixtures/tea-tron.com.response
|
290
290
|
- spec/fixtures/theonion-no-description.com.response
|
291
291
|
- spec/fixtures/theonion.com.response
|
292
|
+
- spec/fixtures/title_best_choice.response
|
293
|
+
- spec/fixtures/title_in_body.response
|
294
|
+
- spec/fixtures/title_in_h1.response
|
295
|
+
- spec/fixtures/title_in_head.response
|
296
|
+
- spec/fixtures/title_in_head_with_whitespace.response
|
292
297
|
- spec/fixtures/twitter_markupvalidator.response
|
293
298
|
- spec/fixtures/unsafe_facebook.com.response
|
294
299
|
- spec/fixtures/unsafe_https.facebook.com.response
|