metainspector 4.2.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +7 -0
- data/lib/meta_inspector/document.rb +2 -1
- data/lib/meta_inspector/parser.rb +1 -1
- data/lib/meta_inspector/parsers/texts.rb +18 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/title_best_choice.response +42 -0
- data/spec/fixtures/title_in_body.response +22 -0
- data/spec/fixtures/title_in_h1.response +24 -0
- data/spec/fixtures/title_in_head.response +23 -0
- data/spec/fixtures/title_in_head_with_whitespace.response +26 -0
- data/spec/meta_inspector/texts_spec.rb +33 -1
- data/spec/spec_helper.rb +7 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bd4bfdcaa225ae095a22dc7d853daa2046e5c764
|
4
|
+
data.tar.gz: 6d7588c9732ea0a4f2e6512c92aff721874197e6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 85974aa0874dfd9e2f90c97416ceac9c9512ce4919cc8f90a2e74078d388d5e0d214db16ef13958975d110760ef88ff861cd16d0e8a654bcc00df142fa2616b9
|
7
|
+
data.tar.gz: 0274b0b000bc61fd88f08090c2ede9bed40055f7f351c522991e73289585a1948fedd4bd158a1cf07749cd6d31c8dccaac051903a684f7c341a3004471c830a8
|
data/README.md
CHANGED
@@ -8,6 +8,13 @@ You give it an URL, and it lets you easily get its title, links, images, charset
|
|
8
8
|
|
9
9
|
You can try MetaInspector live at this little demo: [https://metainspectordemo.herokuapp.com](https://metainspectordemo.herokuapp.com)
|
10
10
|
|
11
|
+
## Changes in 4.2.1
|
12
|
+
|
13
|
+
* The Document API has been extended with one new method:
|
14
|
+
|
15
|
+
* `page.best_title` returns the longest text available from a selection of candidates.
|
16
|
+
|
17
|
+
|
11
18
|
## Changes in 4.2.0
|
12
19
|
|
13
20
|
* The images API has been extended, with two new methods:
|
@@ -48,7 +48,8 @@ module MetaInspector
|
|
48
48
|
|
49
49
|
delegate [:content_type, :response] => :@request
|
50
50
|
|
51
|
-
delegate [:parsed, :title, :description, :links,
|
51
|
+
delegate [:parsed, :title, :best_title,
|
52
|
+
:description, :links,
|
52
53
|
:images, :feed, :charset, :meta_tags,
|
53
54
|
:meta_tag, :meta, :favicon] => :@parser
|
54
55
|
|
@@ -25,7 +25,7 @@ module MetaInspector
|
|
25
25
|
delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
|
26
26
|
delegate [:links, :feed, :base_url] => :@links_parser
|
27
27
|
delegate :images => :@images_parser
|
28
|
-
delegate [:title, :description] => :@texts_parser
|
28
|
+
delegate [:title, :best_title, :description] => :@texts_parser
|
29
29
|
|
30
30
|
# Returns the whole parsed document
|
31
31
|
def parsed
|
@@ -9,6 +9,24 @@ module MetaInspector
|
|
9
9
|
@title ||= parsed.css('head title').inner_text rescue nil
|
10
10
|
end
|
11
11
|
|
12
|
+
def best_title
|
13
|
+
@best_title ||= begin
|
14
|
+
candidates = [
|
15
|
+
parsed.css('head title'),
|
16
|
+
parsed.css('body title'),
|
17
|
+
meta['og:title'],
|
18
|
+
parsed.css('h1').first
|
19
|
+
]
|
20
|
+
candidates.flatten!
|
21
|
+
candidates.map! { |c| (c.respond_to? :inner_text) ? c.inner_text : c }
|
22
|
+
candidates.compact!
|
23
|
+
candidates.map! { |c| c.gsub(/\s+/, ' ') }
|
24
|
+
candidates.uniq!
|
25
|
+
candidates.sort_by! { |t| -t.length }
|
26
|
+
candidates.first.strip
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
12
30
|
# A description getter that first checks for a meta description
|
13
31
|
# and if not present will guess by looking at the first paragraph
|
14
32
|
# with more than 120 characters
|
@@ -0,0 +1,42 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 695
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head>
|
18
|
+
<title>This title
|
19
|
+
|
20
|
+
is in
|
21
|
+
|
22
|
+
|
23
|
+
the head
|
24
|
+
|
25
|
+
and has blank lines in it, making it artificially long
|
26
|
+
|
27
|
+
|
28
|
+
</title>
|
29
|
+
|
30
|
+
<meta property="og:title" content="This OG title is long, but not long enough" />
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
</head>
|
35
|
+
<body>
|
36
|
+
<title>This title is short</title>
|
37
|
+
<h1>This title came from the first h1 and should be the longest of them all, so should be chosen</h1>
|
38
|
+
<h1>This came from the second h1 and should be ignored</h1>
|
39
|
+
<h1>This came from the third h1 and should also be ignored</h1>
|
40
|
+
<p>A sample page with many types of meta tags</p>
|
41
|
+
</body>
|
42
|
+
</html>
|
@@ -0,0 +1,22 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 265
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head></head>
|
18
|
+
<body>
|
19
|
+
<title>This title came from the body, not the head</title>
|
20
|
+
<p>A sample page with many types of meta tags</p>
|
21
|
+
</body>
|
22
|
+
</html>
|
@@ -0,0 +1,24 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 381
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head></head>
|
18
|
+
<body>
|
19
|
+
<h1>This title came from the first h1</h1>
|
20
|
+
<h1>This came from the second h1 and should be ignored</h1>
|
21
|
+
<h1>This came from the third h1 and should also be ignored</h1>
|
22
|
+
<p>A sample page with many types of meta tags</p>
|
23
|
+
</body>
|
24
|
+
</html>
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 265
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head>
|
18
|
+
<title>This title came from the head</title>
|
19
|
+
</head>
|
20
|
+
<body>
|
21
|
+
<p>A sample page with many types of meta tags</p>
|
22
|
+
</body>
|
23
|
+
</html>
|
@@ -0,0 +1,26 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Age: 13
|
3
|
+
Cache-Control: max-age=120
|
4
|
+
Content-Type: text/html
|
5
|
+
Date: Mon, 06 Jan 2014 12:47:42 GMT
|
6
|
+
Expires: Mon, 06 Jan 2014 12:49:28 GMT
|
7
|
+
Server: Apache/2.2.14 (Ubuntu)
|
8
|
+
Vary: Accept-Encoding
|
9
|
+
Via: 1.1 varnish
|
10
|
+
X-Powered-By: PHP/5.3.2-1ubuntu4.22
|
11
|
+
X-Varnish: 1188792404 1188790413
|
12
|
+
Content-Length: 265
|
13
|
+
Connection: keep-alive
|
14
|
+
|
15
|
+
<!DOCTYPE html>
|
16
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
|
17
|
+
<head>
|
18
|
+
<title> This title came from the head and has leading and trailing whitespace
|
19
|
+
|
20
|
+
|
21
|
+
</title>
|
22
|
+
</head>
|
23
|
+
<body>
|
24
|
+
<p>A sample page with many types of meta tags</p>
|
25
|
+
</body>
|
26
|
+
</html>
|
@@ -3,10 +3,42 @@ require 'spec_helper'
|
|
3
3
|
describe MetaInspector do
|
4
4
|
it "should get the title from the head section" do
|
5
5
|
page = MetaInspector.new('http://example.com')
|
6
|
-
|
7
6
|
page.title.should == 'An example page'
|
8
7
|
end
|
9
8
|
|
9
|
+
describe '#best_title' do
|
10
|
+
it "should find 'head title' when that's the only thing" do
|
11
|
+
page = MetaInspector.new('http://example.com/title_in_head')
|
12
|
+
page.best_title.should == 'This title came from the head'
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should find 'body title' when that's the only thing" do
|
16
|
+
page = MetaInspector.new('http://example.com/title_in_body')
|
17
|
+
page.best_title.should == 'This title came from the body, not the head'
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should find 'og:title' when that's the only thing" do
|
21
|
+
page = MetaInspector.new('http://example.com/meta-tags')
|
22
|
+
page.best_title.should == 'An OG title'
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should find the first <h1> when that's the only thing" do
|
26
|
+
page = MetaInspector.new('http://example.com/title_in_h1')
|
27
|
+
page.best_title.should == 'This title came from the first h1'
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should choose the longest candidate from the available options" do
|
31
|
+
page = MetaInspector.new('http://example.com/title_best_choice')
|
32
|
+
page.best_title.should == 'This title came from the first h1 and should be the longest of them all, so should be chosen'
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should strip leading and trailing whitespace and all line breaks" do
|
36
|
+
page = MetaInspector.new('http://example.com/title_in_head_with_whitespace')
|
37
|
+
page.best_title.should == 'This title came from the head and has leading and trailing whitespace'
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
10
42
|
describe '#description' do
|
11
43
|
it "should find description from meta description" do
|
12
44
|
page = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
data/spec/spec_helper.rb
CHANGED
@@ -37,6 +37,13 @@ FakeWeb.register_uri(:get, "http://example.com/largest_image_using_image_size",
|
|
37
37
|
FakeWeb.register_uri(:get, "http://example.com/10x10", :response => fixture_file("10x10.jpg.response"))
|
38
38
|
FakeWeb.register_uri(:get, "http://example.com/100x100", :response => fixture_file("100x100.jpg.response"))
|
39
39
|
|
40
|
+
# Used to test best_title logic
|
41
|
+
FakeWeb.register_uri(:get, "http://example.com/title_in_head", :response => fixture_file("title_in_head.response"))
|
42
|
+
FakeWeb.register_uri(:get, "http://example.com/title_in_body", :response => fixture_file("title_in_body.response"))
|
43
|
+
FakeWeb.register_uri(:get, "http://example.com/title_in_h1", :response => fixture_file("title_in_h1.response"))
|
44
|
+
FakeWeb.register_uri(:get, "http://example.com/title_best_choice", :response => fixture_file("title_best_choice.response"))
|
45
|
+
FakeWeb.register_uri(:get, "http://example.com/title_in_head_with_whitespace", :response => fixture_file("title_in_head_with_whitespace.response"))
|
46
|
+
|
40
47
|
# These are older fixtures
|
41
48
|
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
42
49
|
FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.2.0
|
4
|
+
version: 4.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -289,6 +289,11 @@ files:
|
|
289
289
|
- spec/fixtures/tea-tron.com.response
|
290
290
|
- spec/fixtures/theonion-no-description.com.response
|
291
291
|
- spec/fixtures/theonion.com.response
|
292
|
+
- spec/fixtures/title_best_choice.response
|
293
|
+
- spec/fixtures/title_in_body.response
|
294
|
+
- spec/fixtures/title_in_h1.response
|
295
|
+
- spec/fixtures/title_in_head.response
|
296
|
+
- spec/fixtures/title_in_head_with_whitespace.response
|
292
297
|
- spec/fixtures/twitter_markupvalidator.response
|
293
298
|
- spec/fixtures/unsafe_facebook.com.response
|
294
299
|
- spec/fixtures/unsafe_https.facebook.com.response
|