metainspector 4.1.0 → 4.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -0
- data/lib/meta_inspector/document.rb +13 -10
- data/lib/meta_inspector/parser.rb +2 -1
- data/lib/meta_inspector/parsers/images.rb +41 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -0
- data/spec/fixtures/100x100.jpg.response +0 -0
- data/spec/fixtures/10x10.jpg.response +0 -0
- data/spec/fixtures/largest_image_in_html.response +23 -0
- data/spec/fixtures/largest_image_using_image_size.response +23 -0
- data/spec/meta_inspector/images_spec.rb +43 -3
- data/spec/spec_helper.rb +6 -0
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bbcf96088ef49b859442dfd0244bda8b7e4870fb
|
4
|
+
data.tar.gz: 3f87e155f4e1d260f6eff96867b458278a8a450b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 005a2f07c88b2ca40bcf970ef7eabe2eb0b40d76bb1556a67a0bfbbcc170ab6be1836942dd841d97cd9a110fb25dcd8cd6e3aeb3f164cd2f3dcc020bb7708d27
|
7
|
+
data.tar.gz: 26ef69520abd2564e431a22dd3e6d139263e7163a250478c8a70dd41d3fadbfb22a3828757d645192d763d7ea0e2c5298b53d874a3f999b03ff1d68451ad422b
|
data/README.md
CHANGED
@@ -8,6 +8,19 @@ You give it an URL, and it lets you easily get its title, links, images, charset
|
|
8
8
|
|
9
9
|
You can try MetaInspector live at this little demo: [https://metainspectordemo.herokuapp.com](https://metainspectordemo.herokuapp.com)
|
10
10
|
|
11
|
+
## Changes in 4.2.0
|
12
|
+
|
13
|
+
* The images API has been extended, with two new methods:
|
14
|
+
|
15
|
+
* `page.images.owner_suggested` returns the OG or Twitter image, or `nil` if neither is present.
|
16
|
+
* `page.images.largest` returns the largest image found in the page. It uses the HTML `height` and `width` attributes and, for images that lack them, the [fastimage](https://github.com/sdsykes/fastimage) gem, and returns the largest image on the page whose aspect ratio is squarer than 1:10 or 10:1. This usually provides a good alternative to the OG or Twitter images if they are not supplied.
|
17
|
+
|
18
|
+
* The criteria for `page.images.best` have changed slightly: we now return the largest image instead of the first image if no owner-suggested image is found.
|
19
|
+
|
20
|
+
## Changes in 4.1.0
|
21
|
+
|
22
|
+
* Introduces the `:normalize_url` option, which allows you to disable URL normalization.
|
23
|
+
|
11
24
|
## Changes in 4.0
|
12
25
|
|
13
26
|
* The links API has been changed, now instead of `page.links`, `page.internal_links` and `page.external_links` we have:
|
@@ -339,6 +352,16 @@ While this is generally useful, it can be [tricky](https://github.com/sporkmonge
|
|
339
352
|
|
340
353
|
You can disable URL normalization by passing the `normalize_url: false` option.
|
341
354
|
|
355
|
+
### Image downloading
|
356
|
+
|
357
|
+
When you ask for the largest image on the page with `page.images.largest`, it is determined from the height and width attributes in the HTML markup and, for images that are missing those attributes, by downloading a small portion of each image using the [fastimage](https://github.com/sdsykes/fastimage) gem. This is fast because it doesn't download the entire images, normally just the headers of the image files.
|
358
|
+
|
359
|
+
If you want to disable image downloading, pass the `download_images: false` option:
|
360
|
+
|
361
|
+
```ruby
|
362
|
+
page = MetaInspector.new('http://example.com', download_images: false)
|
363
|
+
```
|
364
|
+
|
342
365
|
## Exception Handling
|
343
366
|
|
344
367
|
By default, MetaInspector will raise the exceptions found. We think that this is the safest default: in case the URL you're trying to scrape is unreachable, you should clearly be notified, and treat the exception as needed in your app.
|
@@ -26,19 +26,21 @@ module MetaInspector
|
|
26
26
|
@html_content_only = options[:html_content_only]
|
27
27
|
@allow_redirections = options[:allow_redirections]
|
28
28
|
@document = options[:document]
|
29
|
+
@download_images = options[:download_images]
|
29
30
|
@headers = options[:headers]
|
30
31
|
@warn_level = options[:warn_level]
|
31
32
|
@exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
|
32
33
|
@normalize_url = options[:normalize_url]
|
33
|
-
@url = MetaInspector::URL.new(initial_url, exception_log:
|
34
|
-
normalize:
|
35
|
-
@request = MetaInspector::Request.new(@url,
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
@parser = MetaInspector::Parser.new(self,
|
34
|
+
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log,
|
35
|
+
normalize: @normalize_url)
|
36
|
+
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
37
|
+
connection_timeout: @connection_timeout,
|
38
|
+
read_timeout: @read_timeout,
|
39
|
+
retries: @retries,
|
40
|
+
exception_log: @exception_log,
|
41
|
+
headers: @headers) unless @document
|
42
|
+
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log,
|
43
|
+
download_images: @download_images)
|
42
44
|
end
|
43
45
|
|
44
46
|
extend Forwardable
|
@@ -81,7 +83,8 @@ module MetaInspector
|
|
81
83
|
:warn_level => :raise,
|
82
84
|
:headers => { 'User-Agent' => default_user_agent },
|
83
85
|
:allow_redirections => true,
|
84
|
-
:normalize_url => true
|
86
|
+
:normalize_url => true,
|
87
|
+
:download_images => true }
|
85
88
|
end
|
86
89
|
|
87
90
|
def default_user_agent
|
@@ -15,7 +15,8 @@ module MetaInspector
|
|
15
15
|
@exception_log = options[:exception_log]
|
16
16
|
@meta_tag_parser = MetaInspector::Parsers::MetaTagsParser.new(self)
|
17
17
|
@links_parser = MetaInspector::Parsers::LinksParser.new(self)
|
18
|
-
@
|
18
|
+
@download_images = options[:download_images]
|
19
|
+
@images_parser = MetaInspector::Parsers::ImagesParser.new(self, download_images: @download_images)
|
19
20
|
@texts_parser = MetaInspector::Parsers::TextsParser.new(self)
|
20
21
|
end
|
21
22
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'fastimage'
|
2
|
+
|
1
3
|
module MetaInspector
|
2
4
|
module Parsers
|
3
5
|
class ImagesParser < Base
|
@@ -6,16 +8,53 @@ module MetaInspector
|
|
6
8
|
|
7
9
|
include Enumerable
|
8
10
|
|
11
|
+
def initialize(main_parser, options = {})
|
12
|
+
@download_images = options[:download_images]
|
13
|
+
super(main_parser)
|
14
|
+
end
|
15
|
+
|
9
16
|
def images
|
10
17
|
self
|
11
18
|
end
|
12
19
|
|
20
|
+
# Returns either the Facebook Open Graph image, twitter suggested image or
|
21
|
+
# the largest image in the image collection
|
22
|
+
def best
|
23
|
+
owner_suggested || largest
|
24
|
+
end
|
25
|
+
|
13
26
|
# Returns the parsed image from Facebook's open graph property tags
|
14
27
|
# Most major websites now define this property, and it is usually relevant
|
15
28
|
# See doc at http://developers.facebook.com/docs/opengraph/
|
16
29
|
# If none found, tries with Twitter image
|
17
|
-
def
|
18
|
-
meta['og:image'] || meta['twitter:image']
|
30
|
+
def owner_suggested
|
31
|
+
meta['og:image'] || meta['twitter:image']
|
32
|
+
end
|
33
|
+
|
34
|
+
# Returns the largest image from the image collection,
|
35
|
+
# filtered for images that are more square than 10:1 or 1:10
|
36
|
+
def largest()
|
37
|
+
@larget_image ||= begin
|
38
|
+
img_nodes = parsed.search('//img')
|
39
|
+
sizes = img_nodes.map { |img_node| [URL.absolutify(img_node['src'], base_url), img_node['width'], img_node['height']] }
|
40
|
+
sizes.uniq! { |url, width, height| url }
|
41
|
+
if @download_images
|
42
|
+
sizes.map! do |url, width, height|
|
43
|
+
width, height = FastImage.size(url) if width.nil? || height.nil?
|
44
|
+
[url, width, height]
|
45
|
+
end
|
46
|
+
else
|
47
|
+
sizes.map! do |url, width, height|
|
48
|
+
width, height = [0, 0] if width.nil? || height.nil?
|
49
|
+
[url, width, height]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
sizes.map! { |url, width, height| [url, width.to_i * height.to_i, width.to_f / height.to_f] }
|
53
|
+
sizes.keep_if { |url, area, ratio| ratio > 0.1 && ratio < 10 }
|
54
|
+
sizes.sort_by! { |url, area, ratio| -area }
|
55
|
+
url, area, ratio = sizes.first
|
56
|
+
url
|
57
|
+
end
|
19
58
|
end
|
20
59
|
|
21
60
|
# Returns the favicon URL if it exists
|
data/meta_inspector.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |gem|
|
|
19
19
|
gem.add_dependency 'faraday_middleware', '~> 0.9.1'
|
20
20
|
gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
|
21
21
|
gem.add_dependency 'addressable', '~> 2.3.5'
|
22
|
+
gem.add_dependency 'fastimage'
|
22
23
|
|
23
24
|
gem.add_development_dependency 'rspec', '2.14.1'
|
24
25
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
Binary file
|
Binary file
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>An example page</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<img src="/too_narrow" width="10" height="100" />
|
18
|
+
<img src="/smaller" width="10" height="10" />
|
19
|
+
<img src="/largest" width="100" height="100" />
|
20
|
+
<img src="/too_wide" width="100" height="10" />
|
21
|
+
<img src="/smallest" width="1" height="1" />
|
22
|
+
</body>
|
23
|
+
</html>
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>An example page</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<img src="/10x100" width="10" height="100" />
|
18
|
+
<img src="/10x10" />
|
19
|
+
<img src="/100x100" />
|
20
|
+
<img src="/100x10" width="100" height="10" />
|
21
|
+
<img src="/1x1" width="1" height="1" />
|
22
|
+
</body>
|
23
|
+
</html>
|
@@ -70,7 +70,7 @@ describe MetaInspector do
|
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
73
|
-
describe "
|
73
|
+
describe "images.best" do
|
74
74
|
it "should find the og image" do
|
75
75
|
page = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
76
76
|
|
@@ -84,9 +84,49 @@ describe MetaInspector do
|
|
84
84
|
end
|
85
85
|
|
86
86
|
it "should find image when og:image and twitter:image metatags are missing" do
|
87
|
-
page = MetaInspector.new('http://
|
87
|
+
page = MetaInspector.new('http://example.com/largest_image_using_image_size')
|
88
88
|
|
89
|
-
page.images.best.should == "http://
|
89
|
+
page.images.best.should == "http://example.com/100x100"
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
describe "images.owner_suggested" do
|
94
|
+
it "should find the og image" do
|
95
|
+
page = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
96
|
+
|
97
|
+
page.images.owner_suggested.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
98
|
+
end
|
99
|
+
|
100
|
+
it "should find image on youtube" do
|
101
|
+
page = MetaInspector.new('http://www.youtube.com/watch?v=iaGSSrp49uc')
|
102
|
+
|
103
|
+
page.images.owner_suggested.should == "http://i2.ytimg.com/vi/iaGSSrp49uc/mqdefault.jpg"
|
104
|
+
end
|
105
|
+
|
106
|
+
it "should return nil when og:image and twitter:image metatags are missing" do
|
107
|
+
page = MetaInspector.new('http://example.com/largest_image_using_image_size')
|
108
|
+
|
109
|
+
page.images.owner_suggested.should be nil
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
describe "images.largest" do
|
114
|
+
it "should find the largest image on the page using html sizes" do
|
115
|
+
page = MetaInspector.new('http://example.com/largest_image_in_html')
|
116
|
+
|
117
|
+
page.images.largest.should == "http://example.com/largest"
|
118
|
+
end
|
119
|
+
|
120
|
+
it "should find the largest image on the page using actual image sizes" do
|
121
|
+
page = MetaInspector.new('http://example.com/largest_image_using_image_size')
|
122
|
+
|
123
|
+
page.images.largest.should == "http://example.com/100x100"
|
124
|
+
end
|
125
|
+
|
126
|
+
it "should find the largest image without downloading images" do
|
127
|
+
page = MetaInspector.new('http://example.com/largest_image_using_image_size', download_images: false)
|
128
|
+
|
129
|
+
page.images.largest.should == "http://example.com/1x1"
|
90
130
|
end
|
91
131
|
end
|
92
132
|
|
data/spec/spec_helper.rb
CHANGED
@@ -31,6 +31,12 @@ FakeWeb.register_uri(:get, "http://example.com/", :response => fixture_file("exa
|
|
31
31
|
# Used to test response status codes
|
32
32
|
FakeWeb.register_uri(:get, "http://example.com/404", :response => fixture_file("404.response"))
|
33
33
|
|
34
|
+
# Used to test largest image in page logic
|
35
|
+
FakeWeb.register_uri(:get, "http://example.com/largest_image_in_html", :response => fixture_file("largest_image_in_html.response"))
|
36
|
+
FakeWeb.register_uri(:get, "http://example.com/largest_image_using_image_size", :response => fixture_file("largest_image_using_image_size.response"))
|
37
|
+
FakeWeb.register_uri(:get, "http://example.com/10x10", :response => fixture_file("10x10.jpg.response"))
|
38
|
+
FakeWeb.register_uri(:get, "http://example.com/100x100", :response => fixture_file("100x100.jpg.response"))
|
39
|
+
|
34
40
|
# These are older fixtures
|
35
41
|
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
36
42
|
FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.
|
4
|
+
version: 4.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 2.3.5
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: fastimage
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: rspec
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -243,6 +257,8 @@ files:
|
|
243
257
|
- meta_inspector.gemspec
|
244
258
|
- spec/document_spec.rb
|
245
259
|
- spec/exception_log_spec.rb
|
260
|
+
- spec/fixtures/100x100.jpg.response
|
261
|
+
- spec/fixtures/10x10.jpg.response
|
246
262
|
- spec/fixtures/404.response
|
247
263
|
- spec/fixtures/alazan.com.response
|
248
264
|
- spec/fixtures/alazan_websolution.response
|
@@ -257,6 +273,8 @@ files:
|
|
257
273
|
- spec/fixtures/international.response
|
258
274
|
- spec/fixtures/invalid_href.response
|
259
275
|
- spec/fixtures/iteh.at.response
|
276
|
+
- spec/fixtures/largest_image_in_html.response
|
277
|
+
- spec/fixtures/largest_image_using_image_size.response
|
260
278
|
- spec/fixtures/malformed_href.response
|
261
279
|
- spec/fixtures/markupvalidator_faqs.response
|
262
280
|
- spec/fixtures/meta_tags.response
|