thumbnail_scraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +36 -0
- data/LICENSE +22 -0
- data/README.md +37 -0
- data/Rakefile +5 -0
- data/lib/thumbnail_scraper.rb +44 -0
- data/lib/thumbnail_scraper/http_receiver.rb +17 -0
- data/lib/thumbnail_scraper/image.rb +28 -0
- data/lib/thumbnail_scraper/version.rb +3 -0
- data/lib/thumbnail_scraper/webpage.rb +65 -0
- data/lib/thumbnail_scraper/with_smart_url.rb +17 -0
- data/spec/sample_pages/images.html +12 -0
- data/spec/sample_pages/images/biggest.jpg +0 -0
- data/spec/sample_pages/images/kitty.jpg +0 -0
- data/spec/sample_pages/images/subcatalog/smallest.jpg +0 -0
- data/spec/sample_pages/img_src.html +9 -0
- data/spec/sample_pages/og_image.html +8 -0
- data/spec/thumbnail_scraper_helper.rb +11 -0
- data/spec/thumbnail_scraper_spec.rb +52 -0
- data/spec/units/http_receiver_spec.rb +24 -0
- data/spec/units/thumbnail_scraper_spec.rb +182 -0
- data/spec/units/webpage_spec.rb +177 -0
- data/thumbnail_scraper.gemspec +23 -0
- metadata +161 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
thumbnail_scraper (0.0.1)
|
5
|
+
fastimage
|
6
|
+
httpclient
|
7
|
+
nokogiri
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
addressable (2.2.8)
|
13
|
+
crack (0.3.1)
|
14
|
+
diff-lcs (1.1.3)
|
15
|
+
fastimage (1.2.13)
|
16
|
+
httpclient (2.2.5)
|
17
|
+
nokogiri (1.5.3)
|
18
|
+
rspec (2.11.0)
|
19
|
+
rspec-core (~> 2.11.0)
|
20
|
+
rspec-expectations (~> 2.11.0)
|
21
|
+
rspec-mocks (~> 2.11.0)
|
22
|
+
rspec-core (2.11.1)
|
23
|
+
rspec-expectations (2.11.1)
|
24
|
+
diff-lcs (~> 1.1.3)
|
25
|
+
rspec-mocks (2.11.1)
|
26
|
+
webmock (1.7.6)
|
27
|
+
addressable (~> 2.2, > 2.2.5)
|
28
|
+
crack (>= 0.1.7)
|
29
|
+
|
30
|
+
PLATFORMS
|
31
|
+
ruby
|
32
|
+
|
33
|
+
DEPENDENCIES
|
34
|
+
rspec
|
35
|
+
thumbnail_scraper!
|
36
|
+
webmock
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Jan Filipowski
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
Thumbnail Scraper
|
2
|
+
============
|
3
|
+
|
4
|
+
detect, fetch and generate a thumbnail for any url
|
5
|
+
|
6
|
+
Basic usage
|
7
|
+
-------------
|
8
|
+
```ruby
|
9
|
+
require 'thumbnail_scraper'
|
10
|
+
|
11
|
+
include ThumbnailScraper
|
12
|
+
scraper = ThumbnailScraper::ThumbnailScraper.new
|
13
|
+
image = scraper.image_to_thumbnail_for_url("http://www.monibuds.com/")
|
14
|
+
thumbnail_url = image.url
|
15
|
+
```
|
16
|
+
|
17
|
+
The ThumbnailScraper#image_to_thumbnail_for_url method returns an Image object, which exposes its size and url.
|
18
|
+
|
19
|
+
Suggested usage
|
20
|
+
---------------
|
21
|
+
|
22
|
+
We encourage you to use it with delayed_job as the job queue and dragonfly as the image storage tool. Your job could look like the following:
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
require 'thumbnail_scraper'
|
26
|
+
|
27
|
+
module Jobs
|
28
|
+
class ScrapThumbnailJob < Struct.new(:page)
|
29
|
+
def perform
|
30
|
+
scraper = ::ThumbnailScraper::ThumbnailScraper.new
|
31
|
+
image = scraper.image_to_thumbnail_for_url(page.url)
|
32
|
+
page.thumbnail_url = image.url.to_s
|
33
|
+
page.save!
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
```
|
data/Rakefile
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require "thumbnail_scraper/version"
|
2
|
+
require 'thumbnail_scraper/image'
|
3
|
+
require 'thumbnail_scraper/http_receiver'
|
4
|
+
|
5
|
+
module ThumbnailScraper
|
6
|
+
class ThumbnailScraper
|
7
|
+
attr_accessor :http_receiver
|
8
|
+
|
9
|
+
def initialize(receiver=HttpReceiver.new)
|
10
|
+
@http_receiver = receiver
|
11
|
+
end
|
12
|
+
|
13
|
+
def create_image(url)
|
14
|
+
Image.new(url)
|
15
|
+
end
|
16
|
+
|
17
|
+
def image_to_thumbnail_for_url(url)
|
18
|
+
webpage = http_receiver.receive_webpage(url)
|
19
|
+
if webpage.has_open_graph_image?
|
20
|
+
image = create_image(webpage.open_graph_image_url)
|
21
|
+
elsif webpage.has_linked_image?
|
22
|
+
image = create_image(webpage.linked_image_url)
|
23
|
+
else
|
24
|
+
image = select_best_possible_image_to_scrap(webpage.attached_images_urls)
|
25
|
+
end
|
26
|
+
image
|
27
|
+
end
|
28
|
+
|
29
|
+
def select_best_possible_image_to_scrap(images_urls)
|
30
|
+
images = images_urls.map{|image_url| create_image(image_url)}
|
31
|
+
valid_images = select_valid_images(images)
|
32
|
+
return nil if valid_images.empty?
|
33
|
+
valid_images.max{|a, b| a.area <=> b.area}
|
34
|
+
end
|
35
|
+
|
36
|
+
def select_valid_images(images)
|
37
|
+
images.select{|image| image_is_valid?(image)}
|
38
|
+
end
|
39
|
+
|
40
|
+
def image_is_valid?(image)
|
41
|
+
image.width >= 50 && image.height >= 50 && image.width.to_f / image.height.to_f <= 3 && image.height.to_f / image.width.to_f <= 3
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'thumbnail_scraper/webpage'
|
2
|
+
require 'httpclient'
|
3
|
+
|
4
|
+
module ThumbnailScraper
|
5
|
+
class HttpReceiver
|
6
|
+
attr_accessor :http_client
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@http_client = ::HTTPClient.new
|
10
|
+
end
|
11
|
+
|
12
|
+
def receive_webpage(url)
|
13
|
+
content = http_client.get_content(url)
|
14
|
+
Webpage.new(url, content)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'thumbnail_scraper/with_smart_url'
|
2
|
+
require 'fastimage'
|
3
|
+
|
4
|
+
module ThumbnailScraper
|
5
|
+
class Image
|
6
|
+
include WithSmartUrl
|
7
|
+
|
8
|
+
def initialize(url)
|
9
|
+
self.url = url
|
10
|
+
end
|
11
|
+
|
12
|
+
# Pixel dimensions of the image as reported by FastImage for #url.
#
# The lookup is performed at most once per instance. A `defined?` guard is
# used instead of `||=` so that a nil result (FastImage could not determine
# the size) is cached as well and the image is not fetched again by every
# call to #width, #height or #area.
#
# @return [Array<Integer>, nil] `[width, height]`, or nil when the size is unknown
def size
  return @size if defined?(@size)

  @size = ::FastImage.size(url.to_s)
end

# @return [Integer] width in pixels, or 0 when the size is unknown
def width
  dimensions = size
  dimensions ? dimensions[0] : 0
end

# @return [Integer] height in pixels, or 0 when the size is unknown
def height
  dimensions = size
  dimensions ? dimensions[1] : 0
end

# @return [Integer] width * height; 0 when the size is unknown
def area
  width * height
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'thumbnail_scraper/with_smart_url'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module ThumbnailScraper
|
5
|
+
class Webpage
|
6
|
+
attr_accessor :body
|
7
|
+
|
8
|
+
include WithSmartUrl
|
9
|
+
|
10
|
+
def initialize(url, body)
|
11
|
+
self.url = url
|
12
|
+
self.body = body
|
13
|
+
end
|
14
|
+
|
15
|
+
def document
|
16
|
+
Nokogiri::HTML::Document.parse(body)
|
17
|
+
end
|
18
|
+
|
19
|
+
# Resolves an image reference found in the page into an absolute URI.
#
# The reference is resolved against the page's own url with URI.join, which
# covers every form handled here:
#   * full urls ("http://host/a.jpg", "https://host/a.jpg") are returned as is,
#   * scheme-relative urls ("//host/a.jpg") take the page's scheme,
#   * absolute paths ("/a.jpg") keep the page's host,
#   * relative paths ("img/a.jpg", "../a.jpg") are resolved against the
#     page's directory.
#
# Resolving through URI.join (instead of copying the page url and replacing
# its path) means the page's query string and fragment do not leak into the
# image url, a query string on the image reference is preserved, and the
# result does not depend on the process working directory.
#
# @param image_path [String] value of an image attribute (src, href, content)
# @return [URI] absolute url of the image
# @raise [URI::InvalidURIError] when image_path is not a valid URI reference
def image_url(image_path)
  # Surrounding whitespace is not part of the reference.
  URI.join(url.to_s, image_path.strip)
end
|
35
|
+
|
36
|
+
def open_graph_image_url
|
37
|
+
return @open_graph_image_url if defined?(@open_graph_image_url)
|
38
|
+
elements = document.xpath("//meta[@property='og:image']/@content")
|
39
|
+
return nil if elements.empty?
|
40
|
+
image_path = elements.first.value
|
41
|
+
@open_graph_image_url = image_url(image_path)
|
42
|
+
end
|
43
|
+
|
44
|
+
def has_open_graph_image?
|
45
|
+
!open_graph_image_url.nil?
|
46
|
+
end
|
47
|
+
|
48
|
+
def linked_image_url
|
49
|
+
return @linked_image_url if defined?(@linked_image_url)
|
50
|
+
elements = document.xpath("//link[@rel='img_src']/@href")
|
51
|
+
return nil if elements.empty?
|
52
|
+
image_path = elements.first.value
|
53
|
+
@linked_image_url = image_url(image_path)
|
54
|
+
end
|
55
|
+
|
56
|
+
def has_linked_image?
|
57
|
+
!linked_image_url.nil?
|
58
|
+
end
|
59
|
+
|
60
|
+
def attached_images_urls
|
61
|
+
elements = document.xpath("//img/@src")
|
62
|
+
elements.map{|element| image_url(element.value)}
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'thumbnail_scraper_helper'
|
2
|
+
require 'thumbnail_scraper'
|
3
|
+
|
4
|
+
describe ThumbnailScraper::ThumbnailScraper do
|
5
|
+
before :each do
|
6
|
+
@thumbnail_scraper = ThumbnailScraper::ThumbnailScraper.new
|
7
|
+
end
|
8
|
+
|
9
|
+
context "for page with og:image" do
|
10
|
+
before :each do
|
11
|
+
stub_request(:get, "www.example.com/og_image.html").to_return(:body => asset_file("og_image.html"))
|
12
|
+
stub_request(:get, "www.example.com/images/kitty.jpg").to_return(:body => asset_file(File.join("images", "kitty.jpg")))
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "#image_to_thumbnail_for_url" do
|
16
|
+
it "should be found og:image" do
|
17
|
+
image_to_thumbnail = @thumbnail_scraper.image_to_thumbnail_for_url("http://www.example.com/og_image.html")
|
18
|
+
image_to_thumbnail.url.to_s.should == "http://www.example.com/images/kitty.jpg"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
context "for page with link img_src" do
|
24
|
+
before :each do
|
25
|
+
stub_request(:get, "www.example.com/img_src.html").to_return(:body => asset_file("img_src.html"))
|
26
|
+
stub_request(:get, "www.example.com/images/kitty.jpg").to_return(:body => asset_file(File.join("images", "kitty.jpg")))
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "#image_to_thumbnail_for_url" do
|
30
|
+
it "should be found link img_src" do
|
31
|
+
image_to_thumbnail = @thumbnail_scraper.image_to_thumbnail_for_url("http://www.example.com/img_src.html")
|
32
|
+
image_to_thumbnail.url.to_s.should == "http://www.example.com/images/kitty.jpg"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
context "for page with images" do
|
38
|
+
before :each do
|
39
|
+
stub_request(:get, "www.example.com/images.html").to_return(:body => asset_file("images.html"))
|
40
|
+
stub_request(:get, "www.example.com/images/kitty.jpg").to_return(:body => asset_file(File.join("images", "kitty.jpg")))
|
41
|
+
stub_request(:get, "www.example.com/images/biggest.jpg").to_return(:body => asset_file(File.join("images", "biggest.jpg")))
|
42
|
+
stub_request(:get, "www.example.com/images/subcatalog/smallest.jpg").to_return(:body => asset_file(File.join("images", "subcatalog", "smallest.jpg")))
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "#image_to_thumbnail_for_url" do
|
46
|
+
it "should be biggest image from page" do
|
47
|
+
image_to_thumbnail = @thumbnail_scraper.image_to_thumbnail_for_url("http://www.example.com/images.html")
|
48
|
+
image_to_thumbnail.url.to_s.should == "http://www.example.com/images/biggest.jpg"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'thumbnail_scraper_helper'
|
2
|
+
require 'thumbnail_scraper/http_receiver'
|
3
|
+
|
4
|
+
module ThumbnailScraper
|
5
|
+
describe HttpReceiver do
|
6
|
+
before :each do
|
7
|
+
@http_receiver = HttpReceiver.new
|
8
|
+
end
|
9
|
+
|
10
|
+
describe "#receive_webpage" do
|
11
|
+
before :each do
|
12
|
+
stub_request(:get, "www.example.com/index.html").to_return(:body => "<p>I'm a webpage</p>")
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should set Webpage#body" do
  # Without `.should` the comparison result is discarded and the example can never fail.
  @http_receiver.receive_webpage("http://www.example.com/index.html").body.should == "<p>I'm a webpage</p>"
end
|
18
|
+
|
19
|
+
it "should set Webpage#uri" do
|
20
|
+
@http_receiver.receive_webpage("http://www.example.com/index.html").url.should == URI.parse("http://www.example.com/index.html")
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,182 @@
|
|
1
|
+
require 'thumbnail_scraper_helper'
|
2
|
+
require 'thumbnail_scraper'
|
3
|
+
|
4
|
+
module ThumbnailScraper
|
5
|
+
describe ThumbnailScraper do
|
6
|
+
before :each do
|
7
|
+
@http_receiver = mock("http receiver mock")
|
8
|
+
@thumbnail_scraper = ThumbnailScraper.new(@http_receiver)
|
9
|
+
end
|
10
|
+
|
11
|
+
describe "#image_to_thumbnail_for_url" do
|
12
|
+
context "for webpage with open graph image" do
|
13
|
+
before :each do
|
14
|
+
@page_url = "http://www.example.com/index.html"
|
15
|
+
@image_url = "http://www.example.com/images/image.png"
|
16
|
+
@webpage = mock("webpage")
|
17
|
+
@image = mock("image")
|
18
|
+
@webpage.stub!(:open_graph_image_url).and_return(@image_url)
|
19
|
+
@thumbnail_scraper.stub!(:create_image).with(@image_url).and_return(@image)
|
20
|
+
@webpage.stub!(:has_open_graph_image?).and_return(true)
|
21
|
+
@http_receiver.stub!(:receive_webpage).with(@page_url).and_return(@webpage)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should return image" do
|
25
|
+
@thumbnail_scraper.image_to_thumbnail_for_url(@page_url).should == @image
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
context "for webpage with link img_src" do
|
30
|
+
before :each do
|
31
|
+
@page_url = "http://www.example.com/index.html"
|
32
|
+
@image_url = "http://www.example.com/images/image.png"
|
33
|
+
@webpage = mock("webpage")
|
34
|
+
@image = mock("image")
|
35
|
+
@webpage.stub!(:has_open_graph_image?).and_return(false)
|
36
|
+
@webpage.stub!(:linked_image_url).and_return(@image_url)
|
37
|
+
@webpage.stub!(:has_linked_image?).and_return(true)
|
38
|
+
@thumbnail_scraper.stub!(:create_image).with(@image_url).and_return(@image)
|
39
|
+
@http_receiver.stub!(:receive_webpage).with(@page_url).and_return(@webpage)
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should return image" do
|
43
|
+
@thumbnail_scraper.image_to_thumbnail_for_url(@page_url).should == @image
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
context "for webpage with only images" do
|
48
|
+
before :each do
|
49
|
+
@page_url = "http://www.example.com/index.html"
|
50
|
+
@webpage = mock("webpage")
|
51
|
+
@webpage.stub!(:has_open_graph_image?).and_return(false)
|
52
|
+
@webpage.stub!(:has_linked_image?).and_return(false)
|
53
|
+
@webpage.stub!(:attached_images_urls).and_return([:first, :second, :third])
|
54
|
+
@thumbnail_scraper.stub!(:create_image).and_return(:image)
|
55
|
+
@http_receiver.stub!(:receive_webpage).with(@page_url).and_return(@webpage)
|
56
|
+
@http_receiver.stub!(:receive_image).with(:image).and_return(:image_content)
|
57
|
+
@thumbnail_scraper.stub!(:select_best_possible_image_to_scrap).and_return(:image)
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should use Website#attached_images_urls to get all images" do
|
61
|
+
@webpage.should_receive(:attached_images_urls).and_return([:first, :second, :third])
|
62
|
+
@thumbnail_scraper.image_to_thumbnail_for_url(@page_url)
|
63
|
+
end
|
64
|
+
|
65
|
+
it "should select_best_possible_image_to_scrap" do
|
66
|
+
@thumbnail_scraper.should_receive(:select_best_possible_image_to_scrap).with([:first, :second, :third]).and_return(:image)
|
67
|
+
@thumbnail_scraper.image_to_thumbnail_for_url(@page_url)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
describe "#select_best_possible_image_to_scrap" do
|
73
|
+
before :each do
|
74
|
+
@image1 = mock("image")
|
75
|
+
@image1.stub!(:area).and_return(250)
|
76
|
+
@image2 = mock("image")
|
77
|
+
@image2.stub!(:area).and_return(1000)
|
78
|
+
@images_urls = [:first, :second]
|
79
|
+
@thumbnail_scraper.stub!(:create_image).with(:first).and_return(@image1)
|
80
|
+
@thumbnail_scraper.stub!(:create_image).with(:second).and_return(@image2)
|
81
|
+
@thumbnail_scraper.stub!(:select_valid_images).with([@image1, @image2]).and_return([@image1, @image2])
|
82
|
+
end
|
83
|
+
|
84
|
+
it "should create all images" do
|
85
|
+
@thumbnail_scraper.should_receive(:create_image).with(:first).and_return(@image1)
|
86
|
+
@thumbnail_scraper.should_receive(:create_image).with(:second).and_return(@image2)
|
87
|
+
@thumbnail_scraper.select_best_possible_image_to_scrap(@images_urls)
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should filter out invalid images" do
|
91
|
+
@thumbnail_scraper.should_receive(:select_valid_images).with([@image1, @image2]).and_return([@image1, @image2])
|
92
|
+
@thumbnail_scraper.select_best_possible_image_to_scrap(@images_urls)
|
93
|
+
end
|
94
|
+
|
95
|
+
it "should be biggest (by area) image of valid ones" do
|
96
|
+
@thumbnail_scraper.select_best_possible_image_to_scrap(@images_urls).should == @image2
|
97
|
+
end
|
98
|
+
|
99
|
+
context "for all invalid images" do
|
100
|
+
it "should be nil" do
|
101
|
+
@thumbnail_scraper.should_receive(:select_valid_images).and_return([])
|
102
|
+
@thumbnail_scraper.select_best_possible_image_to_scrap(@images_urls).should be_nil
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
describe "#select_valid_images" do
|
108
|
+
before :each do
|
109
|
+
@image1 = mock("image")
|
110
|
+
@image2 = mock("image")
|
111
|
+
@thumbnail_scraper.stub!(:image_is_valid?).with(@image1).and_return(true)
|
112
|
+
@thumbnail_scraper.stub!(:image_is_valid?).with(@image2).and_return(false)
|
113
|
+
end
|
114
|
+
|
115
|
+
it "should use #image_is_valid? to validate image" do
  # Message expectations (not plain stubs) so the example fails unless every
  # image is actually passed to #image_is_valid?.
  @thumbnail_scraper.should_receive(:image_is_valid?).with(@image1).and_return(true)
  @thumbnail_scraper.should_receive(:image_is_valid?).with(@image2).and_return(false)
  @thumbnail_scraper.select_valid_images([@image1, @image2])
end
|
120
|
+
|
121
|
+
it "should select only valid images" do
|
122
|
+
@thumbnail_scraper.select_valid_images([@image1, @image2]).should == [@image1]
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
describe "#image_is_valid?" do
|
127
|
+
before :each do
|
128
|
+
@image = mock("image")
|
129
|
+
@image.stub!(:width).and_return(100)
|
130
|
+
@image.stub!(:height).and_return(100)
|
131
|
+
end
|
132
|
+
|
133
|
+
context "for small image width" do
|
134
|
+
before :each do
|
135
|
+
@image.stub!(:width).and_return(49)
|
136
|
+
end
|
137
|
+
|
138
|
+
it "should be false" do
|
139
|
+
@thumbnail_scraper.image_is_valid?(@image).should be_false
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
context "for small image height" do
|
144
|
+
before :each do
|
145
|
+
@image.stub!(:height).and_return(49)
|
146
|
+
end
|
147
|
+
|
148
|
+
it "should be false" do
|
149
|
+
@thumbnail_scraper.image_is_valid?(@image).should be_false
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
context "width to height ratio bigger than 3:1" do
|
154
|
+
before :each do
|
155
|
+
@image.stub!(:width).and_return(301)
|
156
|
+
@image.stub!(:height).and_return(100)
|
157
|
+
end
|
158
|
+
|
159
|
+
it "should be false" do
|
160
|
+
@thumbnail_scraper.image_is_valid?(@image).should be_false
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
164
|
+
context "height to width ratio bigger than 3:1" do
|
165
|
+
before :each do
|
166
|
+
@image.stub!(:width).and_return(100)
|
167
|
+
@image.stub!(:height).and_return(301)
|
168
|
+
end
|
169
|
+
|
170
|
+
it "should be false" do
|
171
|
+
@thumbnail_scraper.image_is_valid?(@image).should be_false
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
context "otherwise" do
|
176
|
+
it "should be true" do
|
177
|
+
@thumbnail_scraper.image_is_valid?(@image).should be_true
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
require 'thumbnail_scraper_helper'
|
2
|
+
require 'thumbnail_scraper/webpage'
|
3
|
+
|
4
|
+
module ThumbnailScraper
|
5
|
+
describe Webpage do
|
6
|
+
before :each do
|
7
|
+
@url = "http://www.example.com/site/site.html"
|
8
|
+
@webpage = Webpage.new(@url, "")
|
9
|
+
end
|
10
|
+
|
11
|
+
describe "#image_url" do
|
12
|
+
context "for relative path" do
|
13
|
+
it "should be with absolute path based on page url and image relative path" do
|
14
|
+
@webpage.image_url("images/kitty.jpg").path.should == "/site/images/kitty.jpg"
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should have same host as page" do
|
18
|
+
@webpage.image_url("images/kitty.jpg").host.should == "www.example.com"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
context "for absolute path" do
|
23
|
+
it "should be with absolute path based on image path" do
|
24
|
+
@webpage.image_url("/images/kitty.jpg").path.should == "/images/kitty.jpg"
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should have same host as page" do
|
28
|
+
@webpage.image_url("/images/kitty.jpg").host.should == "www.example.com"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
context "for full uri" do
|
33
|
+
it "should be with absolute path based on image path" do
|
34
|
+
@webpage.image_url("http://outersite.com/images/kitty.jpg").path.should == "/images/kitty.jpg"
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should have same host as page" do
|
38
|
+
@webpage.image_url("http://outersite.com/images/kitty.jpg").host.should == "outersite.com"
|
39
|
+
end
|
40
|
+
|
41
|
+
context "for https address" do
|
42
|
+
it "should have image's host" do
|
43
|
+
@webpage.image_url("https://outersite.com/images/kitty.jpg").host.should == "outersite.com"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
context "for shortcut address" do
|
48
|
+
it "should have image's host" do
|
49
|
+
@webpage.image_url("//outersite.com/images/kitty.jpg").host.should == "outersite.com"
|
50
|
+
end
|
51
|
+
|
52
|
+
it "should have same scheme as page" do
|
53
|
+
@webpage.image_url("//outersite.com/images/kitty.jpg").scheme.should == "http"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe "#open_graph_image_url" do
|
60
|
+
context "for document with og:image" do
|
61
|
+
before :each do
|
62
|
+
@webpage.body = File.read(asset_file_path("og_image.html"))
|
63
|
+
@webpage.stub!(:image_url).and_return(:image_url)
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should use #image_url to construct final image url" do
|
67
|
+
@webpage.should_receive(:image_url).with("images/kitty.jpg").and_return(:image_url)
|
68
|
+
@webpage.open_graph_image_url
|
69
|
+
end
|
70
|
+
|
71
|
+
it "should return url constructed by image_url" do
|
72
|
+
@webpage.open_graph_image_url.should == :image_url
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
context "for document without og:image" do
|
77
|
+
before :each do
|
78
|
+
@webpage.body = File.read(asset_file_path("img_src.html"))
|
79
|
+
end
|
80
|
+
|
81
|
+
it "should be nil" do
|
82
|
+
@webpage.open_graph_image_url.should be_nil
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe "#has_open_graph_image" do
|
88
|
+
context "for nil #open_graph_image_url" do
|
89
|
+
before :each do
|
90
|
+
@webpage.stub!(:open_graph_image_url).and_return(nil)
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should be false" do
|
94
|
+
@webpage.has_open_graph_image?.should be_false
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
context "for not nil #open_graph_image_url" do
|
99
|
+
before :each do
|
100
|
+
@webpage.stub!(:open_graph_image_url).and_return(:image_url)
|
101
|
+
end
|
102
|
+
|
103
|
+
it "should be true" do
|
104
|
+
@webpage.has_open_graph_image?.should be_true
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
describe "#linked_image_url" do
|
110
|
+
context "for document with link img_src" do
|
111
|
+
before :each do
|
112
|
+
@webpage.body = File.read(asset_file_path("img_src.html"))
|
113
|
+
@webpage.stub!(:image_url).and_return(:image_url)
|
114
|
+
end
|
115
|
+
|
116
|
+
it "should use #image_url to construct final image url" do
|
117
|
+
@webpage.should_receive(:image_url).with("images/kitty.jpg").and_return(:image_url)
|
118
|
+
@webpage.linked_image_url
|
119
|
+
end
|
120
|
+
|
121
|
+
it "should return url constructed by image_url" do
|
122
|
+
@webpage.linked_image_url.should == :image_url
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
context "for document without link img_src" do
|
127
|
+
before :each do
|
128
|
+
@webpage.body = File.read(asset_file_path("og_image.html"))
|
129
|
+
end
|
130
|
+
|
131
|
+
it "should be nil" do
|
132
|
+
@webpage.linked_image_url.should be_nil
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
describe "#has_linked_image" do
|
138
|
+
context "for nil #linked_image_url" do
|
139
|
+
before :each do
|
140
|
+
@webpage.stub!(:linked_image_url).and_return(nil)
|
141
|
+
end
|
142
|
+
|
143
|
+
it "should be false" do
|
144
|
+
@webpage.has_linked_image?.should be_false
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
context "for not nil #linked_image_url" do
|
149
|
+
before :each do
|
150
|
+
@webpage.stub!(:linked_image_url).and_return(:image_url)
|
151
|
+
end
|
152
|
+
|
153
|
+
it "should be true" do
|
154
|
+
@webpage.has_linked_image?.should be_true
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
describe "#attached_images_urls" do
|
160
|
+
context "with images in body" do
|
161
|
+
before :each do
|
162
|
+
@webpage.body = File.read(asset_file_path("images.html"))
|
163
|
+
@webpage.stub!(:image_url).with("/images/subcatalog/smallest.jpg").and_return(:smallest_url)
|
164
|
+
@webpage.stub!(:image_url).with("/images/kitty.jpg").and_return(:kitty_url)
|
165
|
+
@webpage.stub!(:image_url).with("images/biggest.jpg").and_return(:biggest_url)
|
166
|
+
end
|
167
|
+
|
168
|
+
it "should return all urls of found images" do
|
169
|
+
urls = @webpage.attached_images_urls
|
170
|
+
urls.should include(:smallest_url)
|
171
|
+
urls.should include(:kitty_url)
|
172
|
+
urls.should include(:biggest_url)
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/thumbnail_scraper/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.authors = ["Jeppe Liisberg", "Jan Filipowski"]
|
6
|
+
gem.email = ["jachuf@gmail.com"]
|
7
|
+
gem.description = %q{detect, fetch and generate a thumbnail for any url and store it on s3}
|
8
|
+
gem.summary = %q{detect, fetch and generate a thumbnail for any url and store it on s3}
|
9
|
+
gem.homepage = ""
|
10
|
+
|
11
|
+
gem.add_development_dependency "rspec"
|
12
|
+
gem.add_development_dependency "webmock"
|
13
|
+
gem.add_runtime_dependency "httpclient"
|
14
|
+
gem.add_runtime_dependency "fastimage"
|
15
|
+
gem.add_runtime_dependency "nokogiri"
|
16
|
+
|
17
|
+
gem.files = `git ls-files`.split($\)
|
18
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
19
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
20
|
+
gem.name = "thumbnail_scraper"
|
21
|
+
gem.require_paths = ["lib"]
|
22
|
+
gem.version = ThumbnailScraper::VERSION
|
23
|
+
end
|
metadata
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: thumbnail_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jeppe Liisberg
|
9
|
+
- Jan Filipowski
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2012-07-30 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
none: false
|
27
|
+
requirements:
|
28
|
+
- - ! '>='
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '0'
|
31
|
+
- !ruby/object:Gem::Dependency
|
32
|
+
name: webmock
|
33
|
+
requirement: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
type: :development
|
40
|
+
prerelease: false
|
41
|
+
version_requirements: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: httpclient
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: fastimage
|
65
|
+
requirement: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ! '>='
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
type: :runtime
|
72
|
+
prerelease: false
|
73
|
+
version_requirements: !ruby/object:Gem::Requirement
|
74
|
+
none: false
|
75
|
+
requirements:
|
76
|
+
- - ! '>='
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: '0'
|
79
|
+
- !ruby/object:Gem::Dependency
|
80
|
+
name: nokogiri
|
81
|
+
requirement: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ! '>='
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0'
|
87
|
+
type: :runtime
|
88
|
+
prerelease: false
|
89
|
+
version_requirements: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
91
|
+
requirements:
|
92
|
+
- - ! '>='
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
description: detect, fetch and generate a thumbnail for any url and store it on s3
|
96
|
+
email:
|
97
|
+
- jachuf@gmail.com
|
98
|
+
executables: []
|
99
|
+
extensions: []
|
100
|
+
extra_rdoc_files: []
|
101
|
+
files:
|
102
|
+
- .gitignore
|
103
|
+
- Gemfile
|
104
|
+
- Gemfile.lock
|
105
|
+
- LICENSE
|
106
|
+
- README.md
|
107
|
+
- Rakefile
|
108
|
+
- lib/thumbnail_scraper.rb
|
109
|
+
- lib/thumbnail_scraper/http_receiver.rb
|
110
|
+
- lib/thumbnail_scraper/image.rb
|
111
|
+
- lib/thumbnail_scraper/version.rb
|
112
|
+
- lib/thumbnail_scraper/webpage.rb
|
113
|
+
- lib/thumbnail_scraper/with_smart_url.rb
|
114
|
+
- spec/sample_pages/images.html
|
115
|
+
- spec/sample_pages/images/biggest.jpg
|
116
|
+
- spec/sample_pages/images/kitty.jpg
|
117
|
+
- spec/sample_pages/images/subcatalog/smallest.jpg
|
118
|
+
- spec/sample_pages/img_src.html
|
119
|
+
- spec/sample_pages/og_image.html
|
120
|
+
- spec/thumbnail_scraper_helper.rb
|
121
|
+
- spec/thumbnail_scraper_spec.rb
|
122
|
+
- spec/units/http_receiver_spec.rb
|
123
|
+
- spec/units/thumbnail_scraper_spec.rb
|
124
|
+
- spec/units/webpage_spec.rb
|
125
|
+
- thumbnail_scraper.gemspec
|
126
|
+
homepage: ''
|
127
|
+
licenses: []
|
128
|
+
post_install_message:
|
129
|
+
rdoc_options: []
|
130
|
+
require_paths:
|
131
|
+
- lib
|
132
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
133
|
+
none: false
|
134
|
+
requirements:
|
135
|
+
- - ! '>='
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '0'
|
138
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
+
none: false
|
140
|
+
requirements:
|
141
|
+
- - ! '>='
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
requirements: []
|
145
|
+
rubyforge_project:
|
146
|
+
rubygems_version: 1.8.24
|
147
|
+
signing_key:
|
148
|
+
specification_version: 3
|
149
|
+
summary: detect, fetch and generate a thumbnail for any url and store it on s3
|
150
|
+
test_files:
|
151
|
+
- spec/sample_pages/images.html
|
152
|
+
- spec/sample_pages/images/biggest.jpg
|
153
|
+
- spec/sample_pages/images/kitty.jpg
|
154
|
+
- spec/sample_pages/images/subcatalog/smallest.jpg
|
155
|
+
- spec/sample_pages/img_src.html
|
156
|
+
- spec/sample_pages/og_image.html
|
157
|
+
- spec/thumbnail_scraper_helper.rb
|
158
|
+
- spec/thumbnail_scraper_spec.rb
|
159
|
+
- spec/units/http_receiver_spec.rb
|
160
|
+
- spec/units/thumbnail_scraper_spec.rb
|
161
|
+
- spec/units/webpage_spec.rb
|