thumbnail_scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,20 @@
1
+ *.gem
2
+ *.rbc
3
+ *.rvmrc
4
+ .bundle
5
+ .config
6
+ coverage
7
+ InstalledFiles
8
+ lib/bundler/man
9
+ pkg
10
+ rdoc
11
+ spec/reports
12
+ test/tmp
13
+ test/version_tmp
14
+ tmp
15
+ *.swp
16
+
17
+ # YARD artifacts
18
+ .yardoc
19
+ _yardoc
20
+ doc/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in thumbnail_scraper.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,36 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ thumbnail_scraper (0.0.1)
5
+ fastimage
6
+ httpclient
7
+ nokogiri
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ addressable (2.2.8)
13
+ crack (0.3.1)
14
+ diff-lcs (1.1.3)
15
+ fastimage (1.2.13)
16
+ httpclient (2.2.5)
17
+ nokogiri (1.5.3)
18
+ rspec (2.11.0)
19
+ rspec-core (~> 2.11.0)
20
+ rspec-expectations (~> 2.11.0)
21
+ rspec-mocks (~> 2.11.0)
22
+ rspec-core (2.11.1)
23
+ rspec-expectations (2.11.1)
24
+ diff-lcs (~> 1.1.3)
25
+ rspec-mocks (2.11.1)
26
+ webmock (1.7.6)
27
+ addressable (~> 2.2, > 2.2.5)
28
+ crack (>= 0.1.7)
29
+
30
+ PLATFORMS
31
+ ruby
32
+
33
+ DEPENDENCIES
34
+ rspec
35
+ thumbnail_scraper!
36
+ webmock
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Jan Filipowski
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ Thumbnail Scraper
2
+ ============
3
+
4
+ detect, fetch and generate a thumbnail for any url
5
+
6
+ Basic usage
7
+ -------------
8
+ ```ruby
9
+ require 'thumbnail_scraper'
10
+
11
+ include ThumbnailScraper
12
+ scraper = ThumbnailScraper::ThumbnailScraper.new
13
+ image = scraper.image_to_thumbnail_for_url("http://www.monibuds.com/")
14
+ thumbnail_url = image.url
15
+ ```
16
+
17
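+ The `ThumbnailScraper#image_to_thumbnail_for_url` method returns an `Image` object, which exposes its size and url (or `nil` when the page has no `og:image`/`img_src` link and none of its images passes validation).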
18
+
19
+ Suggested usage
20
+ ---------------
21
+
22
+ We encourage you to use it with delayed_job as the job queue and dragonfly as the image storage tool. Your job could look like the following:
23
+
24
+ ```ruby
25
+ require 'thumbnail_scraper'
26
+
27
+ module Jobs
28
+ class ScrapThumbnailJob < Struct.new(:page)
29
+ def perform
30
+ scraper = ::ThumbnailScraper::ThumbnailScraper.new
31
+ image = scraper.image_to_thumbnail_for_url(page.url)
32
+ page.thumbnail_url = image.url.to_s
33
+ page.save!
34
+ end
35
+ end
36
+ end
37
+ ```
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require "rspec/core/rake_task"
4
+
5
+ RSpec::Core::RakeTask.new(:spec)
@@ -0,0 +1,44 @@
1
+ require "thumbnail_scraper/version"
2
+ require 'thumbnail_scraper/image'
3
+ require 'thumbnail_scraper/http_receiver'
4
+
5
+ module ThumbnailScraper
6
+ class ThumbnailScraper
7
+ attr_accessor :http_receiver
8
+
9
+ def initialize(receiver=HttpReceiver.new)
10
+ @http_receiver = receiver
11
+ end
12
+
13
+ def create_image(url)
14
+ Image.new(url)
15
+ end
16
+
17
+ def image_to_thumbnail_for_url(url)
18
+ webpage = http_receiver.receive_webpage(url)
19
+ if webpage.has_open_graph_image?
20
+ image = create_image(webpage.open_graph_image_url)
21
+ elsif webpage.has_linked_image?
22
+ image = create_image(webpage.linked_image_url)
23
+ else
24
+ image = select_best_possible_image_to_scrap(webpage.attached_images_urls)
25
+ end
26
+ image
27
+ end
28
+
29
+ def select_best_possible_image_to_scrap(images_urls)
30
+ images = images_urls.map{|image_url| create_image(image_url)}
31
+ valid_images = select_valid_images(images)
32
+ return nil if valid_images.empty?
33
+ valid_images.max{|a, b| a.area <=> b.area}
34
+ end
35
+
36
+ def select_valid_images(images)
37
+ images.select{|image| image_is_valid?(image)}
38
+ end
39
+
40
+ def image_is_valid?(image)
41
+ image.width >= 50 && image.height >= 50 && image.width.to_f / image.height.to_f <= 3 && image.height.to_f / image.width.to_f <= 3
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,17 @@
1
+ require 'thumbnail_scraper/webpage'
2
+ require 'httpclient'
3
+
4
+ module ThumbnailScraper
5
+ class HttpReceiver
6
+ attr_accessor :http_client
7
+
8
+ def initialize
9
+ @http_client = ::HTTPClient.new
10
+ end
11
+
12
+ def receive_webpage(url)
13
+ content = http_client.get_content(url)
14
+ Webpage.new(url, content)
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,28 @@
1
+ require 'thumbnail_scraper/with_smart_url'
2
+ require 'fastimage'
3
+
4
+ module ThumbnailScraper
5
+ class Image
6
+ include WithSmartUrl
7
+
8
+ def initialize(url)
9
+ self.url = url
10
+ end
11
+
12
+ def size
13
+ @size ||= ::FastImage.size(url.to_s)
14
+ end
15
+
16
+ def width
17
+ size[0]
18
+ end
19
+
20
+ def height
21
+ size[1]
22
+ end
23
+
24
+ def area
25
+ width * height
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,3 @@
1
+ module ThumbnailScraper
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,65 @@
1
+ require 'thumbnail_scraper/with_smart_url'
2
+ require 'nokogiri'
3
+
4
+ module ThumbnailScraper
5
+ class Webpage
6
+ attr_accessor :body
7
+
8
+ include WithSmartUrl
9
+
10
+ def initialize(url, body)
11
+ self.url = url
12
+ self.body = body
13
+ end
14
+
15
+ def document
16
+ Nokogiri::HTML::Document.parse(body)
17
+ end
18
+
19
+ def image_url(image_path)
20
+ if image_path.start_with?("http://") || image_path.start_with?("https://")
21
+ image_url = URI(image_path)
22
+ elsif image_path.start_with?("//")
23
+ image_url = URI(image_path)
24
+ image_url.scheme = url.scheme
25
+ else
26
+ image_url = URI(url.to_s)
27
+ if Pathname.new(image_path).absolute?
28
+ image_url.path = image_path
29
+ else
30
+ image_url.path = File.expand_path(File.join(File.dirname(url.path), image_path))
31
+ end
32
+ end
33
+ image_url
34
+ end
35
+
36
+ def open_graph_image_url
37
+ return @open_graph_image_url if defined?(@open_graph_image_url)
38
+ elements = document.xpath("//meta[@property='og:image']/@content")
39
+ return nil if elements.empty?
40
+ image_path = elements.first.value
41
+ @open_graph_image_url = image_url(image_path)
42
+ end
43
+
44
+ def has_open_graph_image?
45
+ !open_graph_image_url.nil?
46
+ end
47
+
48
+ def linked_image_url
49
+ return @linked_image_url if defined?(@linked_image_url)
50
+ elements = document.xpath("//link[@rel='img_src']/@href")
51
+ return nil if elements.empty?
52
+ image_path = elements.first.value
53
+ @linked_image_url = image_url(image_path)
54
+ end
55
+
56
+ def has_linked_image?
57
+ !linked_image_url.nil?
58
+ end
59
+
60
+ def attached_images_urls
61
+ elements = document.xpath("//img/@src")
62
+ elements.map{|element| image_url(element.value)}
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,17 @@
1
+ require 'uri'
2
+
3
+ module ThumbnailScraper
4
+ module WithSmartUrl
5
+ def url=(value)
6
+ if value.is_a?(URI)
7
+ @url = value
8
+ else
9
+ @url = URI.parse(value)
10
+ end
11
+ end
12
+
13
+ def url
14
+ @url
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,12 @@
1
+ <html>
2
+ <head>
3
+ </head>
4
+ <body>
5
+ <p><img src="/images/subcatalog/smallest.jpg" /></p>
6
+ <p>Sample</p>
7
+ <p><img src="/images/kitty.jpg" /></p>
8
+ <p>Sample</p>
9
+ <p><img src="images/biggest.jpg" /></p>
10
+ </body>
11
+ </html>
12
+
Binary file
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <link rel="img_src" href="images/kitty.jpg" />
4
+ </head>
5
+ <body>
6
+ <p>Sample</p>
7
+ </body>
8
+ </html>
9
+
@@ -0,0 +1,8 @@
1
+ <html>
2
+ <head>
3
+ <meta property="og:image" content="images/kitty.jpg" />
4
+ </head>
5
+ <body>
6
+ <p>Sample</p>
7
+ </body>
8
+ </html>
@@ -0,0 +1,11 @@
1
+ require 'webmock/rspec'
2
+
3
+ def asset_file_path(name)
4
+ File.join("spec", "sample_pages", name)
5
+ end
6
+
7
+ def asset_file(name)
8
+ path = asset_file_path(name)
9
+ File.new(path)
10
+ end
11
+
@@ -0,0 +1,52 @@
1
+ require 'thumbnail_scraper_helper'
2
+ require 'thumbnail_scraper'
3
+
4
+ describe ThumbnailScraper::ThumbnailScraper do
5
+ before :each do
6
+ @thumbnail_scraper = ThumbnailScraper::ThumbnailScraper.new
7
+ end
8
+
9
+ context "for page with og:image" do
10
+ before :each do
11
+ stub_request(:get, "www.example.com/og_image.html").to_return(:body => asset_file("og_image.html"))
12
+ stub_request(:get, "www.example.com/images/kitty.jpg").to_return(:body => asset_file(File.join("images", "kitty.jpg")))
13
+ end
14
+
15
+ describe "#image_to_thumbnail_for_url" do
16
+ it "should be found og:image" do
17
+ image_to_thumbnail = @thumbnail_scraper.image_to_thumbnail_for_url("http://www.example.com/og_image.html")
18
+ image_to_thumbnail.url.to_s.should == "http://www.example.com/images/kitty.jpg"
19
+ end
20
+ end
21
+ end
22
+
23
+ context "for page with link img_src" do
24
+ before :each do
25
+ stub_request(:get, "www.example.com/img_src.html").to_return(:body => asset_file("img_src.html"))
26
+ stub_request(:get, "www.example.com/images/kitty.jpg").to_return(:body => asset_file(File.join("images", "kitty.jpg")))
27
+ end
28
+
29
+ describe "#image_to_thumbnail_for_url" do
30
+ it "should be found link img_src" do
31
+ image_to_thumbnail = @thumbnail_scraper.image_to_thumbnail_for_url("http://www.example.com/img_src.html")
32
+ image_to_thumbnail.url.to_s.should == "http://www.example.com/images/kitty.jpg"
33
+ end
34
+ end
35
+ end
36
+
37
+ context "for page with images" do
38
+ before :each do
39
+ stub_request(:get, "www.example.com/images.html").to_return(:body => asset_file("images.html"))
40
+ stub_request(:get, "www.example.com/images/kitty.jpg").to_return(:body => asset_file(File.join("images", "kitty.jpg")))
41
+ stub_request(:get, "www.example.com/images/biggest.jpg").to_return(:body => asset_file(File.join("images", "biggest.jpg")))
42
+ stub_request(:get, "www.example.com/images/subcatalog/smallest.jpg").to_return(:body => asset_file(File.join("images", "subcatalog", "smallest.jpg")))
43
+ end
44
+
45
+ describe "#image_to_thumbnail_for_url" do
46
+ it "should be biggest image from page" do
47
+ image_to_thumbnail = @thumbnail_scraper.image_to_thumbnail_for_url("http://www.example.com/images.html")
48
+ image_to_thumbnail.url.to_s.should == "http://www.example.com/images/biggest.jpg"
49
+ end
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,24 @@
1
+ require 'thumbnail_scraper_helper'
2
+ require 'thumbnail_scraper/http_receiver'
3
+
4
+ module ThumbnailScraper
5
+ describe HttpReceiver do
6
+ before :each do
7
+ @http_receiver = HttpReceiver.new
8
+ end
9
+
10
+ describe "#receive_webpage" do
11
+ before :each do
12
+ stub_request(:get, "www.example.com/index.html").to_return(:body => "<p>I'm a webpage</p>")
13
+ end
14
+
15
+ it "should set Webpage#body" do
16
+ @http_receiver.receive_webpage("http://www.example.com/index.html").body == "<p>I'm a webpage</p>"
17
+ end
18
+
19
+ it "should set Webpage#uri" do
20
+ @http_receiver.receive_webpage("http://www.example.com/index.html").url.should == URI.parse("http://www.example.com/index.html")
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,182 @@
1
+ require 'thumbnail_scraper_helper'
2
+ require 'thumbnail_scraper'
3
+
4
+ module ThumbnailScraper
5
+ describe ThumbnailScraper do
6
+ before :each do
7
+ @http_receiver = mock("http receiver mock")
8
+ @thumbnail_scraper = ThumbnailScraper.new(@http_receiver)
9
+ end
10
+
11
+ describe "#image_to_thumbnail_for_url" do
12
+ context "for webpage with open graph image" do
13
+ before :each do
14
+ @page_url = "http://www.example.com/index.html"
15
+ @image_url = "http://www.example.com/images/image.png"
16
+ @webpage = mock("webpage")
17
+ @image = mock("image")
18
+ @webpage.stub!(:open_graph_image_url).and_return(@image_url)
19
+ @thumbnail_scraper.stub!(:create_image).with(@image_url).and_return(@image)
20
+ @webpage.stub!(:has_open_graph_image?).and_return(true)
21
+ @http_receiver.stub!(:receive_webpage).with(@page_url).and_return(@webpage)
22
+ end
23
+
24
+ it "should return image" do
25
+ @thumbnail_scraper.image_to_thumbnail_for_url(@page_url).should == @image
26
+ end
27
+ end
28
+
29
+ context "for webpage with link img_src" do
30
+ before :each do
31
+ @page_url = "http://www.example.com/index.html"
32
+ @image_url = "http://www.example.com/images/image.png"
33
+ @webpage = mock("webpage")
34
+ @image = mock("image")
35
+ @webpage.stub!(:has_open_graph_image?).and_return(false)
36
+ @webpage.stub!(:linked_image_url).and_return(@image_url)
37
+ @webpage.stub!(:has_linked_image?).and_return(true)
38
+ @thumbnail_scraper.stub!(:create_image).with(@image_url).and_return(@image)
39
+ @http_receiver.stub!(:receive_webpage).with(@page_url).and_return(@webpage)
40
+ end
41
+
42
+ it "should return image" do
43
+ @thumbnail_scraper.image_to_thumbnail_for_url(@page_url).should == @image
44
+ end
45
+ end
46
+
47
+ context "for webpage with only images" do
48
+ before :each do
49
+ @page_url = "http://www.example.com/index.html"
50
+ @webpage = mock("webpage")
51
+ @webpage.stub!(:has_open_graph_image?).and_return(false)
52
+ @webpage.stub!(:has_linked_image?).and_return(false)
53
+ @webpage.stub!(:attached_images_urls).and_return([:first, :second, :third])
54
+ @thumbnail_scraper.stub!(:create_image).and_return(:image)
55
+ @http_receiver.stub!(:receive_webpage).with(@page_url).and_return(@webpage)
56
+ @http_receiver.stub!(:receive_image).with(:image).and_return(:image_content)
57
+ @thumbnail_scraper.stub!(:select_best_possible_image_to_scrap).and_return(:image)
58
+ end
59
+
60
+ it "should use Website#attached_images_urls to get all images" do
61
+ @webpage.should_receive(:attached_images_urls).and_return([:first, :second, :third])
62
+ @thumbnail_scraper.image_to_thumbnail_for_url(@page_url)
63
+ end
64
+
65
+ it "should select_best_possible_image_to_scrap" do
66
+ @thumbnail_scraper.should_receive(:select_best_possible_image_to_scrap).with([:first, :second, :third]).and_return(:image)
67
+ @thumbnail_scraper.image_to_thumbnail_for_url(@page_url)
68
+ end
69
+ end
70
+ end
71
+
72
+ describe "#select_best_possible_image_to_scrap" do
73
+ before :each do
74
+ @image1 = mock("image")
75
+ @image1.stub!(:area).and_return(250)
76
+ @image2 = mock("image")
77
+ @image2.stub!(:area).and_return(1000)
78
+ @images_urls = [:first, :second]
79
+ @thumbnail_scraper.stub!(:create_image).with(:first).and_return(@image1)
80
+ @thumbnail_scraper.stub!(:create_image).with(:second).and_return(@image2)
81
+ @thumbnail_scraper.stub!(:select_valid_images).with([@image1, @image2]).and_return([@image1, @image2])
82
+ end
83
+
84
+ it "should create all images" do
85
+ @thumbnail_scraper.should_receive(:create_image).with(:first).and_return(@image1)
86
+ @thumbnail_scraper.should_receive(:create_image).with(:second).and_return(@image2)
87
+ @thumbnail_scraper.select_best_possible_image_to_scrap(@images_urls)
88
+ end
89
+
90
+ it "should filter out invalid images" do
91
+ @thumbnail_scraper.should_receive(:select_valid_images).with([@image1, @image2]).and_return([@image1, @image2])
92
+ @thumbnail_scraper.select_best_possible_image_to_scrap(@images_urls)
93
+ end
94
+
95
+ it "should be biggest (by area) image of valid ones" do
96
+ @thumbnail_scraper.select_best_possible_image_to_scrap(@images_urls).should == @image2
97
+ end
98
+
99
+ context "for all invalid images" do
100
+ it "should be nil" do
101
+ @thumbnail_scraper.should_receive(:select_valid_images).and_return([])
102
+ @thumbnail_scraper.select_best_possible_image_to_scrap(@images_urls).should be_nil
103
+ end
104
+ end
105
+ end
106
+
107
+ describe "#select_valid_images" do
108
+ before :each do
109
+ @image1 = mock("image")
110
+ @image2 = mock("image")
111
+ @thumbnail_scraper.stub!(:image_is_valid?).with(@image1).and_return(true)
112
+ @thumbnail_scraper.stub!(:image_is_valid?).with(@image2).and_return(false)
113
+ end
114
+
115
+ it "should use #image_is_valid? to validate image" do
116
+ @thumbnail_scraper.stub!(:image_is_valid?).with(@image1).and_return(true)
117
+ @thumbnail_scraper.stub!(:image_is_valid?).with(@image2).and_return(false)
118
+ @thumbnail_scraper.select_valid_images([@image1, @image2])
119
+ end
120
+
121
+ it "should select only valid images" do
122
+ @thumbnail_scraper.select_valid_images([@image1, @image2]).should == [@image1]
123
+ end
124
+ end
125
+
126
+ describe "#image_is_valid?" do
127
+ before :each do
128
+ @image = mock("image")
129
+ @image.stub!(:width).and_return(100)
130
+ @image.stub!(:height).and_return(100)
131
+ end
132
+
133
+ context "for small image width" do
134
+ before :each do
135
+ @image.stub!(:width).and_return(49)
136
+ end
137
+
138
+ it "should be false" do
139
+ @thumbnail_scraper.image_is_valid?(@image).should be_false
140
+ end
141
+ end
142
+
143
+ context "for small image height" do
144
+ before :each do
145
+ @image.stub!(:height).and_return(49)
146
+ end
147
+
148
+ it "should be false" do
149
+ @thumbnail_scraper.image_is_valid?(@image).should be_false
150
+ end
151
+ end
152
+
153
+ context "width to height ratio bigger than 3:1" do
154
+ before :each do
155
+ @image.stub!(:width).and_return(301)
156
+ @image.stub!(:height).and_return(100)
157
+ end
158
+
159
+ it "should be false" do
160
+ @thumbnail_scraper.image_is_valid?(@image).should be_false
161
+ end
162
+ end
163
+
164
+ context "height to width ratio bigger than 3:1" do
165
+ before :each do
166
+ @image.stub!(:width).and_return(100)
167
+ @image.stub!(:height).and_return(301)
168
+ end
169
+
170
+ it "should be false" do
171
+ @thumbnail_scraper.image_is_valid?(@image).should be_false
172
+ end
173
+ end
174
+
175
+ context "otherwise" do
176
+ it "should be true" do
177
+ @thumbnail_scraper.image_is_valid?(@image).should be_true
178
+ end
179
+ end
180
+ end
181
+ end
182
+ end
@@ -0,0 +1,177 @@
1
+ require 'thumbnail_scraper_helper'
2
+ require 'thumbnail_scraper/webpage'
3
+
4
+ module ThumbnailScraper
5
+ describe Webpage do
6
+ before :each do
7
+ @url = "http://www.example.com/site/site.html"
8
+ @webpage = Webpage.new(@url, "")
9
+ end
10
+
11
+ describe "#image_url" do
12
+ context "for relative path" do
13
+ it "should be with absolute path based on page url and image relative path" do
14
+ @webpage.image_url("images/kitty.jpg").path.should == "/site/images/kitty.jpg"
15
+ end
16
+
17
+ it "should have same host as page" do
18
+ @webpage.image_url("images/kitty.jpg").host.should == "www.example.com"
19
+ end
20
+ end
21
+
22
+ context "for absolute path" do
23
+ it "should be with absolute path based on image path" do
24
+ @webpage.image_url("/images/kitty.jpg").path.should == "/images/kitty.jpg"
25
+ end
26
+
27
+ it "should have same host as page" do
28
+ @webpage.image_url("/images/kitty.jpg").host.should == "www.example.com"
29
+ end
30
+ end
31
+
32
+ context "for full uri" do
33
+ it "should be with absolute path based on image path" do
34
+ @webpage.image_url("http://outersite.com/images/kitty.jpg").path.should == "/images/kitty.jpg"
35
+ end
36
+
37
+ it "should have same host as page" do
38
+ @webpage.image_url("http://outersite.com/images/kitty.jpg").host.should == "outersite.com"
39
+ end
40
+
41
+ context "for https address" do
42
+ it "should have image's host" do
43
+ @webpage.image_url("https://outersite.com/images/kitty.jpg").host.should == "outersite.com"
44
+ end
45
+ end
46
+
47
+ context "for shortcut address" do
48
+ it "should have image's host" do
49
+ @webpage.image_url("//outersite.com/images/kitty.jpg").host.should == "outersite.com"
50
+ end
51
+
52
+ it "should have same scheme as page" do
53
+ @webpage.image_url("//outersite.com/images/kitty.jpg").scheme.should == "http"
54
+ end
55
+ end
56
+ end
57
+ end
58
+
59
+ describe "#open_graph_image_url" do
60
+ context "for document with og:image" do
61
+ before :each do
62
+ @webpage.body = File.read(asset_file_path("og_image.html"))
63
+ @webpage.stub!(:image_url).and_return(:image_url)
64
+ end
65
+
66
+ it "should use #image_url to construct final image url" do
67
+ @webpage.should_receive(:image_url).with("images/kitty.jpg").and_return(:image_url)
68
+ @webpage.open_graph_image_url
69
+ end
70
+
71
+ it "should return url constructed by image_url" do
72
+ @webpage.open_graph_image_url.should == :image_url
73
+ end
74
+ end
75
+
76
+ context "for document without og:image" do
77
+ before :each do
78
+ @webpage.body = File.read(asset_file_path("img_src.html"))
79
+ end
80
+
81
+ it "should be nil" do
82
+ @webpage.open_graph_image_url.should be_nil
83
+ end
84
+ end
85
+ end
86
+
87
+ describe "#has_open_graph_image" do
88
+ context "for nil #open_graph_image_url" do
89
+ before :each do
90
+ @webpage.stub!(:open_graph_image_url).and_return(nil)
91
+ end
92
+
93
+ it "should be false" do
94
+ @webpage.has_open_graph_image?.should be_false
95
+ end
96
+ end
97
+
98
+ context "for not nil #open_graph_image_url" do
99
+ before :each do
100
+ @webpage.stub!(:open_graph_image_url).and_return(:image_url)
101
+ end
102
+
103
+ it "should be true" do
104
+ @webpage.has_open_graph_image?.should be_true
105
+ end
106
+ end
107
+ end
108
+
109
+ describe "#linked_image_url" do
110
+ context "for document with link img_src" do
111
+ before :each do
112
+ @webpage.body = File.read(asset_file_path("img_src.html"))
113
+ @webpage.stub!(:image_url).and_return(:image_url)
114
+ end
115
+
116
+ it "should use #image_url to construct final image url" do
117
+ @webpage.should_receive(:image_url).with("images/kitty.jpg").and_return(:image_url)
118
+ @webpage.linked_image_url
119
+ end
120
+
121
+ it "should return url constructed by image_url" do
122
+ @webpage.linked_image_url.should == :image_url
123
+ end
124
+ end
125
+
126
+ context "for document without link img_src" do
127
+ before :each do
128
+ @webpage.body = File.read(asset_file_path("og_image.html"))
129
+ end
130
+
131
+ it "should be nil" do
132
+ @webpage.linked_image_url.should be_nil
133
+ end
134
+ end
135
+ end
136
+
137
+ describe "#has_linked_image" do
138
+ context "for nil #linked_image_url" do
139
+ before :each do
140
+ @webpage.stub!(:linked_image_url).and_return(nil)
141
+ end
142
+
143
+ it "should be false" do
144
+ @webpage.has_linked_image?.should be_false
145
+ end
146
+ end
147
+
148
+ context "for not nil #linked_image_url" do
149
+ before :each do
150
+ @webpage.stub!(:linked_image_url).and_return(:image_url)
151
+ end
152
+
153
+ it "should be true" do
154
+ @webpage.has_linked_image?.should be_true
155
+ end
156
+ end
157
+ end
158
+
159
+ describe "#attached_images_urls" do
160
+ context "with images in body" do
161
+ before :each do
162
+ @webpage.body = File.read(asset_file_path("images.html"))
163
+ @webpage.stub!(:image_url).with("/images/subcatalog/smallest.jpg").and_return(:smallest_url)
164
+ @webpage.stub!(:image_url).with("/images/kitty.jpg").and_return(:kitty_url)
165
+ @webpage.stub!(:image_url).with("images/biggest.jpg").and_return(:biggest_url)
166
+ end
167
+
168
+ it "should return all urls of found images" do
169
+ urls = @webpage.attached_images_urls
170
+ urls.should include(:smallest_url)
171
+ urls.should include(:kitty_url)
172
+ urls.should include(:biggest_url)
173
+ end
174
+ end
175
+ end
176
+ end
177
+ end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/thumbnail_scraper/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.authors = ["Jeppe Liisberg", "Jan Filipowski"]
6
+ gem.email = ["jachuf@gmail.com"]
7
+ gem.description = %q{detect, fetch and generate a thumbnail for any url and store it on s3}
8
+ gem.summary = %q{detect, fetch and generate a thumbnail for any url and store it on s3}
9
+ gem.homepage = ""
10
+
11
+ gem.add_development_dependency "rspec"
12
+ gem.add_development_dependency "webmock"
13
+ gem.add_runtime_dependency "httpclient"
14
+ gem.add_runtime_dependency "fastimage"
15
+ gem.add_runtime_dependency "nokogiri"
16
+
17
+ gem.files = `git ls-files`.split($\)
18
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
19
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
20
+ gem.name = "thumbnail_scraper"
21
+ gem.require_paths = ["lib"]
22
+ gem.version = ThumbnailScraper::VERSION
23
+ end
metadata ADDED
@@ -0,0 +1,161 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: thumbnail_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jeppe Liisberg
9
+ - Jan Filipowski
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-07-30 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ requirement: !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ none: false
27
+ requirements:
28
+ - - ! '>='
29
+ - !ruby/object:Gem::Version
30
+ version: '0'
31
+ - !ruby/object:Gem::Dependency
32
+ name: webmock
33
+ requirement: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ type: :development
40
+ prerelease: false
41
+ version_requirements: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: httpclient
49
+ requirement: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ - !ruby/object:Gem::Dependency
64
+ name: fastimage
65
+ requirement: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ! '>='
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ type: :runtime
72
+ prerelease: false
73
+ version_requirements: !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ! '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ - !ruby/object:Gem::Dependency
80
+ name: nokogiri
81
+ requirement: !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ! '>='
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ type: :runtime
88
+ prerelease: false
89
+ version_requirements: !ruby/object:Gem::Requirement
90
+ none: false
91
+ requirements:
92
+ - - ! '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ description: detect, fetch and generate a thumbnail for any url and store it on s3
96
+ email:
97
+ - jachuf@gmail.com
98
+ executables: []
99
+ extensions: []
100
+ extra_rdoc_files: []
101
+ files:
102
+ - .gitignore
103
+ - Gemfile
104
+ - Gemfile.lock
105
+ - LICENSE
106
+ - README.md
107
+ - Rakefile
108
+ - lib/thumbnail_scraper.rb
109
+ - lib/thumbnail_scraper/http_receiver.rb
110
+ - lib/thumbnail_scraper/image.rb
111
+ - lib/thumbnail_scraper/version.rb
112
+ - lib/thumbnail_scraper/webpage.rb
113
+ - lib/thumbnail_scraper/with_smart_url.rb
114
+ - spec/sample_pages/images.html
115
+ - spec/sample_pages/images/biggest.jpg
116
+ - spec/sample_pages/images/kitty.jpg
117
+ - spec/sample_pages/images/subcatalog/smallest.jpg
118
+ - spec/sample_pages/img_src.html
119
+ - spec/sample_pages/og_image.html
120
+ - spec/thumbnail_scraper_helper.rb
121
+ - spec/thumbnail_scraper_spec.rb
122
+ - spec/units/http_receiver_spec.rb
123
+ - spec/units/thumbnail_scraper_spec.rb
124
+ - spec/units/webpage_spec.rb
125
+ - thumbnail_scraper.gemspec
126
+ homepage: ''
127
+ licenses: []
128
+ post_install_message:
129
+ rdoc_options: []
130
+ require_paths:
131
+ - lib
132
+ required_ruby_version: !ruby/object:Gem::Requirement
133
+ none: false
134
+ requirements:
135
+ - - ! '>='
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ none: false
140
+ requirements:
141
+ - - ! '>='
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ requirements: []
145
+ rubyforge_project:
146
+ rubygems_version: 1.8.24
147
+ signing_key:
148
+ specification_version: 3
149
+ summary: detect, fetch and generate a thumbnail for any url and store it on s3
150
+ test_files:
151
+ - spec/sample_pages/images.html
152
+ - spec/sample_pages/images/biggest.jpg
153
+ - spec/sample_pages/images/kitty.jpg
154
+ - spec/sample_pages/images/subcatalog/smallest.jpg
155
+ - spec/sample_pages/img_src.html
156
+ - spec/sample_pages/og_image.html
157
+ - spec/thumbnail_scraper_helper.rb
158
+ - spec/thumbnail_scraper_spec.rb
159
+ - spec/units/http_receiver_spec.rb
160
+ - spec/units/thumbnail_scraper_spec.rb
161
+ - spec/units/webpage_spec.rb