rcarvalho-image_scraper 0.1.8.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 73a8c72ad130d838e31a907af63dbbbf4fea23a2
4
+ data.tar.gz: 875cfa02ec74260cd446e59e00dd50e9100796fc
5
+ SHA512:
6
+ metadata.gz: 0d0f8de281e7a86b6d8ce0353c2feff71dc37ac50ee6735fa2a1cfd045dce4037b2470d6103ec720c97089cb1b8d6927358cee122fc47e14f72b6c1b5b457980
7
+ data.tar.gz: 84e2767fb1d89f94e5ef65f816b2bd93d249b3fca72e0cf641a246a272fcbe6b913beac1265603e4332afab5610f43db81758063b619233ce3023d8680c75764
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
# Fetch gems over TLS rather than plain HTTP so downloads cannot be
# tampered with in transit.
source "https://rubygems.org"

# Dependencies required to use the gem.
# Example:
#   gem "activesupport", ">= 2.3.5"
gem "nokogiri"
gem "css_parser"
gem "rails"

# Dependencies needed only to develop the gem.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "bundler", "~> 1.2"
  gem "jeweler", "~> 1.5"
  #gem "rcov", ">= 0"
end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 John McAliley
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,51 @@
1
+ # image_scraper
2
+
3
+ [![Build Status](https://travis-ci.org/charlotte-ruby/image_scraper.png?branch=master)](http://travis-ci.org/charlotte-ruby/image_scraper)
4
+
5
+ Simple utility that pulls image URLs from a web page
6
+
7
+ ## INSTALL
8
+
9
+ Add to your Gemfile
10
+
11
+ gem "image_scraper"
12
+
13
+ Install w/ Bundler
14
+
15
+ bundle install
16
+
17
+ ## USAGE
18
+
19
+ Initialize the image scraper client
20
+
21
+ image_scraper = ImageScraper::Client.new("http://www.rubygems.org")
22
+
23
+ You can also pass an options hash to the client when you initialize it:
24
+
25
+ image_scraper = ImageScraper::Client.new("http://www.rubygems.org", options)
26
+ # OPTIONS - Each option defaults to true unless noted otherwise
27
+ # :convert_to_absolute_url - If there are relative image URLs, it will convert them to absolute URLs.
28
+ # :include_css_images - If there are stylesheets on the page, it will pull images out of the stylesheet. For example: background: url(/images/some-image.png).
29
+ # :include_css_data_images - Will include data images from CSS (defaults to false). For example: data:image/gif;base64,R0lGODlhEAAOALMAAOazToeH............
30
+
31
+ Get the images from the url specified when you initialized the client:
32
+
33
+ image_scraper.image_urls
34
+
35
+ This will return an array of strings.
36
+
37
+ ## Contributing to image_scraper
38
+
39
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
40
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
41
+ * Fork the project
42
+ * Start a feature/bugfix branch
43
+ * Commit and push until you are happy with your contribution
44
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
45
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
46
+
47
+ ## Copyright
48
+
49
+ Copyright (c) 2011 John McAliley. See LICENSE.txt for
50
+ further details.
51
+
data/Rakefile ADDED
@@ -0,0 +1,52 @@
1
require 'rubygems'
require 'bundler'

# Abort early, with a hint, when the bundle is incomplete.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "image_scraper"
  gem.homepage = "http://github.com/charlotte-ruby/image_scraper"
  gem.license = "MIT"
  gem.summary = %Q{Simple utility to pull image urls from web page}
  gem.description = %Q{Simple utility to pull image urls from web page}
  gem.email = "john.mcaliley@gmail.com"
  gem.authors = ["John McAliley"]
  # Dependencies are declared once, in the Gemfile. Declaring nokogiri and
  # css_parser here as well made them appear twice in the generated gemspec.
  # NOTE(review): relies on Jeweler copying Gemfile dependencies into the
  # gemspec -- confirm for the Jeweler version in use.
  gem.files.exclude "test/**/*"
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

#require 'rcov/rcovtask'
#Rcov::RcovTask.new do |test|
#  test.libs << 'test'
#  test.pattern = 'test/**/test_*.rb'
#  test.verbose = true
#end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline so it does not end up inside the RDoc title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "image_scraper #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.8
@@ -0,0 +1,78 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
Gem::Specification.new do |s|
  s.name = "rcarvalho-image_scraper"
  s.version = "0.1.8.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["John McAliley"]
  s.date = "2013-10-23"
  s.description = "Simple utility to pull image urls from web page"
  s.email = "john.mcaliley@gmail.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]
  s.files = [
    ".document",
    "Gemfile",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "image_scraper.gemspec",
    "lib/image_scraper.rb",
    "lib/image_scraper/client.rb",
    "lib/image_scraper/railtie.rb",
    "lib/image_scraper/util.rb"
  ]
  s.homepage = "http://github.com/charlotte-ruby/image_scraper"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.24"
  s.summary = "Simple utility to pull image urls from web page"
  s.test_files = [
    "test/helper.rb",
    "test/test_image_scraper.rb"
  ]

  # Each dependency is listed exactly once. nokogiri and css_parser used to
  # be added twice (they are declared in both the Gemfile and the Rakefile),
  # and RubyGems rejects a specification with duplicate dependencies.
  runtime_dependencies = [
    ["nokogiri", ">= 0"],
    ["css_parser", ">= 0"],
    ["rails", ">= 0"]
  ]
  development_dependencies = [
    ["shoulda", ">= 0"],
    ["bundler", "~> 1.0.0"],
    ["jeweler", "~> 1.5.2"],
    ["rcov", ">= 0"]
  ]

  versioned_spec = s.respond_to?(:specification_version)
  s.specification_version = 3 if versioned_spec

  # RubyGems older than 1.2.0 cannot tell runtime and development
  # dependencies apart, so everything is registered with add_dependency there.
  if versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    runtime_dependencies.each do |name, requirement|
      s.add_runtime_dependency(name, [requirement])
    end
    development_dependencies.each do |name, requirement|
      s.add_development_dependency(name, [requirement])
    end
  else
    (runtime_dependencies + development_dependencies).each do |name, requirement|
      s.add_dependency(name, [requirement])
    end
  end
end
78
+
@@ -0,0 +1,62 @@
1
+ require 'cgi'
2
module ImageScraper
  # Downloads a web page and lists the image URLs it references: the src of
  # every <img> tag and, optionally, the url(...) references found in the
  # stylesheets the page links to.
  class Client
    # Values used for any option the caller leaves out.
    DEFAULT_OPTIONS = {
      :convert_to_absolute_url => true,
      :include_css_images => true,
      :include_css_data_images => false
    }.freeze

    # URI.escape was removed in Ruby 3.0. The RFC 2396 parser still offers the
    # same escaping; on older Rubies that parser is URI::DEFAULT_PARSER.
    URI_ESCAPER = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

    # Characters the URI escaper leaves alone but that still get escaped.
    EXTRA_UNSAFE = /([{}|\^\[\]\@`])/

    attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc

    # url     - address of the page to scrape; it is stored escaped.
    # options - Hash overriding DEFAULT_OPTIONS. The caller's hash is not
    #           modified.
    #
    # The page is fetched immediately. When it cannot be fetched, doc is nil
    # and every query method returns an empty array.
    def initialize(url, options = {})
      settings = DEFAULT_OPTIONS.merge(options)
      @url = uri_escape(url)
      @convert_to_absolute_url = settings[:convert_to_absolute_url]
      @include_css_images = settings[:include_css_images]
      @include_css_data_images = settings[:include_css_data_images]
      html = fetch(@url)
      @doc = html ? Nokogiri::HTML(html) : nil
    end

    # Returns an Array of image URL strings: page images followed by
    # stylesheet images when include_css_images is set.
    def image_urls
      images = page_images
      images += stylesheet_images if include_css_images
      images
    end

    # Returns the escaped src of every <img> that has a non-blank src, made
    # absolute against url when convert_to_absolute_url is set.
    def page_images
      urls = []
      return urls if doc.blank?
      doc.xpath("//img").each do |img|
        next if img["src"].blank?
        image = escape_image_url(img["src"].strip)
        image = ImageScraper::Util.absolute_url(url, image) if convert_to_absolute_url
        urls << image
      end
      urls
    end

    # Returns the url(...) references found in every linked stylesheet that
    # can be downloaded. data: images are returned only when
    # include_css_data_images is set; other references are made absolute
    # against their stylesheet when convert_to_absolute_url is set.
    def stylesheet_images
      images = []
      stylesheets.each do |stylesheet|
        css = fetch(stylesheet)
        next if css.nil? # stylesheet could not be downloaded

        css.scan(/url\((.*?)\)/).each do |match|
          image_url = ImageScraper::Util.strip_quotes(escape_image_url(match[0].strip))
          if image_url.include?("data:image")
            # Previously these fell through to the regular branch and were
            # returned even with the option switched off.
            images << image_url if include_css_data_images
          elsif convert_to_absolute_url
            images << ImageScraper::Util.absolute_url(stylesheet, image_url)
          else
            images << image_url
          end
        end
      end
      images
    end

    # Returns the absolute URL of every <link rel="stylesheet"> on the page.
    # Links without an href are skipped.
    def stylesheets
      return [] if doc.blank?
      hrefs = doc.xpath('//link[@rel="stylesheet"]').map { |link| link['href'] }
      hrefs.reject { |href| href.blank? }.map do |href|
        ImageScraper::Util.absolute_url(url, uri_escape(href))
      end
    end

    private

    # Escapes str the way URI.escape used to.
    def uri_escape(str)
      URI_ESCAPER.escape(str)
    end

    # Escapes an image reference, including the characters the URI escaper
    # does not cover.
    def escape_image_url(str)
      uri_escape(str).gsub(EXTRA_UNSAFE) { |s| CGI.escape(s) }
    end

    # Returns the body at address as a String, or nil when it cannot be read.
    # Only URIs that open-uri can open are fetched. Anything else (empty
    # string, local path, "|command") yields nil instead of being handed to
    # Kernel#open.
    def fetch(address)
      uri = URI.parse(address)
      return nil unless uri.respond_to?(:open)
      uri.open { |io| io.read }
    rescue StandardError
      nil
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require 'rails'
2
+ require 'image_scraper'
3
+
4
+ module ImageScraper
5
+ class Railtie < Rails::Railtie
6
+ end
7
+ end
@@ -0,0 +1,32 @@
1
module ImageScraper
  # Stateless URL helpers used by the scraper.
  module Util
    # Resolves asset against url and returns the result as a String. With no
    # asset, url itself is returned in normalized form.
    #
    # Raises whatever URI.parse raises when either argument is not a valid
    # URI.
    def self.absolute_url(url, asset = nil)
      # TODO - what happens when an index redirect occurs?
      # Example: 'http://example.com/about' specified as url
      #          'style.css' specified as asset
      #          url redirects to 'http://example.com/about/'
      #          and serves http://example.com/about/index.html
      #          which then links to the relative asset path 'style.css'
      #          based on original url (http://example.com/about),
      #          self.absolute_url gives
      #            'http://example.com/style.css'
      #          but should get:
      #            'http://example.com/about/style.css'
      URI.parse(url).merge(URI.parse(asset.to_s)).to_s
    end

    # Returns "scheme://host" for url; any port, path or query is dropped.
    def self.domain(url)
      uri = URI.parse(url)
      "#{uri.scheme}://#{uri.host}"
    end

    # Returns the path component of url.
    def self.path(url)
      URI.parse(url).path
    end

    # Removes quoting from a CSS url(...) value: one escaped double quote
    # (%22) at the very start and at the very end, plus every literal single
    # or double quote.
    #
    # The anchors are \A and \z so that only the ends of the whole string
    # match; ^ and $ would also match at line breaks inside it.
    def self.strip_quotes(image_url)
      image_url.sub(/\A%22/, '').sub(/%22\z/, '').gsub("'", "").gsub('"', '')
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require 'rails'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+
5
+ require 'image_scraper/railtie'
6
+ require 'image_scraper/util'
7
+ require 'image_scraper/client'
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'image_scraper'
16
+
17
+ class Test::Unit::TestCase
18
+ end
@@ -0,0 +1,140 @@
1
+ require 'pp'
2
+ require 'helper'
3
+
4
+
5
+ #TODO: these tests will not work forever. Try to test against a static web page instead of external URLs
6
+ # Consider using https://raw.github.com/charlotte-ruby/image_scraper urls
7
+
8
+ class TestImageScraper < Test::Unit::TestCase
9
+ should "parse urls even with escaped (%22) double quotes in them" do
10
+ scraper = ImageScraper::Client.new "http://newscorp.com/careers/"
11
+ end
12
+
13
+ should "return list of all image urls on a web page with absolute paths" do
14
+ images = ["http://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/SIPI_Jelly_Beans_4.1.07.tiff/lossy-page1-220px-SIPI_Jelly_Beans_4.1.07.tiff.jpg",
15
+ "http://bits.wikimedia.org/static-1.21wmf9/skins/common/images/magnify-clip.png",
16
+ "http://bits.wikimedia.org/static-1.21wmf9/skins/vector/images/search-ltr.png?303-4",
17
+ "http://bits.wikimedia.org/images/wikimedia-button.png",
18
+ "http://bits.wikimedia.org/static-1.21wmf9/skins/common/images/poweredby_mediawiki_88x31.png"]
19
+ scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
20
+
21
+ assert_equal images.size, scraper.image_urls.size
22
+
23
+ assert_equal images, scraper.image_urls
24
+ end
25
+
26
+ should "return a list of images with whitespace stripped from the src" do
27
+ client = ImageScraper::Client.new("http://www.google.com")
28
+ html = IO.read(File.dirname(__FILE__)+"/resources/extra_whitespace.html")
29
+ client.doc = Nokogiri::HTML(html)
30
+ images = ["http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg","http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg"]
31
+
32
+ assert_equal images, client.image_urls
33
+ end
34
+
35
+ should "return list of all image urls on a web page with relative paths" do
36
+ images = ["//upload.wikimedia.org/wikipedia/commons/thumb/b/b6/SIPI_Jelly_Beans_4.1.07.tiff/lossy-page1-220px-SIPI_Jelly_Beans_4.1.07.tiff.jpg",
37
+ "//bits.wikimedia.org/static-1.21wmf9/skins/common/images/magnify-clip.png",
38
+ "//bits.wikimedia.org/static-1.21wmf9/skins/vector/images/search-ltr.png?303-4",
39
+ "//bits.wikimedia.org/images/wikimedia-button.png",
40
+ "//bits.wikimedia.org/static-1.21wmf9/skins/common/images/poweredby_mediawiki_88x31.png"]
41
+ scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:convert_to_absolute_url=>false,:include_css_images=>false)
42
+
43
+ assert_equal images.size, scraper.image_urls.size
44
+ assert_equal images, scraper.image_urls
45
+ end
46
+
47
+ should "return list of stylesheets contained in html page (relative path)" do
48
+ scraper = ImageScraper::Client.new ""
49
+ scraper.doc = Nokogiri::HTML(IO.read(File.dirname(__FILE__)+"/resources/stylesheet_test.html"))
50
+ scraper.url = "http://test.com"
51
+
52
+ assert_equal ["http://test.com/css/master.css", "http://test.com/css/master2.css"], scraper.stylesheets
53
+ end
54
+
55
+ should "return proper absolute url for a page and asset" do
56
+ assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
57
+ assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
58
+ assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","/images/image.gif")
59
+ assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","image.gif")
60
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","/images/image.gif")
61
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","images/image.gif")
62
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","/images/image.gif")
63
+ assert_equal "http://www.test.com/", ImageScraper::Util.absolute_url("http://www.test.com/")
64
+ assert_equal "http://www.test.com/123/test.html", ImageScraper::Util.absolute_url("http://www.test.com/123/test.html")
65
+ end
66
+
67
+ should "return images from a stylesheet" do
68
+ scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
69
+ assert scraper.stylesheet_images.include? 'https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'
70
+ end
71
+
72
+ should "strip quotes from a url" do
73
+ assert_equal "/images/test.png", ImageScraper::Util.strip_quotes("'/images/test.png'")
74
+ assert_equal "http://www.somsite.com/images/test.png", ImageScraper::Util.strip_quotes("'http://www.somsite.com/images/test.png'")
75
+ assert_equal "/images/test.png", ImageScraper::Util.strip_quotes('"/images/test.png"')
76
+ end
77
+
78
+ should "return domain section from a url" do
79
+ assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this.html")
80
+ assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this/")
81
+ assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
82
+ assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
83
+ end
84
+
85
+ should "return nil for doc if URL is invalid" do
86
+ scraper = ImageScraper::Client.new("couponshack.com")
87
+ assert scraper.doc.nil?
88
+ end
89
+
90
+ should "return empty arrays if URL is invalid" do
91
+ scraper = ImageScraper::Client.new("couponshack.com")
92
+ assert_equal [], scraper.image_urls
93
+ assert_equal [], scraper.stylesheets
94
+ assert_equal [], scraper.stylesheet_images
95
+ assert_equal [], scraper.page_images
96
+ end
97
+
98
+ should "Handle a URL with unescaped spaces" do
99
+ images = ["https://raw.github.com/syoder/image_scraper/stylesheet_fix/test/resources/image1.png"]
100
+ scraper = ImageScraper::Client.new 'https://raw.github.com/syoder/image_scraper/stylesheet_fix/test/resources/space in url.html', :include_css_images => false
101
+ assert_equal images, scraper.image_urls
102
+ end
103
+
104
+ should "Handle a page image with an unescaped url" do
105
+ scraper = ImageScraper::Client.new ''
106
+ scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
107
+ assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
108
+ end
109
+
110
+ should "Handle a stylesheet with an unescaped url" do
111
+ scraper = ImageScraper::Client.new ''
112
+ scraper.url = 'http://test.com'
113
+ scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
114
+ assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
115
+ end
116
+
117
+ should "Handle a stylesheet image with an unescaped url" do
118
+ scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
119
+ assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
120
+ end
121
+
122
+ should "Handle a stylesheet image with a relative url" do
123
+ scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
124
+ assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
125
+ end
126
+
127
+ should "Handle cases where a stylesheet returns a 404" do
128
+ scraper = ImageScraper::Client.new ''
129
+ scraper.url = 'http://google.com'
130
+ scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://google.com/does_not_exist.css'>")
131
+ assert_equal [], scraper.stylesheet_images
132
+ end
133
+
134
+ should "not crash when it encounters image URLs that include square brackets" do
135
+ scraper = ImageScraper::Client.new ''
136
+ scraper.url = 'http://google.com'
137
+ scraper.doc = Nokogiri::HTML("<img src='image[1].jpg' >")
138
+ assert_equal ["http://google.com/image%5B1%5D.jpg"], scraper.page_images
139
+ end
140
+ end
metadata ADDED
@@ -0,0 +1,186 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rcarvalho-image_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.8.1
5
+ platform: ruby
6
+ authors:
7
+ - John McAliley
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-10-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: css_parser
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rails
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: shoulda
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: 1.0.0
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: 1.0.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: jeweler
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 1.5.2
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 1.5.2
97
+ - !ruby/object:Gem::Dependency
98
+ name: rcov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: nokogiri
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: css_parser
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '>='
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ description: Simple utility to pull image urls from web page
140
+ email: john.mcaliley@gmail.com
141
+ executables: []
142
+ extensions: []
143
+ extra_rdoc_files:
144
+ - LICENSE.txt
145
+ - README.md
146
+ files:
147
+ - .document
148
+ - Gemfile
149
+ - LICENSE.txt
150
+ - README.md
151
+ - Rakefile
152
+ - VERSION
153
+ - image_scraper.gemspec
154
+ - lib/image_scraper.rb
155
+ - lib/image_scraper/client.rb
156
+ - lib/image_scraper/railtie.rb
157
+ - lib/image_scraper/util.rb
158
+ - test/helper.rb
159
+ - test/test_image_scraper.rb
160
+ homepage: http://github.com/charlotte-ruby/image_scraper
161
+ licenses:
162
+ - MIT
163
+ metadata: {}
164
+ post_install_message:
165
+ rdoc_options: []
166
+ require_paths:
167
+ - lib
168
+ required_ruby_version: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - '>='
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ required_rubygems_version: !ruby/object:Gem::Requirement
174
+ requirements:
175
+ - - '>='
176
+ - !ruby/object:Gem::Version
177
+ version: '0'
178
+ requirements: []
179
+ rubyforge_project:
180
+ rubygems_version: 2.0.4
181
+ signing_key:
182
+ specification_version: 3
183
+ summary: Simple utility to pull image urls from web page
184
+ test_files:
185
+ - test/helper.rb
186
+ - test/test_image_scraper.rb