RubyGems - image_scraper - Versions diffs - 0.1.6 → 0.1.7 - Mend

image_scraper 0.1.6 → 0.1.7

Files changed (5)

data/VERSION CHANGED

@@ -1 +1 @@
-0.1.6
+0.1.7

data/image_scraper.gemspec CHANGED

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{image_scraper}
-  s.version = "0.1.6"
+  s.version = "0.1.7"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["John McAliley"]
-  s.date = %q{2011-12-22}
+  s.date = %q{2012-01-02}
   s.description = %q{Simple utility to pull image urls from web page}
   s.email = %q{john.mcaliley@gmail.com}
   s.extra_rdoc_files = [

data/lib/image_scraper/client.rb CHANGED

@@ -1,7 +1,7 @@
 module ImageScraper
   class Client
     attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
     def initialize(url,options={})
       options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
       @url = URI.escape(url)
@@ -11,31 +11,31 @@ module ImageScraper
       html = open(@url).read rescue nil
       @doc = html ? Nokogiri::HTML(html) : nil
     end
     def image_urls
       images = page_images
       images += stylesheet_images if include_css_images
       images
     end
     def page_images
       urls = []
       return urls if doc.blank?
       doc.xpath("//img").each do |img|
         next if img["src"].blank?
-        image = URI.escape(img["src"])
+        image = URI.escape(img["src"].strip)
         image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
         urls << image
       end
       urls
     end
     def stylesheet_images
       images = []
       stylesheets.each do |stylesheet|
         file = open(stylesheet)
         css = file.string rescue IO.read(file)
         images += css.scan(/url\((.*?)\)/).collect do |image_url|
           image_url = URI.escape image_url[0]
           if image_url.include?("data:image") and @include_css_data_images
@@ -48,7 +48,7 @@ module ImageScraper
       end
       images
     end
     def stylesheets
       return [] if doc.blank?
       doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|

data/test/test_image_scraper.rb CHANGED

@@ -14,6 +14,14 @@ class TestImageScraper < Test::Unit::TestCase
     assert_equal images, scraper.image_urls
   end
+  should "return a list of images with whitespace stripped from the src" do
+    client = ImageScraper::Client.new("http://www.google.com")
+    html = IO.read(File.dirname(__FILE__)+"/resources/extra_whitespace.html")
+    client.doc = Nokogiri::HTML(html)
+    images = ["http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg","http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg"]
+    assert_equal images, client.image_urls
+  end
   should "return list of all image urls on a web page with relative paths" do
     images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
      "//bits.wikimedia.org/images/wikimedia-button.png",
@@ -27,7 +35,7 @@ class TestImageScraper < Test::Unit::TestCase
     domain = "http://test.com"
     assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
   end
   should "return proper absolute url for a page and asset" do
     assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
     assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
@@ -39,30 +47,30 @@ class TestImageScraper < Test::Unit::TestCase
     assert_equal "http://www.test.com/", ImageScraper::Util.absolute_url("http://www.test.com/")
     assert_equal "http://www.test.com/123/test.html", ImageScraper::Util.absolute_url("http://www.test.com/123/test.html")
   end
   should "return images from a stylesheet" do
     scraper = ImageScraper::Client.new("http://couponshack.com")
     assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
   end
   should "strip quotes from a url" do
     assert_equal "/images/test.png", ImageScraper::Util.strip_quotes("'/images/test.png'")
     assert_equal "http://www.somsite.com/images/test.png", ImageScraper::Util.strip_quotes("'http://www.somsite.com/images/test.png'")
     assert_equal "/images/test.png", ImageScraper::Util.strip_quotes('"/images/test.png"')
   end
   should "return domain section from a url" do
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this.html")
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this/")
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
     assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
   end
   should "return nil for doc if URL is invalid" do
     scraper = ImageScraper::Client.new("couponshack.com")
     assert scraper.doc.nil?
   end
   should "return empty arrays if URL is invalid" do
     scraper = ImageScraper::Client.new("couponshack.com")
     assert_equal [], scraper.image_urls
@@ -78,27 +86,27 @@ class TestImageScraper < Test::Unit::TestCase
     scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
     assert_equal images, scraper.image_urls
   end
   should "Handle a page image with an unescaped url" do
     scraper = ImageScraper::Client.new ''
     scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
     assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
-  end
+  end
   should "Handle a stylesheet with an unescaped url" do
     scraper = ImageScraper::Client.new ''
     scraper.url = 'http://test.com'
     scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
     assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
-  end
+  end
   should "Handle a stylesheet image with an unescaped url" do
     scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
     assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
-  end
+  end
   should "Handle a stylesheet image with a relative url" do
     scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
     assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
-  end
+  end
 end

metadata CHANGED

@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 6
+  - 7
   segments_generated: true
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - John McAliley
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-12-22 00:00:00 -05:00
+date: 2012-01-02 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: -3072759905091488701
+      hash: -168406416917257246
       segments:
       - 0
       segments_generated: true