RubyGems - image_scraper - Versions diffs - 0.1.6 → 0.1.7 - Mend

image_scraper 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

data/VERSION CHANGED

@@ -1 +1 @@
-0.1.6
+0.1.7

data/image_scraper.gemspec CHANGED

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{image_scraper}
-  s.version = "0.1.6"
+  s.version = "0.1.7"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["John McAliley"]
-  s.date = %q{2011-12-22}
+  s.date = %q{2012-01-02}
   s.description = %q{Simple utility to pull image urls from web page}
   s.email = %q{john.mcaliley@gmail.com}
   s.extra_rdoc_files = [

data/lib/image_scraper/client.rb CHANGED

@@ -1,7 +1,7 @@
 module ImageScraper
   class Client
     attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
     def initialize(url,options={})
       options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
       @url = URI.escape(url)
@@ -11,31 +11,31 @@ module ImageScraper
       html = open(@url).read rescue nil
       @doc = html ? Nokogiri::HTML(html) : nil
     end
     def image_urls
       images = page_images
       images += stylesheet_images if include_css_images
       images
     end
     def page_images
       urls = []
       return urls if doc.blank?
       doc.xpath("//img").each do |img|
         next if img["src"].blank?
-        image = URI.escape(img["src"])
+        image = URI.escape(img["src"].strip)
         image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
         urls << image
       end
       urls
     end
     def stylesheet_images
       images = []
       stylesheets.each do |stylesheet|
         file = open(stylesheet)
         css = file.string rescue IO.read(file)
         images += css.scan(/url\((.*?)\)/).collect do |image_url|
           image_url = URI.escape image_url[0]
           if image_url.include?("data:image") and @include_css_data_images
@@ -48,7 +48,7 @@ module ImageScraper
       end
       images
     end
     def stylesheets
       return [] if doc.blank?
       doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|

data/test/test_image_scraper.rb CHANGED

@@ -14,6 +14,14 @@ class TestImageScraper < Test::Unit::TestCase
     assert_equal images, scraper.image_urls
   end
+  should "return a list of images with whitespace stripped from the src" do
+    client = ImageScraper::Client.new("http://www.google.com")
+    html = IO.read(File.dirname(__FILE__)+"/resources/extra_whitespace.html")
+    client.doc = Nokogiri::HTML(html)
+    images = ["http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg","http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg"]
+    assert_equal images, client.image_urls
+  end
   should "return list of all image urls on a web page with relative paths" do
     images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
      "//bits.wikimedia.org/images/wikimedia-button.png",
@@ -27,7 +35,7 @@ class TestImageScraper < Test::Unit::TestCase
     domain = "http://test.com"
     assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
   end
   should "return proper absolute url for a page and asset" do
     assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
     assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
@@ -39,30 +47,30 @@ class TestImageScraper < Test::Unit::TestCase
     assert_equal "http://www.test.com/", ImageScraper::Util.absolute_url("http://www.test.com/")
     assert_equal "http://www.test.com/123/test.html", ImageScraper::Util.absolute_url("http://www.test.com/123/test.html")
   end
   should "return images from a stylesheet" do
     scraper = ImageScraper::Client.new("http://couponshack.com")
     assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
   end
   should "strip quotes from a url" do
     assert_equal "/images/test.png", ImageScraper::Util.strip_quotes("'/images/test.png'")
     assert_equal "http://www.somsite.com/images/test.png", ImageScraper::Util.strip_quotes("'http://www.somsite.com/images/test.png'")
     assert_equal "/images/test.png", ImageScraper::Util.strip_quotes('"/images/test.png"')
   end
   should "return domain section from a url" do
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this.html")
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this/")
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
     assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
   end
   should "return nil for doc if URL is invalid" do
     scraper = ImageScraper::Client.new("couponshack.com")
     assert scraper.doc.nil?
   end
   should "return empty arrays if URL is invalid" do
     scraper = ImageScraper::Client.new("couponshack.com")
     assert_equal [], scraper.image_urls
@@ -78,27 +86,27 @@ class TestImageScraper < Test::Unit::TestCase
     scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
     assert_equal images, scraper.image_urls
   end
   should "Handle a page image with an unescaped url" do
     scraper = ImageScraper::Client.new ''
     scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
     assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
-  end
+  end
   should "Handle a stylesheet with an unescaped url" do
     scraper = ImageScraper::Client.new ''
     scraper.url = 'http://test.com'
     scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
     assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
-  end
+  end
   should "Handle a stylesheet image with an unescaped url" do
     scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
     assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
-  end
+  end
   should "Handle a stylesheet image with a relative url" do
     scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
     assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
-  end
+  end
 end

metadata CHANGED

@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 6
+  - 7
   segments_generated: true
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - John McAliley
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-12-22 00:00:00 -05:00
+date: 2012-01-02 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: -3072759905091488701
+      hash: -168406416917257246
       segments:
       - 0
       segments_generated: true