image_scraper 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.5
1
+ 0.1.6
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{image_scraper}
8
- s.version = "0.1.5"
8
+ s.version = "0.1.6"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John McAliley"]
12
- s.date = %q{2011-11-30}
12
+ s.date = %q{2011-12-22}
13
13
  s.description = %q{Simple utility to pull image urls from web page}
14
14
  s.email = %q{john.mcaliley@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -22,6 +22,7 @@ module ImageScraper
22
22
  urls = []
23
23
  return urls if doc.blank?
24
24
  doc.xpath("//img").each do |img|
25
+ next if img["src"].blank?
25
26
  image = URI.escape(img["src"])
26
27
  image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
27
28
  urls << image
@@ -41,7 +42,7 @@ module ImageScraper
41
42
  image_url
42
43
  else
43
44
  image_url = ImageScraper::Util.strip_quotes(image_url)
44
- @convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url) : image_url
45
+ @convert_to_absolute_url ? ImageScraper::Util.absolute_url(stylesheet, image_url) : image_url
45
46
  end
46
47
  end
47
48
  end
@@ -51,7 +52,7 @@ module ImageScraper
51
52
  def stylesheets
52
53
  return [] if doc.blank?
53
54
  doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
54
- URI.escape ImageScraper::Util.absolute_url(url,stylesheet['href'])
55
+ ImageScraper::Util.absolute_url url, URI.escape(stylesheet['href'])
55
56
  end
56
57
  end
57
58
  end
@@ -1,10 +1,18 @@
1
1
  module ImageScraper
2
2
  module Util
3
3
  def self.absolute_url(url,asset=nil)
4
- return url if asset.nil?
5
- return asset if asset.include?("://")
6
- return domain(url)+asset if asset[0]=="/"
7
- return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
4
+ # TODO - what happens when an index redirect occurs?
5
+ # Example: 'http://example.com/about' specified as url
6
+ # 'style.css' specified as asset
7
+ # url redirects to 'http://example.com/about/'
8
+ # and serves http://example.com/about/index.html
9
+ # which then links to the relative asset path 'style.css'
10
+ # based on original url (http://example.com/about),
11
+ # self.absolute_url gives
12
+ # 'http://example.com/style.css
13
+ # but should get:
14
+ # 'http://example.com/about/style.css
15
+ URI.parse(url).merge(URI.parse asset.to_s).to_s
8
16
  end
9
17
 
10
18
  def self.domain(url)
@@ -21,4 +29,4 @@ module ImageScraper
21
29
  image_url.gsub("'","").gsub('"','')
22
30
  end
23
31
  end
24
- end
32
+ end
@@ -7,9 +7,9 @@ require 'helper'
7
7
 
8
8
  class TestImageScraper < Test::Unit::TestCase
9
9
  should "return list of all image urls on a web page with absolute paths" do
10
- images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
11
- "http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
12
- "http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
10
+ images = ["http://bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
11
+ "http://bits.wikimedia.org/images/wikimedia-button.png",
12
+ "http://bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
13
13
  scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
14
14
  assert_equal images, scraper.image_urls
15
15
  end
@@ -72,22 +72,23 @@ class TestImageScraper < Test::Unit::TestCase
72
72
  end
73
73
 
74
74
  should "Handle a URL with unescaped spaces" do
75
- images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
76
- "http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
77
- "http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
75
+ images = ["http://bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
76
+ "http://bits.wikimedia.org/images/wikimedia-button.png",
77
+ "http://bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
78
78
  scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
79
79
  assert_equal images, scraper.image_urls
80
80
  end
81
81
 
82
82
  should "Handle a page image with an unescaped url" do
83
83
  scraper = ImageScraper::Client.new ''
84
- scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'")
84
+ scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
85
85
  assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
86
86
  end
87
87
 
88
88
  should "Handle a stylesheet with an unescaped url" do
89
89
  scraper = ImageScraper::Client.new ''
90
- scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'")
90
+ scraper.url = 'http://test.com'
91
+ scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
91
92
  assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
92
93
  end
93
94
 
@@ -95,4 +96,9 @@ class TestImageScraper < Test::Unit::TestCase
95
96
  scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
96
97
  assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
97
98
  end
98
- end
99
+
100
+ should "Handle a stylesheet image with a relative url" do
101
+ scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
102
+ assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
103
+ end
104
+ end
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 5
8
+ - 6
9
9
  segments_generated: true
10
- version: 0.1.5
10
+ version: 0.1.6
11
11
  platform: ruby
12
12
  authors:
13
13
  - John McAliley
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-11-30 00:00:00 -05:00
18
+ date: 2011-12-22 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
185
185
  requirements:
186
186
  - - ">="
187
187
  - !ruby/object:Gem::Version
188
- hash: -4020311873679732909
188
+ hash: -3072759905091488701
189
189
  segments:
190
190
  - 0
191
191
  segments_generated: true