image_scraper 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.5
1
+ 0.1.6
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{image_scraper}
8
- s.version = "0.1.5"
8
+ s.version = "0.1.6"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John McAliley"]
12
- s.date = %q{2011-11-30}
12
+ s.date = %q{2011-12-22}
13
13
  s.description = %q{Simple utility to pull image urls from web page}
14
14
  s.email = %q{john.mcaliley@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -22,6 +22,7 @@ module ImageScraper
22
22
  urls = []
23
23
  return urls if doc.blank?
24
24
  doc.xpath("//img").each do |img|
25
+ next if img["src"].blank?
25
26
  image = URI.escape(img["src"])
26
27
  image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
27
28
  urls << image
@@ -41,7 +42,7 @@ module ImageScraper
41
42
  image_url
42
43
  else
43
44
  image_url = ImageScraper::Util.strip_quotes(image_url)
44
- @convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url) : image_url
45
+ @convert_to_absolute_url ? ImageScraper::Util.absolute_url(stylesheet, image_url) : image_url
45
46
  end
46
47
  end
47
48
  end
@@ -51,7 +52,7 @@ module ImageScraper
51
52
  def stylesheets
52
53
  return [] if doc.blank?
53
54
  doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
54
- URI.escape ImageScraper::Util.absolute_url(url,stylesheet['href'])
55
+ ImageScraper::Util.absolute_url url, URI.escape(stylesheet['href'])
55
56
  end
56
57
  end
57
58
  end
@@ -1,10 +1,18 @@
1
1
  module ImageScraper
2
2
  module Util
3
3
  def self.absolute_url(url,asset=nil)
4
- return url if asset.nil?
5
- return asset if asset.include?("://")
6
- return domain(url)+asset if asset[0]=="/"
7
- return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
4
+ # TODO - what happens when an index redirect occurs?
5
+ # Example: 'http://example.com/about' specified as url
6
+ # 'style.css' specified as asset
7
+ # url redirects to 'http://example.com/about/'
8
+ # and serves http://example.com/about/index.html
9
+ # which then links to the relative asset path 'style.css'
10
+ # based on original url (http://example.com/about),
11
+ # self.absolute_url gives
12
+ # 'http://example.com/style.css'
13
+ # but should get:
14
+ # 'http://example.com/about/style.css'
15
+ URI.parse(url).merge(URI.parse asset.to_s).to_s
8
16
  end
9
17
 
10
18
  def self.domain(url)
@@ -21,4 +29,4 @@ module ImageScraper
21
29
  image_url.gsub("'","").gsub('"','')
22
30
  end
23
31
  end
24
- end
32
+ end
@@ -7,9 +7,9 @@ require 'helper'
7
7
 
8
8
  class TestImageScraper < Test::Unit::TestCase
9
9
  should "return list of all image urls on a web page with absolute paths" do
10
- images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
11
- "http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
12
- "http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
10
+ images = ["http://bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
11
+ "http://bits.wikimedia.org/images/wikimedia-button.png",
12
+ "http://bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
13
13
  scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
14
14
  assert_equal images, scraper.image_urls
15
15
  end
@@ -72,22 +72,23 @@ class TestImageScraper < Test::Unit::TestCase
72
72
  end
73
73
 
74
74
  should "Handle a URL with unescaped spaces" do
75
- images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
76
- "http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
77
- "http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
75
+ images = ["http://bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
76
+ "http://bits.wikimedia.org/images/wikimedia-button.png",
77
+ "http://bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
78
78
  scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
79
79
  assert_equal images, scraper.image_urls
80
80
  end
81
81
 
82
82
  should "Handle a page image with an unescaped url" do
83
83
  scraper = ImageScraper::Client.new ''
84
- scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'")
84
+ scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
85
85
  assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
86
86
  end
87
87
 
88
88
  should "Handle a stylesheet with an unescaped url" do
89
89
  scraper = ImageScraper::Client.new ''
90
- scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'")
90
+ scraper.url = 'http://test.com'
91
+ scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
91
92
  assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
92
93
  end
93
94
 
@@ -95,4 +96,9 @@ class TestImageScraper < Test::Unit::TestCase
95
96
  scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
96
97
  assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
97
98
  end
98
- end
99
+
100
+ should "Handle a stylesheet image with a relative url" do
101
+ scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
102
+ assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
103
+ end
104
+ end
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 5
8
+ - 6
9
9
  segments_generated: true
10
- version: 0.1.5
10
+ version: 0.1.6
11
11
  platform: ruby
12
12
  authors:
13
13
  - John McAliley
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-11-30 00:00:00 -05:00
18
+ date: 2011-12-22 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
185
185
  requirements:
186
186
  - - ">="
187
187
  - !ruby/object:Gem::Version
188
- hash: -4020311873679732909
188
+ hash: -3072759905091488701
189
189
  segments:
190
190
  - 0
191
191
  segments_generated: true