image_scraper 0.1.4 → 0.1.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.4
1
+ 0.1.5
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{image_scraper}
8
- s.version = "0.1.4"
8
+ s.version = "0.1.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John McAliley"]
12
- s.date = %q{2011-07-03}
12
+ s.date = %q{2011-11-30}
13
13
  s.description = %q{Simple utility to pull image urls from web page}
14
14
  s.email = %q{john.mcaliley@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -4,12 +4,12 @@ module ImageScraper
4
4
 
5
5
  def initialize(url,options={})
6
6
  options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
7
- @url = url
7
+ @url = URI.escape(url)
8
8
  @convert_to_absolute_url = options[:convert_to_absolute_url]
9
9
  @include_css_images = options[:include_css_images]
10
10
  @include_css_data_images = options[:include_css_data_images]
11
- html = open(url).read
12
- @doc = Nokogiri::HTML(html)
11
+ html = open(@url).read rescue nil
12
+ @doc = html ? Nokogiri::HTML(html) : nil
13
13
  end
14
14
 
15
15
  def image_urls
@@ -20,8 +20,9 @@ module ImageScraper
20
20
 
21
21
  def page_images
22
22
  urls = []
23
+ return urls if doc.blank?
23
24
  doc.xpath("//img").each do |img|
24
- image = img["src"]
25
+ image = URI.escape(img["src"])
25
26
  image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
26
27
  urls << image
27
28
  end
@@ -35,7 +36,7 @@ module ImageScraper
35
36
  css = file.string rescue IO.read(file)
36
37
 
37
38
  images += css.scan(/url\((.*?)\)/).collect do |image_url|
38
- image_url = image_url[0]
39
+ image_url = URI.escape image_url[0]
39
40
  if image_url.include?("data:image") and @include_css_data_images
40
41
  image_url
41
42
  else
@@ -48,9 +49,10 @@ module ImageScraper
48
49
  end
49
50
 
50
51
  def stylesheets
52
+ return [] if doc.blank?
51
53
  doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
52
- ImageScraper::Util.absolute_url(url,stylesheet['href'])
54
+ URI.escape ImageScraper::Util.absolute_url(url,stylesheet['href'])
53
55
  end
54
56
  end
55
57
  end
56
- end
58
+ end
@@ -3,23 +3,21 @@ require 'helper'
3
3
 
4
4
 
5
5
  #TODO: these tests will not work forever. Try to test against a static web page instead of external URLs
6
+ # Consider using https://raw.github.com/charlotte-ruby/image_scraper urls
7
+
6
8
  class TestImageScraper < Test::Unit::TestCase
7
9
  should "return list of all image urls on a web page with absolute paths" do
8
- images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png",
9
- "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png",
10
- "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-3",
11
- "http://bits.wikimedia.org/images/wikimedia-button.png",
12
- "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
10
+ images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
11
+ "http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
12
+ "http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
13
13
  scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
14
14
  assert_equal images, scraper.image_urls
15
15
  end
16
16
 
17
17
  should "return list of all image urls on a web page with relative paths" do
18
- images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png",
19
- "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png",
20
- "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-3",
21
- "http://bits.wikimedia.org/images/wikimedia-button.png",
22
- "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
18
+ images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
19
+ "//bits.wikimedia.org/images/wikimedia-button.png",
20
+ "//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
23
21
  scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:convert_to_absolute_url=>false,:include_css_images=>false)
24
22
  assert_equal images, scraper.image_urls
25
23
  end
@@ -27,7 +25,7 @@ class TestImageScraper < Test::Unit::TestCase
27
25
  should "return list of stylesheets contained in html page (relative path)" do
28
26
  doc = Nokogiri::HTML(IO.read(File.dirname(__FILE__)+"/resources/stylesheet_test.html"))
29
27
  domain = "http://test.com"
30
- assert_equal ["http://test.com/phoenix/testcentral.css"], ImageScraper::Client.new("http://test.com").stylesheets
28
+ assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
31
29
  end
32
30
 
33
31
  should "return proper absolute url for a page and asset" do
@@ -43,8 +41,8 @@ class TestImageScraper < Test::Unit::TestCase
43
41
  end
44
42
 
45
43
  should "return images from a stylesheet" do
46
- scraper = ImageScraper::Client.new("http://local.couponshack.com")
47
- assert scraper.stylesheet_images.include? ("http://local.couponshack.com/images/bg.png")
44
+ scraper = ImageScraper::Client.new("http://couponshack.com")
45
+ assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
48
46
  end
49
47
 
50
48
  should "strip quotes from a url" do
@@ -59,4 +57,42 @@ class TestImageScraper < Test::Unit::TestCase
59
57
  assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
60
58
  assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
61
59
  end
60
+
61
+ should "return nil for doc if URL is invalid" do
62
+ scraper = ImageScraper::Client.new("couponshack.com")
63
+ assert scraper.doc.nil?
64
+ end
65
+
66
+ should "return empty arrays if URL is invalid" do
67
+ scraper = ImageScraper::Client.new("couponshack.com")
68
+ assert_equal [], scraper.image_urls
69
+ assert_equal [], scraper.stylesheets
70
+ assert_equal [], scraper.stylesheet_images
71
+ assert_equal [], scraper.page_images
72
+ end
73
+
74
+ should "Handle a URL with unescaped spaces" do
75
+ images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
76
+ "http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
77
+ "http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
78
+ scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
79
+ assert_equal images, scraper.image_urls
80
+ end
81
+
82
+ should "Handle a page image with an unescaped url" do
83
+ scraper = ImageScraper::Client.new ''
84
+ scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'")
85
+ assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
86
+ end
87
+
88
+ should "Handle a stylesheet with an unescaped url" do
89
+ scraper = ImageScraper::Client.new ''
90
+ scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'")
91
+ assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
92
+ end
93
+
94
+ should "Handle a stylesheet image with an unescaped url" do
95
+ scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
96
+ assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
97
+ end
62
98
  end
metadata CHANGED
@@ -5,8 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 4
9
- version: 0.1.4
8
+ - 5
9
+ segments_generated: true
10
+ version: 0.1.5
10
11
  platform: ruby
11
12
  authors:
12
13
  - John McAliley
@@ -14,7 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2011-07-03 00:00:00 -04:00
18
+ date: 2011-11-30 00:00:00 -05:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency
@@ -26,6 +27,7 @@ dependencies:
26
27
  - !ruby/object:Gem::Version
27
28
  segments:
28
29
  - 0
30
+ segments_generated: true
29
31
  version: "0"
30
32
  type: :runtime
31
33
  prerelease: false
@@ -39,6 +41,7 @@ dependencies:
39
41
  - !ruby/object:Gem::Version
40
42
  segments:
41
43
  - 0
44
+ segments_generated: true
42
45
  version: "0"
43
46
  type: :runtime
44
47
  prerelease: false
@@ -52,6 +55,7 @@ dependencies:
52
55
  - !ruby/object:Gem::Version
53
56
  segments:
54
57
  - 0
58
+ segments_generated: true
55
59
  version: "0"
56
60
  type: :runtime
57
61
  prerelease: false
@@ -65,6 +69,7 @@ dependencies:
65
69
  - !ruby/object:Gem::Version
66
70
  segments:
67
71
  - 0
72
+ segments_generated: true
68
73
  version: "0"
69
74
  type: :development
70
75
  prerelease: false
@@ -80,6 +85,7 @@ dependencies:
80
85
  - 1
81
86
  - 0
82
87
  - 0
88
+ segments_generated: true
83
89
  version: 1.0.0
84
90
  type: :development
85
91
  prerelease: false
@@ -95,6 +101,7 @@ dependencies:
95
101
  - 1
96
102
  - 5
97
103
  - 2
104
+ segments_generated: true
98
105
  version: 1.5.2
99
106
  type: :development
100
107
  prerelease: false
@@ -108,6 +115,7 @@ dependencies:
108
115
  - !ruby/object:Gem::Version
109
116
  segments:
110
117
  - 0
118
+ segments_generated: true
111
119
  version: "0"
112
120
  type: :development
113
121
  prerelease: false
@@ -121,6 +129,7 @@ dependencies:
121
129
  - !ruby/object:Gem::Version
122
130
  segments:
123
131
  - 0
132
+ segments_generated: true
124
133
  version: "0"
125
134
  type: :runtime
126
135
  prerelease: false
@@ -134,6 +143,7 @@ dependencies:
134
143
  - !ruby/object:Gem::Version
135
144
  segments:
136
145
  - 0
146
+ segments_generated: true
137
147
  version: "0"
138
148
  type: :runtime
139
149
  prerelease: false
@@ -175,9 +185,10 @@ required_ruby_version: !ruby/object:Gem::Requirement
175
185
  requirements:
176
186
  - - ">="
177
187
  - !ruby/object:Gem::Version
178
- hash: -830537873008219940
188
+ hash: -4020311873679732909
179
189
  segments:
180
190
  - 0
191
+ segments_generated: true
181
192
  version: "0"
182
193
  required_rubygems_version: !ruby/object:Gem::Requirement
183
194
  none: false
@@ -186,6 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
186
197
  - !ruby/object:Gem::Version
187
198
  segments:
188
199
  - 0
200
+ segments_generated: true
189
201
  version: "0"
190
202
  requirements: []
191
203