image_scraper 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.4
1
+ 0.1.5
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{image_scraper}
8
- s.version = "0.1.4"
8
+ s.version = "0.1.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John McAliley"]
12
- s.date = %q{2011-07-03}
12
+ s.date = %q{2011-11-30}
13
13
  s.description = %q{Simple utility to pull image urls from web page}
14
14
  s.email = %q{john.mcaliley@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -4,12 +4,12 @@ module ImageScraper
4
4
 
5
5
  def initialize(url,options={})
6
6
  options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
7
- @url = url
7
+ @url = URI.escape(url)
8
8
  @convert_to_absolute_url = options[:convert_to_absolute_url]
9
9
  @include_css_images = options[:include_css_images]
10
10
  @include_css_data_images = options[:include_css_data_images]
11
- html = open(url).read
12
- @doc = Nokogiri::HTML(html)
11
+ html = open(@url).read rescue nil
12
+ @doc = html ? Nokogiri::HTML(html) : nil
13
13
  end
14
14
 
15
15
  def image_urls
@@ -20,8 +20,9 @@ module ImageScraper
20
20
 
21
21
  def page_images
22
22
  urls = []
23
+ return urls if doc.blank?
23
24
  doc.xpath("//img").each do |img|
24
- image = img["src"]
25
+ image = URI.escape(img["src"])
25
26
  image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
26
27
  urls << image
27
28
  end
@@ -35,7 +36,7 @@ module ImageScraper
35
36
  css = file.string rescue IO.read(file)
36
37
 
37
38
  images += css.scan(/url\((.*?)\)/).collect do |image_url|
38
- image_url = image_url[0]
39
+ image_url = URI.escape image_url[0]
39
40
  if image_url.include?("data:image") and @include_css_data_images
40
41
  image_url
41
42
  else
@@ -48,9 +49,10 @@ module ImageScraper
48
49
  end
49
50
 
50
51
  def stylesheets
52
+ return [] if doc.blank?
51
53
  doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
52
- ImageScraper::Util.absolute_url(url,stylesheet['href'])
54
+ URI.escape ImageScraper::Util.absolute_url(url,stylesheet['href'])
53
55
  end
54
56
  end
55
57
  end
56
- end
58
+ end
@@ -3,23 +3,21 @@ require 'helper'
3
3
 
4
4
 
5
5
  #TODO: these tests will not work forever. Try to test against a static web page instead of external URLs
6
+ # Consider using https://raw.github.com/charlotte-ruby/image_scraper urls
7
+
6
8
  class TestImageScraper < Test::Unit::TestCase
7
9
  should "return list of all image urls on a web page with absolute paths" do
8
- images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png",
9
- "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png",
10
- "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-3",
11
- "http://bits.wikimedia.org/images/wikimedia-button.png",
12
- "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
10
+ images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
11
+ "http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
12
+ "http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
13
13
  scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
14
14
  assert_equal images, scraper.image_urls
15
15
  end
16
16
 
17
17
  should "return list of all image urls on a web page with relative paths" do
18
- images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png",
19
- "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png",
20
- "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-3",
21
- "http://bits.wikimedia.org/images/wikimedia-button.png",
22
- "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
18
+ images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
19
+ "//bits.wikimedia.org/images/wikimedia-button.png",
20
+ "//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
23
21
  scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:convert_to_absolute_url=>false,:include_css_images=>false)
24
22
  assert_equal images, scraper.image_urls
25
23
  end
@@ -27,7 +25,7 @@ class TestImageScraper < Test::Unit::TestCase
27
25
  should "return list of stylesheets contained in html page (relative path)" do
28
26
  doc = Nokogiri::HTML(IO.read(File.dirname(__FILE__)+"/resources/stylesheet_test.html"))
29
27
  domain = "http://test.com"
30
- assert_equal ["http://test.com/phoenix/testcentral.css"], ImageScraper::Client.new("http://test.com").stylesheets
28
+ assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
31
29
  end
32
30
 
33
31
  should "return proper absolute url for a page and asset" do
@@ -43,8 +41,8 @@ class TestImageScraper < Test::Unit::TestCase
43
41
  end
44
42
 
45
43
  should "return images from a stylesheet" do
46
- scraper = ImageScraper::Client.new("http://local.couponshack.com")
47
- assert scraper.stylesheet_images.include? ("http://local.couponshack.com/images/bg.png")
44
+ scraper = ImageScraper::Client.new("http://couponshack.com")
45
+ assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
48
46
  end
49
47
 
50
48
  should "strip quotes from a url" do
@@ -59,4 +57,42 @@ class TestImageScraper < Test::Unit::TestCase
59
57
  assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
60
58
  assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
61
59
  end
60
+
61
+ should "return nil for doc if URL is invalid" do
62
+ scraper = ImageScraper::Client.new("couponshack.com")
63
+ assert scraper.doc.nil?
64
+ end
65
+
66
+ should "return empty arrays if URL is invalid" do
67
+ scraper = ImageScraper::Client.new("couponshack.com")
68
+ assert_equal [], scraper.image_urls
69
+ assert_equal [], scraper.stylesheets
70
+ assert_equal [], scraper.stylesheet_images
71
+ assert_equal [], scraper.page_images
72
+ end
73
+
74
+ should "Handle a URL with unescaped spaces" do
75
+ images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
76
+ "http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
77
+ "http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
78
+ scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
79
+ assert_equal images, scraper.image_urls
80
+ end
81
+
82
+ should "Handle a page image with an unescaped url" do
83
+ scraper = ImageScraper::Client.new ''
84
+ scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'")
85
+ assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
86
+ end
87
+
88
+ should "Handle a stylesheet with an unescaped url" do
89
+ scraper = ImageScraper::Client.new ''
90
+ scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'")
91
+ assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
92
+ end
93
+
94
+ should "Handle a stylesheet image with an unescaped url" do
95
+ scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
96
+ assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
97
+ end
62
98
  end
metadata CHANGED
@@ -5,8 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 4
9
- version: 0.1.4
8
+ - 5
9
+ segments_generated: true
10
+ version: 0.1.5
10
11
  platform: ruby
11
12
  authors:
12
13
  - John McAliley
@@ -14,7 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2011-07-03 00:00:00 -04:00
18
+ date: 2011-11-30 00:00:00 -05:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency
@@ -26,6 +27,7 @@ dependencies:
26
27
  - !ruby/object:Gem::Version
27
28
  segments:
28
29
  - 0
30
+ segments_generated: true
29
31
  version: "0"
30
32
  type: :runtime
31
33
  prerelease: false
@@ -39,6 +41,7 @@ dependencies:
39
41
  - !ruby/object:Gem::Version
40
42
  segments:
41
43
  - 0
44
+ segments_generated: true
42
45
  version: "0"
43
46
  type: :runtime
44
47
  prerelease: false
@@ -52,6 +55,7 @@ dependencies:
52
55
  - !ruby/object:Gem::Version
53
56
  segments:
54
57
  - 0
58
+ segments_generated: true
55
59
  version: "0"
56
60
  type: :runtime
57
61
  prerelease: false
@@ -65,6 +69,7 @@ dependencies:
65
69
  - !ruby/object:Gem::Version
66
70
  segments:
67
71
  - 0
72
+ segments_generated: true
68
73
  version: "0"
69
74
  type: :development
70
75
  prerelease: false
@@ -80,6 +85,7 @@ dependencies:
80
85
  - 1
81
86
  - 0
82
87
  - 0
88
+ segments_generated: true
83
89
  version: 1.0.0
84
90
  type: :development
85
91
  prerelease: false
@@ -95,6 +101,7 @@ dependencies:
95
101
  - 1
96
102
  - 5
97
103
  - 2
104
+ segments_generated: true
98
105
  version: 1.5.2
99
106
  type: :development
100
107
  prerelease: false
@@ -108,6 +115,7 @@ dependencies:
108
115
  - !ruby/object:Gem::Version
109
116
  segments:
110
117
  - 0
118
+ segments_generated: true
111
119
  version: "0"
112
120
  type: :development
113
121
  prerelease: false
@@ -121,6 +129,7 @@ dependencies:
121
129
  - !ruby/object:Gem::Version
122
130
  segments:
123
131
  - 0
132
+ segments_generated: true
124
133
  version: "0"
125
134
  type: :runtime
126
135
  prerelease: false
@@ -134,6 +143,7 @@ dependencies:
134
143
  - !ruby/object:Gem::Version
135
144
  segments:
136
145
  - 0
146
+ segments_generated: true
137
147
  version: "0"
138
148
  type: :runtime
139
149
  prerelease: false
@@ -175,9 +185,10 @@ required_ruby_version: !ruby/object:Gem::Requirement
175
185
  requirements:
176
186
  - - ">="
177
187
  - !ruby/object:Gem::Version
178
- hash: -830537873008219940
188
+ hash: -4020311873679732909
179
189
  segments:
180
190
  - 0
191
+ segments_generated: true
181
192
  version: "0"
182
193
  required_rubygems_version: !ruby/object:Gem::Requirement
183
194
  none: false
@@ -186,6 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
186
197
  - !ruby/object:Gem::Version
187
198
  segments:
188
199
  - 0
200
+ segments_generated: true
189
201
  version: "0"
190
202
  requirements: []
191
203