image_scraper 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/image_scraper.gemspec +2 -2
- data/lib/image_scraper/client.rb +9 -7
- data/test/test_image_scraper.rb +49 -13
- metadata +16 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.4
|
1
|
+
0.1.5
|
data/image_scraper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{image_scraper}
|
8
|
-
s.version = "0.1.4"
|
8
|
+
s.version = "0.1.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John McAliley"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-11-30}
|
13
13
|
s.description = %q{Simple utility to pull image urls from web page}
|
14
14
|
s.email = %q{john.mcaliley@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/image_scraper/client.rb
CHANGED
@@ -4,12 +4,12 @@ module ImageScraper
|
|
4
4
|
|
5
5
|
def initialize(url,options={})
|
6
6
|
options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
|
7
|
-
@url = url
|
7
|
+
@url = URI.escape(url)
|
8
8
|
@convert_to_absolute_url = options[:convert_to_absolute_url]
|
9
9
|
@include_css_images = options[:include_css_images]
|
10
10
|
@include_css_data_images = options[:include_css_data_images]
|
11
|
-
html = open(url).read
|
12
|
-
@doc = Nokogiri::HTML(html)
|
11
|
+
html = open(@url).read rescue nil
|
12
|
+
@doc = html ? Nokogiri::HTML(html) : nil
|
13
13
|
end
|
14
14
|
|
15
15
|
def image_urls
|
@@ -20,8 +20,9 @@ module ImageScraper
|
|
20
20
|
|
21
21
|
def page_images
|
22
22
|
urls = []
|
23
|
+
return urls if doc.blank?
|
23
24
|
doc.xpath("//img").each do |img|
|
24
|
-
image = img["src"]
|
25
|
+
image = URI.escape(img["src"])
|
25
26
|
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
26
27
|
urls << image
|
27
28
|
end
|
@@ -35,7 +36,7 @@ module ImageScraper
|
|
35
36
|
css = file.string rescue IO.read(file)
|
36
37
|
|
37
38
|
images += css.scan(/url\((.*?)\)/).collect do |image_url|
|
38
|
-
image_url = image_url[0]
|
39
|
+
image_url = URI.escape image_url[0]
|
39
40
|
if image_url.include?("data:image") and @include_css_data_images
|
40
41
|
image_url
|
41
42
|
else
|
@@ -48,9 +49,10 @@ module ImageScraper
|
|
48
49
|
end
|
49
50
|
|
50
51
|
def stylesheets
|
52
|
+
return [] if doc.blank?
|
51
53
|
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
52
|
-
ImageScraper::Util.absolute_url(url,stylesheet['href'])
|
54
|
+
URI.escape ImageScraper::Util.absolute_url(url,stylesheet['href'])
|
53
55
|
end
|
54
56
|
end
|
55
57
|
end
|
56
|
-
end
|
58
|
+
end
|
data/test/test_image_scraper.rb
CHANGED
@@ -3,23 +3,21 @@ require 'helper'
|
|
3
3
|
|
4
4
|
|
5
5
|
#TODO: these tests will not work forever. Try to test against a static web page instead of external URLs
|
6
|
+
# Consider using https://raw.github.com/charlotte-ruby/image_scraper urls
|
7
|
+
|
6
8
|
class TestImageScraper < Test::Unit::TestCase
|
7
9
|
should "return list of all image urls on a web page with absolute paths" do
|
8
|
-
images = ["http://
|
9
|
-
"http://bits.wikimedia.org/
|
10
|
-
"http://bits.wikimedia.org/skins-1.
|
11
|
-
"http://bits.wikimedia.org/images/wikimedia-button.png",
|
12
|
-
"http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
|
10
|
+
images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
11
|
+
"http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
|
12
|
+
"http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
13
13
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
|
14
14
|
assert_equal images, scraper.image_urls
|
15
15
|
end
|
16
16
|
|
17
17
|
should "return list of all image urls on a web page with relative paths" do
|
18
|
-
images = ["
|
19
|
-
"
|
20
|
-
"
|
21
|
-
"http://bits.wikimedia.org/images/wikimedia-button.png",
|
22
|
-
"http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
|
18
|
+
images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
19
|
+
"//bits.wikimedia.org/images/wikimedia-button.png",
|
20
|
+
"//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
23
21
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:convert_to_absolute_url=>false,:include_css_images=>false)
|
24
22
|
assert_equal images, scraper.image_urls
|
25
23
|
end
|
@@ -27,7 +25,7 @@ class TestImageScraper < Test::Unit::TestCase
|
|
27
25
|
should "return list of stylesheets contained in html page (relative path)" do
|
28
26
|
doc = Nokogiri::HTML(IO.read(File.dirname(__FILE__)+"/resources/stylesheet_test.html"))
|
29
27
|
domain = "http://test.com"
|
30
|
-
assert_equal ["http://test.com/phoenix/testcentral.css"], ImageScraper::Client.new("http://test.com").stylesheets
|
28
|
+
assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
|
31
29
|
end
|
32
30
|
|
33
31
|
should "return proper absolute url for a page and asset" do
|
@@ -43,8 +41,8 @@ class TestImageScraper < Test::Unit::TestCase
|
|
43
41
|
end
|
44
42
|
|
45
43
|
should "return images from a stylesheet" do
|
46
|
-
scraper = ImageScraper::Client.new("http://
|
47
|
-
assert scraper.stylesheet_images.include? ("http://
|
44
|
+
scraper = ImageScraper::Client.new("http://couponshack.com")
|
45
|
+
assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
|
48
46
|
end
|
49
47
|
|
50
48
|
should "strip quotes from a url" do
|
@@ -59,4 +57,42 @@ class TestImageScraper < Test::Unit::TestCase
|
|
59
57
|
assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
|
60
58
|
assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
|
61
59
|
end
|
60
|
+
|
61
|
+
should "return nil for doc if URL is invalid" do
|
62
|
+
scraper = ImageScraper::Client.new("couponshack.com")
|
63
|
+
assert scraper.doc.nil?
|
64
|
+
end
|
65
|
+
|
66
|
+
should "return empty arrays if URL is invalid" do
|
67
|
+
scraper = ImageScraper::Client.new("couponshack.com")
|
68
|
+
assert_equal [], scraper.image_urls
|
69
|
+
assert_equal [], scraper.stylesheets
|
70
|
+
assert_equal [], scraper.stylesheet_images
|
71
|
+
assert_equal [], scraper.page_images
|
72
|
+
end
|
73
|
+
|
74
|
+
should "Handle a URL with unescaped spaces" do
|
75
|
+
images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
76
|
+
"http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
|
77
|
+
"http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
78
|
+
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
|
79
|
+
assert_equal images, scraper.image_urls
|
80
|
+
end
|
81
|
+
|
82
|
+
should "Handle a page image with an unescaped url" do
|
83
|
+
scraper = ImageScraper::Client.new ''
|
84
|
+
scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'")
|
85
|
+
assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
|
86
|
+
end
|
87
|
+
|
88
|
+
should "Handle a stylesheet with an unescaped url" do
|
89
|
+
scraper = ImageScraper::Client.new ''
|
90
|
+
scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'")
|
91
|
+
assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
|
92
|
+
end
|
93
|
+
|
94
|
+
should "Handle a stylesheet image with an unescaped url" do
|
95
|
+
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
|
96
|
+
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
|
97
|
+
end
|
62
98
|
end
|
metadata
CHANGED
@@ -5,8 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 4
|
9
|
-
|
8
|
+
- 5
|
9
|
+
segments_generated: true
|
10
|
+
version: 0.1.5
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- John McAliley
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2011-
|
18
|
+
date: 2011-11-30 00:00:00 -05:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -26,6 +27,7 @@ dependencies:
|
|
26
27
|
- !ruby/object:Gem::Version
|
27
28
|
segments:
|
28
29
|
- 0
|
30
|
+
segments_generated: true
|
29
31
|
version: "0"
|
30
32
|
type: :runtime
|
31
33
|
prerelease: false
|
@@ -39,6 +41,7 @@ dependencies:
|
|
39
41
|
- !ruby/object:Gem::Version
|
40
42
|
segments:
|
41
43
|
- 0
|
44
|
+
segments_generated: true
|
42
45
|
version: "0"
|
43
46
|
type: :runtime
|
44
47
|
prerelease: false
|
@@ -52,6 +55,7 @@ dependencies:
|
|
52
55
|
- !ruby/object:Gem::Version
|
53
56
|
segments:
|
54
57
|
- 0
|
58
|
+
segments_generated: true
|
55
59
|
version: "0"
|
56
60
|
type: :runtime
|
57
61
|
prerelease: false
|
@@ -65,6 +69,7 @@ dependencies:
|
|
65
69
|
- !ruby/object:Gem::Version
|
66
70
|
segments:
|
67
71
|
- 0
|
72
|
+
segments_generated: true
|
68
73
|
version: "0"
|
69
74
|
type: :development
|
70
75
|
prerelease: false
|
@@ -80,6 +85,7 @@ dependencies:
|
|
80
85
|
- 1
|
81
86
|
- 0
|
82
87
|
- 0
|
88
|
+
segments_generated: true
|
83
89
|
version: 1.0.0
|
84
90
|
type: :development
|
85
91
|
prerelease: false
|
@@ -95,6 +101,7 @@ dependencies:
|
|
95
101
|
- 1
|
96
102
|
- 5
|
97
103
|
- 2
|
104
|
+
segments_generated: true
|
98
105
|
version: 1.5.2
|
99
106
|
type: :development
|
100
107
|
prerelease: false
|
@@ -108,6 +115,7 @@ dependencies:
|
|
108
115
|
- !ruby/object:Gem::Version
|
109
116
|
segments:
|
110
117
|
- 0
|
118
|
+
segments_generated: true
|
111
119
|
version: "0"
|
112
120
|
type: :development
|
113
121
|
prerelease: false
|
@@ -121,6 +129,7 @@ dependencies:
|
|
121
129
|
- !ruby/object:Gem::Version
|
122
130
|
segments:
|
123
131
|
- 0
|
132
|
+
segments_generated: true
|
124
133
|
version: "0"
|
125
134
|
type: :runtime
|
126
135
|
prerelease: false
|
@@ -134,6 +143,7 @@ dependencies:
|
|
134
143
|
- !ruby/object:Gem::Version
|
135
144
|
segments:
|
136
145
|
- 0
|
146
|
+
segments_generated: true
|
137
147
|
version: "0"
|
138
148
|
type: :runtime
|
139
149
|
prerelease: false
|
@@ -175,9 +185,10 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
175
185
|
requirements:
|
176
186
|
- - ">="
|
177
187
|
- !ruby/object:Gem::Version
|
178
|
-
hash: -
|
188
|
+
hash: -4020311873679732909
|
179
189
|
segments:
|
180
190
|
- 0
|
191
|
+
segments_generated: true
|
181
192
|
version: "0"
|
182
193
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
183
194
|
none: false
|
@@ -186,6 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
186
197
|
- !ruby/object:Gem::Version
|
187
198
|
segments:
|
188
199
|
- 0
|
200
|
+
segments_generated: true
|
189
201
|
version: "0"
|
190
202
|
requirements: []
|
191
203
|
|