image_scraper 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/image_scraper.gemspec +2 -2
- data/lib/image_scraper/client.rb +9 -7
- data/test/test_image_scraper.rb +49 -13
- metadata +16 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.4
|
1
|
+
0.1.5
|
data/image_scraper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{image_scraper}
|
8
|
-
s.version = "0.1.4"
|
8
|
+
s.version = "0.1.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John McAliley"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-11-30}
|
13
13
|
s.description = %q{Simple utility to pull image urls from web page}
|
14
14
|
s.email = %q{john.mcaliley@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/image_scraper/client.rb
CHANGED
@@ -4,12 +4,12 @@ module ImageScraper
|
|
4
4
|
|
5
5
|
def initialize(url,options={})
|
6
6
|
options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
|
7
|
-
@url = url
|
7
|
+
@url = URI.escape(url)
|
8
8
|
@convert_to_absolute_url = options[:convert_to_absolute_url]
|
9
9
|
@include_css_images = options[:include_css_images]
|
10
10
|
@include_css_data_images = options[:include_css_data_images]
|
11
|
-
html = open(url).read
|
12
|
-
@doc = Nokogiri::HTML(html)
|
11
|
+
html = open(@url).read rescue nil
|
12
|
+
@doc = html ? Nokogiri::HTML(html) : nil
|
13
13
|
end
|
14
14
|
|
15
15
|
def image_urls
|
@@ -20,8 +20,9 @@ module ImageScraper
|
|
20
20
|
|
21
21
|
def page_images
|
22
22
|
urls = []
|
23
|
+
return urls if doc.blank?
|
23
24
|
doc.xpath("//img").each do |img|
|
24
|
-
image = img["src"]
|
25
|
+
image = URI.escape(img["src"])
|
25
26
|
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
26
27
|
urls << image
|
27
28
|
end
|
@@ -35,7 +36,7 @@ module ImageScraper
|
|
35
36
|
css = file.string rescue IO.read(file)
|
36
37
|
|
37
38
|
images += css.scan(/url\((.*?)\)/).collect do |image_url|
|
38
|
-
image_url = image_url[0]
|
39
|
+
image_url = URI.escape image_url[0]
|
39
40
|
if image_url.include?("data:image") and @include_css_data_images
|
40
41
|
image_url
|
41
42
|
else
|
@@ -48,9 +49,10 @@ module ImageScraper
|
|
48
49
|
end
|
49
50
|
|
50
51
|
def stylesheets
|
52
|
+
return [] if doc.blank?
|
51
53
|
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
52
|
-
ImageScraper::Util.absolute_url(url,stylesheet['href'])
|
54
|
+
URI.escape ImageScraper::Util.absolute_url(url,stylesheet['href'])
|
53
55
|
end
|
54
56
|
end
|
55
57
|
end
|
56
|
-
end
|
58
|
+
end
|
data/test/test_image_scraper.rb
CHANGED
@@ -3,23 +3,21 @@ require 'helper'
|
|
3
3
|
|
4
4
|
|
5
5
|
#TODO: these tests will not work forever. Try to test against a static web page instead of external URLs
|
6
|
+
# Consider using https://raw.github.com/charlotte-ruby/image_scraper urls
|
7
|
+
|
6
8
|
class TestImageScraper < Test::Unit::TestCase
|
7
9
|
should "return list of all image urls on a web page with absolute paths" do
|
8
|
-
images = ["http://
|
9
|
-
"http://bits.wikimedia.org/
|
10
|
-
"http://bits.wikimedia.org/skins-1.
|
11
|
-
"http://bits.wikimedia.org/images/wikimedia-button.png",
|
12
|
-
"http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
|
10
|
+
images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
11
|
+
"http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
|
12
|
+
"http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
13
13
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
|
14
14
|
assert_equal images, scraper.image_urls
|
15
15
|
end
|
16
16
|
|
17
17
|
should "return list of all image urls on a web page with relative paths" do
|
18
|
-
images = ["
|
19
|
-
"
|
20
|
-
"
|
21
|
-
"http://bits.wikimedia.org/images/wikimedia-button.png",
|
22
|
-
"http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
|
18
|
+
images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
19
|
+
"//bits.wikimedia.org/images/wikimedia-button.png",
|
20
|
+
"//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
23
21
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:convert_to_absolute_url=>false,:include_css_images=>false)
|
24
22
|
assert_equal images, scraper.image_urls
|
25
23
|
end
|
@@ -27,7 +25,7 @@ class TestImageScraper < Test::Unit::TestCase
|
|
27
25
|
should "return list of stylesheets contained in html page (relative path)" do
|
28
26
|
doc = Nokogiri::HTML(IO.read(File.dirname(__FILE__)+"/resources/stylesheet_test.html"))
|
29
27
|
domain = "http://test.com"
|
30
|
-
assert_equal ["http://test.com/phoenix/testcentral.css"], ImageScraper::Client.new("http://test.com").stylesheets
|
28
|
+
assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
|
31
29
|
end
|
32
30
|
|
33
31
|
should "return proper absolute url for a page and asset" do
|
@@ -43,8 +41,8 @@ class TestImageScraper < Test::Unit::TestCase
|
|
43
41
|
end
|
44
42
|
|
45
43
|
should "return images from a stylesheet" do
|
46
|
-
scraper = ImageScraper::Client.new("http://
|
47
|
-
assert scraper.stylesheet_images.include? ("http://
|
44
|
+
scraper = ImageScraper::Client.new("http://couponshack.com")
|
45
|
+
assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
|
48
46
|
end
|
49
47
|
|
50
48
|
should "strip quotes from a url" do
|
@@ -59,4 +57,42 @@ class TestImageScraper < Test::Unit::TestCase
|
|
59
57
|
assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
|
60
58
|
assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
|
61
59
|
end
|
60
|
+
|
61
|
+
should "return nil for doc if URL is invalid" do
|
62
|
+
scraper = ImageScraper::Client.new("couponshack.com")
|
63
|
+
assert scraper.doc.nil?
|
64
|
+
end
|
65
|
+
|
66
|
+
should "return empty arrays if URL is invalid" do
|
67
|
+
scraper = ImageScraper::Client.new("couponshack.com")
|
68
|
+
assert_equal [], scraper.image_urls
|
69
|
+
assert_equal [], scraper.stylesheets
|
70
|
+
assert_equal [], scraper.stylesheet_images
|
71
|
+
assert_equal [], scraper.page_images
|
72
|
+
end
|
73
|
+
|
74
|
+
should "Handle a URL with unescaped spaces" do
|
75
|
+
images = ["http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
76
|
+
"http://en.wikipedia.org//bits.wikimedia.org/images/wikimedia-button.png",
|
77
|
+
"http://en.wikipedia.org//bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
78
|
+
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
|
79
|
+
assert_equal images, scraper.image_urls
|
80
|
+
end
|
81
|
+
|
82
|
+
should "Handle a page image with an unescaped url" do
|
83
|
+
scraper = ImageScraper::Client.new ''
|
84
|
+
scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'")
|
85
|
+
assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
|
86
|
+
end
|
87
|
+
|
88
|
+
should "Handle a stylesheet with an unescaped url" do
|
89
|
+
scraper = ImageScraper::Client.new ''
|
90
|
+
scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'")
|
91
|
+
assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
|
92
|
+
end
|
93
|
+
|
94
|
+
should "Handle a stylesheet image with an unescaped url" do
|
95
|
+
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
|
96
|
+
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
|
97
|
+
end
|
62
98
|
end
|
metadata
CHANGED
@@ -5,8 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 4
|
9
|
-
|
8
|
+
- 5
|
9
|
+
segments_generated: true
|
10
|
+
version: 0.1.5
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- John McAliley
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2011-
|
18
|
+
date: 2011-11-30 00:00:00 -05:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -26,6 +27,7 @@ dependencies:
|
|
26
27
|
- !ruby/object:Gem::Version
|
27
28
|
segments:
|
28
29
|
- 0
|
30
|
+
segments_generated: true
|
29
31
|
version: "0"
|
30
32
|
type: :runtime
|
31
33
|
prerelease: false
|
@@ -39,6 +41,7 @@ dependencies:
|
|
39
41
|
- !ruby/object:Gem::Version
|
40
42
|
segments:
|
41
43
|
- 0
|
44
|
+
segments_generated: true
|
42
45
|
version: "0"
|
43
46
|
type: :runtime
|
44
47
|
prerelease: false
|
@@ -52,6 +55,7 @@ dependencies:
|
|
52
55
|
- !ruby/object:Gem::Version
|
53
56
|
segments:
|
54
57
|
- 0
|
58
|
+
segments_generated: true
|
55
59
|
version: "0"
|
56
60
|
type: :runtime
|
57
61
|
prerelease: false
|
@@ -65,6 +69,7 @@ dependencies:
|
|
65
69
|
- !ruby/object:Gem::Version
|
66
70
|
segments:
|
67
71
|
- 0
|
72
|
+
segments_generated: true
|
68
73
|
version: "0"
|
69
74
|
type: :development
|
70
75
|
prerelease: false
|
@@ -80,6 +85,7 @@ dependencies:
|
|
80
85
|
- 1
|
81
86
|
- 0
|
82
87
|
- 0
|
88
|
+
segments_generated: true
|
83
89
|
version: 1.0.0
|
84
90
|
type: :development
|
85
91
|
prerelease: false
|
@@ -95,6 +101,7 @@ dependencies:
|
|
95
101
|
- 1
|
96
102
|
- 5
|
97
103
|
- 2
|
104
|
+
segments_generated: true
|
98
105
|
version: 1.5.2
|
99
106
|
type: :development
|
100
107
|
prerelease: false
|
@@ -108,6 +115,7 @@ dependencies:
|
|
108
115
|
- !ruby/object:Gem::Version
|
109
116
|
segments:
|
110
117
|
- 0
|
118
|
+
segments_generated: true
|
111
119
|
version: "0"
|
112
120
|
type: :development
|
113
121
|
prerelease: false
|
@@ -121,6 +129,7 @@ dependencies:
|
|
121
129
|
- !ruby/object:Gem::Version
|
122
130
|
segments:
|
123
131
|
- 0
|
132
|
+
segments_generated: true
|
124
133
|
version: "0"
|
125
134
|
type: :runtime
|
126
135
|
prerelease: false
|
@@ -134,6 +143,7 @@ dependencies:
|
|
134
143
|
- !ruby/object:Gem::Version
|
135
144
|
segments:
|
136
145
|
- 0
|
146
|
+
segments_generated: true
|
137
147
|
version: "0"
|
138
148
|
type: :runtime
|
139
149
|
prerelease: false
|
@@ -175,9 +185,10 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
175
185
|
requirements:
|
176
186
|
- - ">="
|
177
187
|
- !ruby/object:Gem::Version
|
178
|
-
hash: -
|
188
|
+
hash: -4020311873679732909
|
179
189
|
segments:
|
180
190
|
- 0
|
191
|
+
segments_generated: true
|
181
192
|
version: "0"
|
182
193
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
183
194
|
none: false
|
@@ -186,6 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
186
197
|
- !ruby/object:Gem::Version
|
187
198
|
segments:
|
188
199
|
- 0
|
200
|
+
segments_generated: true
|
189
201
|
version: "0"
|
190
202
|
requirements: []
|
191
203
|
|