image_scraper 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/image_scraper.gemspec +2 -2
- data/lib/image_scraper/client.rb +7 -7
- data/test/test_image_scraper.rb +22 -14
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.6
|
1
|
+
0.1.7
|
data/image_scraper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{image_scraper}
|
8
|
-
s.version = "0.1.6"
|
8
|
+
s.version = "0.1.7"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John McAliley"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2012-01-02}
|
13
13
|
s.description = %q{Simple utility to pull image urls from web page}
|
14
14
|
s.email = %q{john.mcaliley@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/image_scraper/client.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module ImageScraper
|
2
2
|
class Client
|
3
3
|
attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
|
4
|
-
|
4
|
+
|
5
5
|
def initialize(url,options={})
|
6
6
|
options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
|
7
7
|
@url = URI.escape(url)
|
@@ -11,31 +11,31 @@ module ImageScraper
|
|
11
11
|
html = open(@url).read rescue nil
|
12
12
|
@doc = html ? Nokogiri::HTML(html) : nil
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
def image_urls
|
16
16
|
images = page_images
|
17
17
|
images += stylesheet_images if include_css_images
|
18
18
|
images
|
19
19
|
end
|
20
|
-
|
20
|
+
|
21
21
|
def page_images
|
22
22
|
urls = []
|
23
23
|
return urls if doc.blank?
|
24
24
|
doc.xpath("//img").each do |img|
|
25
25
|
next if img["src"].blank?
|
26
|
-
image = URI.escape(img["src"])
|
26
|
+
image = URI.escape(img["src"].strip)
|
27
27
|
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
28
28
|
urls << image
|
29
29
|
end
|
30
30
|
urls
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
33
|
def stylesheet_images
|
34
34
|
images = []
|
35
35
|
stylesheets.each do |stylesheet|
|
36
36
|
file = open(stylesheet)
|
37
37
|
css = file.string rescue IO.read(file)
|
38
|
-
|
38
|
+
|
39
39
|
images += css.scan(/url\((.*?)\)/).collect do |image_url|
|
40
40
|
image_url = URI.escape image_url[0]
|
41
41
|
if image_url.include?("data:image") and @include_css_data_images
|
@@ -48,7 +48,7 @@ module ImageScraper
|
|
48
48
|
end
|
49
49
|
images
|
50
50
|
end
|
51
|
-
|
51
|
+
|
52
52
|
def stylesheets
|
53
53
|
return [] if doc.blank?
|
54
54
|
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
data/test/test_image_scraper.rb
CHANGED
@@ -14,6 +14,14 @@ class TestImageScraper < Test::Unit::TestCase
|
|
14
14
|
assert_equal images, scraper.image_urls
|
15
15
|
end
|
16
16
|
|
17
|
+
should "return a list of images with whitespace stripped from the src" do
|
18
|
+
client = ImageScraper::Client.new("http://www.google.com")
|
19
|
+
html = IO.read(File.dirname(__FILE__)+"/resources/extra_whitespace.html")
|
20
|
+
client.doc = Nokogiri::HTML(html)
|
21
|
+
images = ["http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg","http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg"]
|
22
|
+
assert_equal images, client.image_urls
|
23
|
+
end
|
24
|
+
|
17
25
|
should "return list of all image urls on a web page with relative paths" do
|
18
26
|
images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
19
27
|
"//bits.wikimedia.org/images/wikimedia-button.png",
|
@@ -27,7 +35,7 @@ class TestImageScraper < Test::Unit::TestCase
|
|
27
35
|
domain = "http://test.com"
|
28
36
|
assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
|
29
37
|
end
|
30
|
-
|
38
|
+
|
31
39
|
should "return proper absolute url for a page and asset" do
|
32
40
|
assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
|
33
41
|
assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
|
@@ -39,30 +47,30 @@ class TestImageScraper < Test::Unit::TestCase
|
|
39
47
|
assert_equal "http://www.test.com/", ImageScraper::Util.absolute_url("http://www.test.com/")
|
40
48
|
assert_equal "http://www.test.com/123/test.html", ImageScraper::Util.absolute_url("http://www.test.com/123/test.html")
|
41
49
|
end
|
42
|
-
|
50
|
+
|
43
51
|
should "return images from a stylesheet" do
|
44
52
|
scraper = ImageScraper::Client.new("http://couponshack.com")
|
45
53
|
assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
|
46
54
|
end
|
47
|
-
|
55
|
+
|
48
56
|
should "strip quotes from a url" do
|
49
57
|
assert_equal "/images/test.png", ImageScraper::Util.strip_quotes("'/images/test.png'")
|
50
58
|
assert_equal "http://www.somsite.com/images/test.png", ImageScraper::Util.strip_quotes("'http://www.somsite.com/images/test.png'")
|
51
59
|
assert_equal "/images/test.png", ImageScraper::Util.strip_quotes('"/images/test.png"')
|
52
60
|
end
|
53
|
-
|
61
|
+
|
54
62
|
should "return domain section from a url" do
|
55
63
|
assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this.html")
|
56
64
|
assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this/")
|
57
65
|
assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
|
58
66
|
assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
|
59
67
|
end
|
60
|
-
|
68
|
+
|
61
69
|
should "return nil for doc if URL is invalid" do
|
62
70
|
scraper = ImageScraper::Client.new("couponshack.com")
|
63
71
|
assert scraper.doc.nil?
|
64
72
|
end
|
65
|
-
|
73
|
+
|
66
74
|
should "return empty arrays if URL is invalid" do
|
67
75
|
scraper = ImageScraper::Client.new("couponshack.com")
|
68
76
|
assert_equal [], scraper.image_urls
|
@@ -78,27 +86,27 @@ class TestImageScraper < Test::Unit::TestCase
|
|
78
86
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
|
79
87
|
assert_equal images, scraper.image_urls
|
80
88
|
end
|
81
|
-
|
89
|
+
|
82
90
|
should "Handle a page image with an unescaped url" do
|
83
91
|
scraper = ImageScraper::Client.new ''
|
84
92
|
scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
|
85
93
|
assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
|
86
|
-
end
|
87
|
-
|
94
|
+
end
|
95
|
+
|
88
96
|
should "Handle a stylesheet with an unescaped url" do
|
89
97
|
scraper = ImageScraper::Client.new ''
|
90
98
|
scraper.url = 'http://test.com'
|
91
99
|
scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
|
92
100
|
assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
|
93
|
-
end
|
94
|
-
|
101
|
+
end
|
102
|
+
|
95
103
|
should "Handle a stylesheet image with an unescaped url" do
|
96
104
|
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
|
97
105
|
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
|
98
|
-
end
|
99
|
-
|
106
|
+
end
|
107
|
+
|
100
108
|
should "Handle a stylesheet image with a relative url" do
|
101
109
|
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
|
102
110
|
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
|
103
|
-
end
|
111
|
+
end
|
104
112
|
end
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 6
|
8
|
+
- 7
|
9
9
|
segments_generated: true
|
10
|
-
version: 0.1.6
|
10
|
+
version: 0.1.7
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- John McAliley
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-01-02 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
185
185
|
requirements:
|
186
186
|
- - ">="
|
187
187
|
- !ruby/object:Gem::Version
|
188
|
-
hash: -
|
188
|
+
hash: -168406416917257246
|
189
189
|
segments:
|
190
190
|
- 0
|
191
191
|
segments_generated: true
|