image_scraper 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/image_scraper.gemspec +2 -2
- data/lib/image_scraper/client.rb +7 -7
- data/test/test_image_scraper.rb +22 -14
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.6
|
1
|
+
0.1.7
|
data/image_scraper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{image_scraper}
|
8
|
-
s.version = "0.1.6"
|
8
|
+
s.version = "0.1.7"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John McAliley"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2012-01-02}
|
13
13
|
s.description = %q{Simple utility to pull image urls from web page}
|
14
14
|
s.email = %q{john.mcaliley@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/image_scraper/client.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module ImageScraper
|
2
2
|
class Client
|
3
3
|
attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
|
4
|
-
|
4
|
+
|
5
5
|
def initialize(url,options={})
|
6
6
|
options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
|
7
7
|
@url = URI.escape(url)
|
@@ -11,31 +11,31 @@ module ImageScraper
|
|
11
11
|
html = open(@url).read rescue nil
|
12
12
|
@doc = html ? Nokogiri::HTML(html) : nil
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
def image_urls
|
16
16
|
images = page_images
|
17
17
|
images += stylesheet_images if include_css_images
|
18
18
|
images
|
19
19
|
end
|
20
|
-
|
20
|
+
|
21
21
|
def page_images
|
22
22
|
urls = []
|
23
23
|
return urls if doc.blank?
|
24
24
|
doc.xpath("//img").each do |img|
|
25
25
|
next if img["src"].blank?
|
26
|
-
image = URI.escape(img["src"])
|
26
|
+
image = URI.escape(img["src"].strip)
|
27
27
|
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
28
28
|
urls << image
|
29
29
|
end
|
30
30
|
urls
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
33
|
def stylesheet_images
|
34
34
|
images = []
|
35
35
|
stylesheets.each do |stylesheet|
|
36
36
|
file = open(stylesheet)
|
37
37
|
css = file.string rescue IO.read(file)
|
38
|
-
|
38
|
+
|
39
39
|
images += css.scan(/url\((.*?)\)/).collect do |image_url|
|
40
40
|
image_url = URI.escape image_url[0]
|
41
41
|
if image_url.include?("data:image") and @include_css_data_images
|
@@ -48,7 +48,7 @@ module ImageScraper
|
|
48
48
|
end
|
49
49
|
images
|
50
50
|
end
|
51
|
-
|
51
|
+
|
52
52
|
def stylesheets
|
53
53
|
return [] if doc.blank?
|
54
54
|
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
data/test/test_image_scraper.rb
CHANGED
@@ -14,6 +14,14 @@ class TestImageScraper < Test::Unit::TestCase
|
|
14
14
|
assert_equal images, scraper.image_urls
|
15
15
|
end
|
16
16
|
|
17
|
+
should "return a list of images with whitespace stripped from the src" do
|
18
|
+
client = ImageScraper::Client.new("http://www.google.com")
|
19
|
+
html = IO.read(File.dirname(__FILE__)+"/resources/extra_whitespace.html")
|
20
|
+
client.doc = Nokogiri::HTML(html)
|
21
|
+
images = ["http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg","http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg"]
|
22
|
+
assert_equal images, client.image_urls
|
23
|
+
end
|
24
|
+
|
17
25
|
should "return list of all image urls on a web page with relative paths" do
|
18
26
|
images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
19
27
|
"//bits.wikimedia.org/images/wikimedia-button.png",
|
@@ -27,7 +35,7 @@ class TestImageScraper < Test::Unit::TestCase
|
|
27
35
|
domain = "http://test.com"
|
28
36
|
assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
|
29
37
|
end
|
30
|
-
|
38
|
+
|
31
39
|
should "return proper absolute url for a page and asset" do
|
32
40
|
assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
|
33
41
|
assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
|
@@ -39,30 +47,30 @@ class TestImageScraper < Test::Unit::TestCase
|
|
39
47
|
assert_equal "http://www.test.com/", ImageScraper::Util.absolute_url("http://www.test.com/")
|
40
48
|
assert_equal "http://www.test.com/123/test.html", ImageScraper::Util.absolute_url("http://www.test.com/123/test.html")
|
41
49
|
end
|
42
|
-
|
50
|
+
|
43
51
|
should "return images from a stylesheet" do
|
44
52
|
scraper = ImageScraper::Client.new("http://couponshack.com")
|
45
53
|
assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
|
46
54
|
end
|
47
|
-
|
55
|
+
|
48
56
|
should "strip quotes from a url" do
|
49
57
|
assert_equal "/images/test.png", ImageScraper::Util.strip_quotes("'/images/test.png'")
|
50
58
|
assert_equal "http://www.somsite.com/images/test.png", ImageScraper::Util.strip_quotes("'http://www.somsite.com/images/test.png'")
|
51
59
|
assert_equal "/images/test.png", ImageScraper::Util.strip_quotes('"/images/test.png"')
|
52
60
|
end
|
53
|
-
|
61
|
+
|
54
62
|
should "return domain section from a url" do
|
55
63
|
assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this.html")
|
56
64
|
assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this/")
|
57
65
|
assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
|
58
66
|
assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
|
59
67
|
end
|
60
|
-
|
68
|
+
|
61
69
|
should "return nil for doc if URL is invalid" do
|
62
70
|
scraper = ImageScraper::Client.new("couponshack.com")
|
63
71
|
assert scraper.doc.nil?
|
64
72
|
end
|
65
|
-
|
73
|
+
|
66
74
|
should "return empty arrays if URL is invalid" do
|
67
75
|
scraper = ImageScraper::Client.new("couponshack.com")
|
68
76
|
assert_equal [], scraper.image_urls
|
@@ -78,27 +86,27 @@ class TestImageScraper < Test::Unit::TestCase
|
|
78
86
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
|
79
87
|
assert_equal images, scraper.image_urls
|
80
88
|
end
|
81
|
-
|
89
|
+
|
82
90
|
should "Handle a page image with an unescaped url" do
|
83
91
|
scraper = ImageScraper::Client.new ''
|
84
92
|
scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
|
85
93
|
assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
|
86
|
-
end
|
87
|
-
|
94
|
+
end
|
95
|
+
|
88
96
|
should "Handle a stylesheet with an unescaped url" do
|
89
97
|
scraper = ImageScraper::Client.new ''
|
90
98
|
scraper.url = 'http://test.com'
|
91
99
|
scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
|
92
100
|
assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
|
93
|
-
end
|
94
|
-
|
101
|
+
end
|
102
|
+
|
95
103
|
should "Handle a stylesheet image with an unescaped url" do
|
96
104
|
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
|
97
105
|
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
|
98
|
-
end
|
99
|
-
|
106
|
+
end
|
107
|
+
|
100
108
|
should "Handle a stylesheet image with a relative url" do
|
101
109
|
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
|
102
110
|
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
|
103
|
-
end
|
111
|
+
end
|
104
112
|
end
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 6
|
8
|
+
- 7
|
9
9
|
segments_generated: true
|
10
|
-
version: 0.1.6
|
10
|
+
version: 0.1.7
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- John McAliley
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2012-01-02 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
185
185
|
requirements:
|
186
186
|
- - ">="
|
187
187
|
- !ruby/object:Gem::Version
|
188
|
-
hash: -
|
188
|
+
hash: -168406416917257246
|
189
189
|
segments:
|
190
190
|
- 0
|
191
191
|
segments_generated: true
|