image_scraper 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/image_scraper.gemspec +2 -2
- data/lib/image_scraper/client.rb +3 -2
- data/lib/image_scraper/util.rb +13 -5
- data/test/test_image_scraper.rb +15 -9
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.5
|
1
|
+
0.1.6
|
data/image_scraper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{image_scraper}
|
8
|
-
s.version = "0.1.5"
|
8
|
+
s.version = "0.1.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John McAliley"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-12-22}
|
13
13
|
s.description = %q{Simple utility to pull image urls from web page}
|
14
14
|
s.email = %q{john.mcaliley@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/image_scraper/client.rb
CHANGED
@@ -22,6 +22,7 @@ module ImageScraper
|
|
22
22
|
urls = []
|
23
23
|
return urls if doc.blank?
|
24
24
|
doc.xpath("//img").each do |img|
|
25
|
+
next if img["src"].blank?
|
25
26
|
image = URI.escape(img["src"])
|
26
27
|
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
27
28
|
urls << image
|
@@ -41,7 +42,7 @@ module ImageScraper
|
|
41
42
|
image_url
|
42
43
|
else
|
43
44
|
image_url = ImageScraper::Util.strip_quotes(image_url)
|
44
|
-
@convert_to_absolute_url ? ImageScraper::Util.absolute_url(
|
45
|
+
@convert_to_absolute_url ? ImageScraper::Util.absolute_url(stylesheet, image_url) : image_url
|
45
46
|
end
|
46
47
|
end
|
47
48
|
end
|
@@ -51,7 +52,7 @@ module ImageScraper
|
|
51
52
|
def stylesheets
|
52
53
|
return [] if doc.blank?
|
53
54
|
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
54
|
-
|
55
|
+
ImageScraper::Util.absolute_url url, URI.escape(stylesheet['href'])
|
55
56
|
end
|
56
57
|
end
|
57
58
|
end
|
data/lib/image_scraper/util.rb
CHANGED
@@ -1,10 +1,18 @@
|
|
1
1
|
module ImageScraper
|
2
2
|
module Util
|
3
3
|
def self.absolute_url(url,asset=nil)
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
# TODO - what happens when an index redirect occurs?
|
5
|
+
# Example: 'http://example.com/about' specified as url
|
6
|
+
# 'style.css' specified as asset
|
7
|
+
# url redirects to 'http://example.com/about/'
|
8
|
+
# and serves http://example.com/about/index.html
|
9
|
+
# which then links to the relative asset path 'style.css'
|
10
|
+
# based on original url (http://example.com/about),
|
11
|
+
# self.absolute_url gives
|
12
|
+
# 'http://example.com/style.css
|
13
|
+
# but should get:
|
14
|
+
# 'http://example.com/about/style.css
|
15
|
+
URI.parse(url).merge(URI.parse asset.to_s).to_s
|
8
16
|
end
|
9
17
|
|
10
18
|
def self.domain(url)
|
@@ -21,4 +29,4 @@ module ImageScraper
|
|
21
29
|
image_url.gsub("'","").gsub('"','')
|
22
30
|
end
|
23
31
|
end
|
24
|
-
end
|
32
|
+
end
|
data/test/test_image_scraper.rb
CHANGED
@@ -7,9 +7,9 @@ require 'helper'
|
|
7
7
|
|
8
8
|
class TestImageScraper < Test::Unit::TestCase
|
9
9
|
should "return list of all image urls on a web page with absolute paths" do
|
10
|
-
images = ["http://
|
11
|
-
"http://
|
12
|
-
"http://
|
10
|
+
images = ["http://bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
11
|
+
"http://bits.wikimedia.org/images/wikimedia-button.png",
|
12
|
+
"http://bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
13
13
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
|
14
14
|
assert_equal images, scraper.image_urls
|
15
15
|
end
|
@@ -72,22 +72,23 @@ class TestImageScraper < Test::Unit::TestCase
|
|
72
72
|
end
|
73
73
|
|
74
74
|
should "Handle a URL with unescaped spaces" do
|
75
|
-
images = ["http://
|
76
|
-
"http://
|
77
|
-
"http://
|
75
|
+
images = ["http://bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
76
|
+
"http://bits.wikimedia.org/images/wikimedia-button.png",
|
77
|
+
"http://bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
78
78
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
|
79
79
|
assert_equal images, scraper.image_urls
|
80
80
|
end
|
81
81
|
|
82
82
|
should "Handle a page image with an unescaped url" do
|
83
83
|
scraper = ImageScraper::Client.new ''
|
84
|
-
scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'")
|
84
|
+
scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
|
85
85
|
assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
|
86
86
|
end
|
87
87
|
|
88
88
|
should "Handle a stylesheet with an unescaped url" do
|
89
89
|
scraper = ImageScraper::Client.new ''
|
90
|
-
scraper.
|
90
|
+
scraper.url = 'http://test.com'
|
91
|
+
scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
|
91
92
|
assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
|
92
93
|
end
|
93
94
|
|
@@ -95,4 +96,9 @@ class TestImageScraper < Test::Unit::TestCase
|
|
95
96
|
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
|
96
97
|
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
|
97
98
|
end
|
98
|
-
|
99
|
+
|
100
|
+
should "Handle a stylesheet image with a relative url" do
|
101
|
+
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
|
102
|
+
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
|
103
|
+
end
|
104
|
+
end
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 5
|
8
|
+
- 6
|
9
9
|
segments_generated: true
|
10
|
-
version: 0.1.5
|
10
|
+
version: 0.1.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- John McAliley
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-12-22 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
185
185
|
requirements:
|
186
186
|
- - ">="
|
187
187
|
- !ruby/object:Gem::Version
|
188
|
-
hash: -
|
188
|
+
hash: -3072759905091488701
|
189
189
|
segments:
|
190
190
|
- 0
|
191
191
|
segments_generated: true
|