image_scraper 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/image_scraper.gemspec +2 -2
- data/lib/image_scraper/client.rb +3 -2
- data/lib/image_scraper/util.rb +13 -5
- data/test/test_image_scraper.rb +15 -9
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.5
|
1
|
+
0.1.6
|
data/image_scraper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{image_scraper}
|
8
|
-
s.version = "0.1.5"
|
8
|
+
s.version = "0.1.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John McAliley"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-12-22}
|
13
13
|
s.description = %q{Simple utility to pull image urls from web page}
|
14
14
|
s.email = %q{john.mcaliley@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/image_scraper/client.rb
CHANGED
@@ -22,6 +22,7 @@ module ImageScraper
|
|
22
22
|
urls = []
|
23
23
|
return urls if doc.blank?
|
24
24
|
doc.xpath("//img").each do |img|
|
25
|
+
next if img["src"].blank?
|
25
26
|
image = URI.escape(img["src"])
|
26
27
|
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
27
28
|
urls << image
|
@@ -41,7 +42,7 @@ module ImageScraper
|
|
41
42
|
image_url
|
42
43
|
else
|
43
44
|
image_url = ImageScraper::Util.strip_quotes(image_url)
|
44
|
-
@convert_to_absolute_url ? ImageScraper::Util.absolute_url(
|
45
|
+
@convert_to_absolute_url ? ImageScraper::Util.absolute_url(stylesheet, image_url) : image_url
|
45
46
|
end
|
46
47
|
end
|
47
48
|
end
|
@@ -51,7 +52,7 @@ module ImageScraper
|
|
51
52
|
def stylesheets
|
52
53
|
return [] if doc.blank?
|
53
54
|
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
54
|
-
|
55
|
+
ImageScraper::Util.absolute_url url, URI.escape(stylesheet['href'])
|
55
56
|
end
|
56
57
|
end
|
57
58
|
end
|
data/lib/image_scraper/util.rb
CHANGED
@@ -1,10 +1,18 @@
|
|
1
1
|
module ImageScraper
|
2
2
|
module Util
|
3
3
|
def self.absolute_url(url,asset=nil)
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
# TODO - what happens when an index redirect occurs?
|
5
|
+
# Example: 'http://example.com/about' specified as url
|
6
|
+
# 'style.css' specified as asset
|
7
|
+
# url redirects to 'http://example.com/about/'
|
8
|
+
# and serves http://example.com/about/index.html
|
9
|
+
# which then links to the relative asset path 'style.css'
|
10
|
+
# based on original url (http://example.com/about),
|
11
|
+
# self.absolute_url gives
|
12
|
+
# 'http://example.com/style.css
|
13
|
+
# but should get:
|
14
|
+
# 'http://example.com/about/style.css
|
15
|
+
URI.parse(url).merge(URI.parse asset.to_s).to_s
|
8
16
|
end
|
9
17
|
|
10
18
|
def self.domain(url)
|
@@ -21,4 +29,4 @@ module ImageScraper
|
|
21
29
|
image_url.gsub("'","").gsub('"','')
|
22
30
|
end
|
23
31
|
end
|
24
|
-
end
|
32
|
+
end
|
data/test/test_image_scraper.rb
CHANGED
@@ -7,9 +7,9 @@ require 'helper'
|
|
7
7
|
|
8
8
|
class TestImageScraper < Test::Unit::TestCase
|
9
9
|
should "return list of all image urls on a web page with absolute paths" do
|
10
|
-
images = ["http://
|
11
|
-
"http://
|
12
|
-
"http://
|
10
|
+
images = ["http://bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
11
|
+
"http://bits.wikimedia.org/images/wikimedia-button.png",
|
12
|
+
"http://bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
13
13
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
|
14
14
|
assert_equal images, scraper.image_urls
|
15
15
|
end
|
@@ -72,22 +72,23 @@ class TestImageScraper < Test::Unit::TestCase
|
|
72
72
|
end
|
73
73
|
|
74
74
|
should "Handle a URL with unescaped spaces" do
|
75
|
-
images = ["http://
|
76
|
-
"http://
|
77
|
-
"http://
|
75
|
+
images = ["http://bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
|
76
|
+
"http://bits.wikimedia.org/images/wikimedia-button.png",
|
77
|
+
"http://bits.wikimedia.org/skins-1.18/common/images/poweredby_mediawiki_88x31.png"]
|
78
78
|
scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
|
79
79
|
assert_equal images, scraper.image_urls
|
80
80
|
end
|
81
81
|
|
82
82
|
should "Handle a page image with an unescaped url" do
|
83
83
|
scraper = ImageScraper::Client.new ''
|
84
|
-
scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'")
|
84
|
+
scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
|
85
85
|
assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
|
86
86
|
end
|
87
87
|
|
88
88
|
should "Handle a stylesheet with an unescaped url" do
|
89
89
|
scraper = ImageScraper::Client.new ''
|
90
|
-
scraper.
|
90
|
+
scraper.url = 'http://test.com'
|
91
|
+
scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
|
91
92
|
assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
|
92
93
|
end
|
93
94
|
|
@@ -95,4 +96,9 @@ class TestImageScraper < Test::Unit::TestCase
|
|
95
96
|
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
|
96
97
|
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
|
97
98
|
end
|
98
|
-
|
99
|
+
|
100
|
+
should "Handle a stylesheet image with a relative url" do
|
101
|
+
scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
|
102
|
+
assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
|
103
|
+
end
|
104
|
+
end
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 5
|
8
|
+
- 6
|
9
9
|
segments_generated: true
|
10
|
-
version: 0.1.5
|
10
|
+
version: 0.1.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- John McAliley
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-12-22 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
185
185
|
requirements:
|
186
186
|
- - ">="
|
187
187
|
- !ruby/object:Gem::Version
|
188
|
-
hash: -
|
188
|
+
hash: -3072759905091488701
|
189
189
|
segments:
|
190
190
|
- 0
|
191
191
|
segments_generated: true
|