image_scraper 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
-0.1.6
+0.1.7
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{image_scraper}
-  s.version = "0.1.6"
+  s.version = "0.1.7"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["John McAliley"]
-  s.date = %q{2011-12-22}
+  s.date = %q{2012-01-02}
   s.description = %q{Simple utility to pull image urls from web page}
   s.email = %q{john.mcaliley@gmail.com}
   s.extra_rdoc_files = [
@@ -1,7 +1,7 @@
 module ImageScraper
   class Client
     attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
-
+
     def initialize(url,options={})
       options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
       @url = URI.escape(url)
@@ -11,31 +11,31 @@ module ImageScraper
       html = open(@url).read rescue nil
       @doc = html ? Nokogiri::HTML(html) : nil
     end
-
+
     def image_urls
       images = page_images
       images += stylesheet_images if include_css_images
       images
     end
-
+
     def page_images
       urls = []
       return urls if doc.blank?
       doc.xpath("//img").each do |img|
         next if img["src"].blank?
-        image = URI.escape(img["src"])
+        image = URI.escape(img["src"].strip)
         image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
         urls << image
       end
       urls
     end
-
+
     def stylesheet_images
       images = []
       stylesheets.each do |stylesheet|
         file = open(stylesheet)
         css = file.string rescue IO.read(file)
-
+
         images += css.scan(/url\((.*?)\)/).collect do |image_url|
           image_url = URI.escape image_url[0]
           if image_url.include?("data:image") and @include_css_data_images
@@ -48,7 +48,7 @@ module ImageScraper
       end
       images
     end
-
+
     def stylesheets
       return [] if doc.blank?
       doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
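The functional change in this release is the .strip added in ImageScraper::Client#page_images: an img src padded with stray whitespace is now trimmed before URI.escape runs, so the escaped padding no longer ends up in the returned URL. A minimal sketch of the before/after behavior (the example.com markup and this snippet are illustrative only, not part of the gem; URI.escape is the same Ruby 1.8/1.9-era API the gem already uses):

    require "nokogiri"
    require "uri"

    # Hypothetical page whose src attribute is padded with whitespace
    html = %(<img src="  http://example.com/pic.jpg  ">)
    src  = Nokogiri::HTML(html).at_xpath("//img")["src"]

    URI.escape(src)        # 0.1.6 behavior => "%20%20http://example.com/pic.jpg%20%20"
    URI.escape(src.strip)  # 0.1.7 behavior => "http://example.com/pic.jpg"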
@@ -14,6 +14,14 @@ class TestImageScraper < Test::Unit::TestCase
     assert_equal images, scraper.image_urls
   end
 
+  should "return a list of images with whitespace stripped from the src" do
+    client = ImageScraper::Client.new("http://www.google.com")
+    html = IO.read(File.dirname(__FILE__)+"/resources/extra_whitespace.html")
+    client.doc = Nokogiri::HTML(html)
+    images = ["http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg","http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg"]
+    assert_equal images, client.image_urls
+  end
+
   should "return list of all image urls on a web page with relative paths" do
     images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
       "//bits.wikimedia.org/images/wikimedia-button.png",
@@ -27,7 +35,7 @@ class TestImageScraper < Test::Unit::TestCase
     domain = "http://test.com"
     assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
   end
-
+
   should "return proper absolute url for a page and asset" do
     assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
     assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
@@ -39,30 +47,30 @@ class TestImageScraper < Test::Unit::TestCase
     assert_equal "http://www.test.com/", ImageScraper::Util.absolute_url("http://www.test.com/")
     assert_equal "http://www.test.com/123/test.html", ImageScraper::Util.absolute_url("http://www.test.com/123/test.html")
   end
-
+
   should "return images from a stylesheet" do
     scraper = ImageScraper::Client.new("http://couponshack.com")
     assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
   end
-
+
   should "strip quotes from a url" do
     assert_equal "/images/test.png", ImageScraper::Util.strip_quotes("'/images/test.png'")
     assert_equal "http://www.somsite.com/images/test.png", ImageScraper::Util.strip_quotes("'http://www.somsite.com/images/test.png'")
     assert_equal "/images/test.png", ImageScraper::Util.strip_quotes('"/images/test.png"')
   end
-
+
   should "return domain section from a url" do
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this.html")
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this/")
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
     assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
   end
-
+
   should "return nil for doc if URL is invalid" do
     scraper = ImageScraper::Client.new("couponshack.com")
     assert scraper.doc.nil?
   end
-
+
   should "return empty arrays if URL is invalid" do
     scraper = ImageScraper::Client.new("couponshack.com")
     assert_equal [], scraper.image_urls
@@ -78,27 +86,27 @@ class TestImageScraper < Test::Unit::TestCase
     scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
     assert_equal images, scraper.image_urls
   end
-
+
   should "Handle a page image with an unescaped url" do
     scraper = ImageScraper::Client.new ''
     scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
     assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
-  end
-
+  end
+
   should "Handle a stylesheet with an unescaped url" do
     scraper = ImageScraper::Client.new ''
     scraper.url = 'http://test.com'
     scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
     assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
-  end
-
+  end
+
   should "Handle a stylesheet image with an unescaped url" do
     scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
     assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
-  end
-
+  end
+
   should "Handle a stylesheet image with a relative url" do
     scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
     assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
-  end
+  end
 end
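The new test reads test/resources/extra_whitespace.html, a fixture that is not included in this diff. A hypothetical reconstruction of what that fixture would need to look like for the assertion to pass, written as a Ruby heredoc (only the two Amazon URLs are taken from the test above; the surrounding markup and the exact padding are assumptions):

    # Assumed fixture content: two img tags whose src values carry the
    # leading/trailing whitespace that 0.1.7 now strips.
    EXTRA_WHITESPACE_HTML = <<-HTML
      <html><body>
        <img src="   http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg  ">
        <img src=" http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg ">
      </body></html>
    HTML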
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 6
+  - 7
   segments_generated: true
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - John McAliley
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-12-22 00:00:00 -05:00
+date: 2012-01-02 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: -3072759905091488701
+      hash: -168406416917257246
       segments:
       - 0
       segments_generated: true