image_scraper 0.1.6 → 0.1.7
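The functional change in this release is in ImageScraper::Client#page_images: each img src is now stripped of surrounding whitespace before it is escaped, backed by a new regression test; the rest of the diff is the version/date bump and whitespace cleanup. A minimal usage sketch of the client, assuming the gem and its dependencies are installed (the URL below is only a placeholder):

require 'image_scraper'

# Sketch: exercise the 0.1.7 fix. Pages whose <img> tags carry stray
# whitespace around the src value now yield clean URLs instead of ones
# padded with escaped spaces (%20).
scraper = ImageScraper::Client.new("http://www.example.com")
puts scraper.image_urls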

data/VERSION CHANGED
@@ -1 +1 @@
-0.1.6
+0.1.7
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{image_scraper}
-  s.version = "0.1.6"
+  s.version = "0.1.7"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["John McAliley"]
-  s.date = %q{2011-12-22}
+  s.date = %q{2012-01-02}
   s.description = %q{Simple utility to pull image urls from web page}
   s.email = %q{john.mcaliley@gmail.com}
   s.extra_rdoc_files = [
@@ -1,7 +1,7 @@
 module ImageScraper
   class Client
     attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
-
+
     def initialize(url,options={})
       options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
       @url = URI.escape(url)
@@ -11,31 +11,31 @@ module ImageScraper
       html = open(@url).read rescue nil
       @doc = html ? Nokogiri::HTML(html) : nil
     end
-
+
     def image_urls
      images = page_images
      images += stylesheet_images if include_css_images
      images
    end
-
+
    def page_images
      urls = []
      return urls if doc.blank?
      doc.xpath("//img").each do |img|
        next if img["src"].blank?
-        image = URI.escape(img["src"])
+        image = URI.escape(img["src"].strip)
        image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
        urls << image
      end
      urls
    end
-
+
    def stylesheet_images
      images = []
      stylesheets.each do |stylesheet|
        file = open(stylesheet)
        css = file.string rescue IO.read(file)
-
+
        images += css.scan(/url\((.*?)\)/).collect do |image_url|
          image_url = URI.escape image_url[0]
          if image_url.include?("data:image") and @include_css_data_images
@@ -48,7 +48,7 @@ module ImageScraper
      end
      images
    end
-
+
    def stylesheets
      return [] if doc.blank?
      doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
@@ -14,6 +14,14 @@ class TestImageScraper < Test::Unit::TestCase
     assert_equal images, scraper.image_urls
   end
 
+  should "return a list of images with whitespace stripped from the src" do
+    client = ImageScraper::Client.new("http://www.google.com")
+    html = IO.read(File.dirname(__FILE__)+"/resources/extra_whitespace.html")
+    client.doc = Nokogiri::HTML(html)
+    images = ["http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg","http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg"]
+    assert_equal images, client.image_urls
+  end
+
   should "return list of all image urls on a web page with relative paths" do
     images = ["//bits.wikimedia.org/skins-1.18/vector/images/search-ltr.png?303-4",
       "//bits.wikimedia.org/images/wikimedia-button.png",
@@ -27,7 +35,7 @@ class TestImageScraper < Test::Unit::TestCase
     domain = "http://test.com"
     assert_equal ["http://test.com/phoenix/testcentral.css","http://test.com/engine1/style.css"], ImageScraper::Client.new("http://test.com").stylesheets
   end
-
+
   should "return proper absolute url for a page and asset" do
     assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
     assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
@@ -39,30 +47,30 @@ class TestImageScraper < Test::Unit::TestCase
     assert_equal "http://www.test.com/", ImageScraper::Util.absolute_url("http://www.test.com/")
     assert_equal "http://www.test.com/123/test.html", ImageScraper::Util.absolute_url("http://www.test.com/123/test.html")
   end
-
+
   should "return images from a stylesheet" do
     scraper = ImageScraper::Client.new("http://couponshack.com")
     assert scraper.stylesheet_images.include? ("http://couponshack.com/images/bg.jpg")
   end
-
+
   should "strip quotes from a url" do
     assert_equal "/images/test.png", ImageScraper::Util.strip_quotes("'/images/test.png'")
     assert_equal "http://www.somsite.com/images/test.png", ImageScraper::Util.strip_quotes("'http://www.somsite.com/images/test.png'")
     assert_equal "/images/test.png", ImageScraper::Util.strip_quotes('"/images/test.png"')
   end
-
+
   should "return domain section from a url" do
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this.html")
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this/")
     assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
     assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
   end
-
+
   should "return nil for doc if URL is invalid" do
     scraper = ImageScraper::Client.new("couponshack.com")
     assert scraper.doc.nil?
   end
-
+
   should "return empty arrays if URL is invalid" do
     scraper = ImageScraper::Client.new("couponshack.com")
     assert_equal [], scraper.image_urls
@@ -78,27 +86,27 @@ class TestImageScraper < Test::Unit::TestCase
     scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard test image",:include_css_images=>false)
     assert_equal images, scraper.image_urls
   end
-
+
   should "Handle a page image with an unescaped url" do
     scraper = ImageScraper::Client.new ''
     scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
     assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
-  end
-
+  end
+
   should "Handle a stylesheet with an unescaped url" do
     scraper = ImageScraper::Client.new ''
     scraper.url = 'http://test.com'
     scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
     assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
-  end
-
+  end
+
   should "Handle a stylesheet image with an unescaped url" do
     scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
     assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
-  end
-
+  end
+
   should "Handle a stylesheet image with a relative url" do
     scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
     assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
-  end
+  end
 end
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 6
+  - 7
   segments_generated: true
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - John McAliley
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-12-22 00:00:00 -05:00
+date: 2012-01-02 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -185,7 +185,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: -3072759905091488701
+      hash: -168406416917257246
       segments:
       - 0
       segments_generated: true