image_scraper 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -3,6 +3,8 @@ source "http://rubygems.org"
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
5
  gem "nokogiri"
6
+ gem "css_parser"
7
+ gem "rails"
6
8
  # Add dependencies to develop your gem here.
7
9
  # Include everything needed to run rake, tests, features, etc.
8
10
  group :development do
@@ -1,22 +1,86 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
+ abstract (1.0.0)
5
+ actionmailer (3.0.7)
6
+ actionpack (= 3.0.7)
7
+ mail (~> 2.2.15)
8
+ actionpack (3.0.7)
9
+ activemodel (= 3.0.7)
10
+ activesupport (= 3.0.7)
11
+ builder (~> 2.1.2)
12
+ erubis (~> 2.6.6)
13
+ i18n (~> 0.5.0)
14
+ rack (~> 1.2.1)
15
+ rack-mount (~> 0.6.14)
16
+ rack-test (~> 0.5.7)
17
+ tzinfo (~> 0.3.23)
18
+ activemodel (3.0.7)
19
+ activesupport (= 3.0.7)
20
+ builder (~> 2.1.2)
21
+ i18n (~> 0.5.0)
22
+ activerecord (3.0.7)
23
+ activemodel (= 3.0.7)
24
+ activesupport (= 3.0.7)
25
+ arel (~> 2.0.2)
26
+ tzinfo (~> 0.3.23)
27
+ activeresource (3.0.7)
28
+ activemodel (= 3.0.7)
29
+ activesupport (= 3.0.7)
30
+ activesupport (3.0.7)
31
+ arel (2.0.10)
32
+ builder (2.1.2)
33
+ css_parser (1.1.9)
34
+ erubis (2.6.6)
35
+ abstract (>= 1.0.0)
4
36
  git (1.2.5)
37
+ i18n (0.5.0)
5
38
  jeweler (1.5.2)
6
39
  bundler (~> 1.0.0)
7
40
  git (>= 1.2.5)
8
41
  rake
42
+ mail (2.2.19)
43
+ activesupport (>= 2.3.6)
44
+ i18n (>= 0.4.0)
45
+ mime-types (~> 1.16)
46
+ treetop (~> 1.4.8)
47
+ mime-types (1.16)
9
48
  nokogiri (1.4.4)
49
+ polyglot (0.3.1)
50
+ rack (1.2.3)
51
+ rack-mount (0.6.14)
52
+ rack (>= 1.0.0)
53
+ rack-test (0.5.7)
54
+ rack (>= 1.0)
55
+ rails (3.0.7)
56
+ actionmailer (= 3.0.7)
57
+ actionpack (= 3.0.7)
58
+ activerecord (= 3.0.7)
59
+ activeresource (= 3.0.7)
60
+ activesupport (= 3.0.7)
61
+ bundler (~> 1.0)
62
+ railties (= 3.0.7)
63
+ railties (3.0.7)
64
+ actionpack (= 3.0.7)
65
+ activesupport (= 3.0.7)
66
+ rake (>= 0.8.7)
67
+ thor (~> 0.14.4)
10
68
  rake (0.9.0)
11
69
  rcov (0.9.9)
12
70
  shoulda (2.11.3)
71
+ thor (0.14.6)
72
+ treetop (1.4.9)
73
+ polyglot (>= 0.3.1)
74
+ tzinfo (0.3.27)
13
75
 
14
76
  PLATFORMS
15
77
  ruby
16
78
 
17
79
  DEPENDENCIES
18
80
  bundler (~> 1.0.0)
81
+ css_parser
19
82
  jeweler (~> 1.5.2)
20
83
  nokogiri
84
+ rails
21
85
  rcov
22
86
  shoulda
data/README.md CHANGED
@@ -14,17 +14,23 @@ Install w/ Bundler
14
14
 
15
15
  ## USAGE
16
16
 
17
- Pull all the image URLs from a specified URL and convert all images URLs to absolute paths
17
+ Initialize the image scraper client
18
18
 
19
- ImageScraper.image_urls("http://www.rubygems.org")
19
+ image_scraper = ImageScraper::Client.new("http://www.rubygems.org")
20
20
 
21
- Pull all images URLs from specified URL and use relative URLs
21
+ You can also pass an options hash to the client when you initialize it:
22
22
 
23
- ImageScraper.image_urls("http://www.rubygems.org",false)
23
+ image_scraper = ImageScraper::Client.new("http://www.rubygems.org", options)
24
+ # OPTIONS - If you don't pass an option, :convert_to_absolute_url and :include_css_images default to true; :include_css_data_images defaults to false
25
+ # :convert_to_absolute_url - If there are relative image URLs, it will convert them to absolute URLs.
26
+ # :include_css_images - If there are stylesheets on the page, it will pull images out of the stylesheet. For example: background: url(/images/some-image.png).
27
+ # :include_css_data_images - Will include data images from CSS. For example: data:image/gif;base64,R0lGODlhEAAOALMAAOazToeH............
28
+
29
+ Get the images from the url specified when you initialized the client:
24
30
 
25
- ## TODO
31
+ image_scraper.image_urls
26
32
 
27
- 1. Parse CSS files for images
33
+ This will return an array of strings.
28
34
 
29
35
  ## Contributing to image_scraper
30
36
 
data/Rakefile CHANGED
@@ -20,6 +20,7 @@ Jeweler::Tasks.new do |gem|
20
20
  gem.email = "john.mcaliley@gmail.com"
21
21
  gem.authors = ["John McAliley"]
22
22
  gem.add_dependency "nokogiri"
23
+ gem.add_dependency "css_parser"
23
24
  end
24
25
  Jeweler::RubygemsDotOrgTasks.new
25
26
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.1.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{image_scraper}
8
- s.version = "0.0.2"
8
+ s.version = "0.1.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John McAliley"]
12
- s.date = %q{2011-05-27}
12
+ s.date = %q{2011-06-02}
13
13
  s.description = %q{Simple utility to pull image urls from web page}
14
14
  s.email = %q{john.mcaliley@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -27,6 +27,7 @@ Gem::Specification.new do |s|
27
27
  "image_scraper.gemspec",
28
28
  "lib/image_scraper.rb",
29
29
  "test/helper.rb",
30
+ "test/resources/stylesheet_test.html",
30
31
  "test/test_image_scraper.rb"
31
32
  ]
32
33
  s.homepage = %q{http://github.com/charlotte-ruby/image_scraper}
@@ -45,26 +46,35 @@ Gem::Specification.new do |s|
45
46
 
46
47
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
47
48
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
49
+ s.add_runtime_dependency(%q<css_parser>, [">= 0"])
50
+ s.add_runtime_dependency(%q<rails>, [">= 0"])
48
51
  s.add_development_dependency(%q<shoulda>, [">= 0"])
49
52
  s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
50
53
  s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
51
54
  s.add_development_dependency(%q<rcov>, [">= 0"])
52
55
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
56
+ s.add_runtime_dependency(%q<css_parser>, [">= 0"])
53
57
  else
54
58
  s.add_dependency(%q<nokogiri>, [">= 0"])
59
+ s.add_dependency(%q<css_parser>, [">= 0"])
60
+ s.add_dependency(%q<rails>, [">= 0"])
55
61
  s.add_dependency(%q<shoulda>, [">= 0"])
56
62
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
57
63
  s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
58
64
  s.add_dependency(%q<rcov>, [">= 0"])
59
65
  s.add_dependency(%q<nokogiri>, [">= 0"])
66
+ s.add_dependency(%q<css_parser>, [">= 0"])
60
67
  end
61
68
  else
62
69
  s.add_dependency(%q<nokogiri>, [">= 0"])
70
+ s.add_dependency(%q<css_parser>, [">= 0"])
71
+ s.add_dependency(%q<rails>, [">= 0"])
63
72
  s.add_dependency(%q<shoulda>, [">= 0"])
64
73
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
65
74
  s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
66
75
  s.add_dependency(%q<rcov>, [">= 0"])
67
76
  s.add_dependency(%q<nokogiri>, [">= 0"])
77
+ s.add_dependency(%q<css_parser>, [">= 0"])
68
78
  end
69
79
  end
70
80
 
@@ -1,20 +1,80 @@
1
+ require 'pp'
2
+ require 'rails'
1
3
  require 'open-uri'
2
4
  require 'nokogiri'
3
5
 
4
6
  module ImageScraper
5
- class Railtie < Rails::Railtie
7
+ class Client
8
+ attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
9
+
10
+ def initialize(url,options={})
11
+ options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
12
+ @url = url
13
+ @convert_to_absolute_url = options[:convert_to_absolute_url]
14
+ @include_css_images = options[:include_css_images]
15
+ @include_css_data_images = options[:include_css_data_images]
16
+ @doc = Nokogiri::HTML(open url)
17
+ end
18
+
19
+ def image_urls
20
+ images = page_images
21
+ images += stylesheet_images if include_css_images
22
+ images
23
+ end
24
+
25
+ def page_images
26
+ urls = []
27
+ doc.xpath("//img").each do |img|
28
+ image = img["src"]
29
+ image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
30
+ urls << image
31
+ end
32
+ urls
33
+ end
34
+
35
+ def stylesheet_images
36
+ images = []
37
+ stylesheets.each do |stylesheet|
38
+ file = open(stylesheet)
39
+ css = file.string rescue IO.read(file)
40
+
41
+ images += css.scan(/url\((.*?)\)/).collect do |image_url|
42
+ if image_url.include?("data:image") and @include_css_data_images
43
+ image_url[0]
44
+ else
45
+ @convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url[0]) : image_url
46
+ end
47
+ end
48
+ end
49
+ images
50
+ end
51
+
52
+ def stylesheets
53
+ doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
54
+ ImageScraper::Util.absolute_url(url,stylesheet['href'])
55
+ end
56
+ end
6
57
  end
7
-
8
- def self.image_urls(url, convert_to_absolute_url=true)
9
- uri = URI.parse(url)
10
- domain = "#{uri.scheme}://#{uri.host}"
11
- doc = Nokogiri::HTML(open url)
12
- urls = []
13
- doc.xpath("//img").each do |img|
14
- image = img["src"]
15
- image = domain + image if convert_to_absolute_url and !image.include?("://")
16
- urls << image
17
- end
18
- urls
58
+
59
+ module Util
60
+ def self.absolute_url(url,asset=nil)
61
+ return domain(url) + path(url) if asset.nil? and asset.include("://")
62
+ return asset if asset.include?("://")
63
+ return domain(url)+asset if asset[0]=="/"
64
+ return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
65
+ end
66
+
67
+ def self.domain(url)
68
+ uri = URI.parse(url)
69
+ "#{uri.scheme}://#{uri.host}"
70
+ end
71
+
72
+ def self.path(url)
73
+ uri = URI.parse(url)
74
+ uri.path
75
+ end
76
+ end
77
+
78
+ class Railtie < Rails::Railtie
19
79
  end
20
80
  end
@@ -0,0 +1,17 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>stylesheet_test</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="John McAliley">
10
+ <link rel="stylesheet" href="/css/master.css" type="text/css" media="screen" title="no title" charset="utf-8">
11
+ <link rel="stylesheet" href="/css/master2.css" type="text/css" media="screen" title="no title" charset="utf-8">
12
+ <!-- Date: 2011-06-01 -->
13
+ </head>
14
+ <body>
15
+
16
+ </body>
17
+ </html>
@@ -1,13 +1,47 @@
1
+ require 'pp'
1
2
  require 'helper'
2
3
 
4
+
5
+ #TODO: these tests will not work forever. Try to test against a static web page instead of external URLs
3
6
  class TestImageScraper < Test::Unit::TestCase
4
7
  should "return list of all image urls on a web page with absolute paths" do
5
- images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png", "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png", "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-2", "http://en.wikipedia.org/images/wikimedia-button.png", "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
6
- assert_equal ImageScraper.image_urls("http://en.wikipedia.org/wiki/Standard_test_image"), images
8
+ images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png",
9
+ "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png",
10
+ "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-2",
11
+ "http://en.wikipedia.org/images/wikimedia-button.png",
12
+ "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
13
+ scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
14
+ assert_equal images, scraper.image_urls
7
15
  end
8
-
16
+
9
17
  should "return list of all image urls on a web page with relative paths" do
10
- images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png","http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png","http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-2","/images/wikimedia-button.png","http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
11
- assert_equal ImageScraper.image_urls("http://en.wikipedia.org/wiki/Standard_test_image",false), images
12
- end
13
- end
18
+ images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png",
19
+ "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png",
20
+ "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-2",
21
+ "/images/wikimedia-button.png",
22
+ "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
23
+ scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:convert_to_absolute_url=>false,:include_css_images=>false)
24
+ assert_equal images, scraper.image_urls
25
+ end
26
+
27
+ should "return list of stylesheets contained in html page (relative path)" do
28
+ doc = Nokogiri::HTML(IO.read(File.dirname(__FILE__)+"/resources/stylesheet_test.html"))
29
+ domain = "http://test.com"
30
+ assert_equal ["http://test.com/phoenix/testcentral.css"], ImageScraper::Client.new("http://test.com").stylesheets
31
+ end
32
+
33
+ should "return proper absolute url for a page and asset" do
34
+ assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
35
+ assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
36
+ assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","/images/image.gif")
37
+ assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","image.gif")
38
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","/images/image.gif")
39
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","images/image.gif")
40
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","/images/image.gif")
41
+ end
42
+
43
+ should "return images from a stylesheet" do
44
+ scraper = ImageScraper::Client.new("http://local.couponshack.com")
45
+ assert scraper.stylesheet_images.include? ("http://local.couponshack.com/images/bg.png")
46
+ end
47
+ end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
+ - 1
7
8
  - 0
8
- - 2
9
- version: 0.0.2
9
+ version: 0.1.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - John McAliley
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-27 00:00:00 -04:00
17
+ date: 2011-06-02 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -31,7 +31,7 @@ dependencies:
31
31
  prerelease: false
32
32
  version_requirements: *id001
33
33
  - !ruby/object:Gem::Dependency
34
- name: shoulda
34
+ name: css_parser
35
35
  requirement: &id002 !ruby/object:Gem::Requirement
36
36
  none: false
37
37
  requirements:
@@ -40,12 +40,38 @@ dependencies:
40
40
  segments:
41
41
  - 0
42
42
  version: "0"
43
- type: :development
43
+ type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: *id002
46
46
  - !ruby/object:Gem::Dependency
47
- name: bundler
47
+ name: rails
48
48
  requirement: &id003 !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ type: :runtime
57
+ prerelease: false
58
+ version_requirements: *id003
59
+ - !ruby/object:Gem::Dependency
60
+ name: shoulda
61
+ requirement: &id004 !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ segments:
67
+ - 0
68
+ version: "0"
69
+ type: :development
70
+ prerelease: false
71
+ version_requirements: *id004
72
+ - !ruby/object:Gem::Dependency
73
+ name: bundler
74
+ requirement: &id005 !ruby/object:Gem::Requirement
49
75
  none: false
50
76
  requirements:
51
77
  - - ~>
@@ -57,10 +83,10 @@ dependencies:
57
83
  version: 1.0.0
58
84
  type: :development
59
85
  prerelease: false
60
- version_requirements: *id003
86
+ version_requirements: *id005
61
87
  - !ruby/object:Gem::Dependency
62
88
  name: jeweler
63
- requirement: &id004 !ruby/object:Gem::Requirement
89
+ requirement: &id006 !ruby/object:Gem::Requirement
64
90
  none: false
65
91
  requirements:
66
92
  - - ~>
@@ -72,10 +98,10 @@ dependencies:
72
98
  version: 1.5.2
73
99
  type: :development
74
100
  prerelease: false
75
- version_requirements: *id004
101
+ version_requirements: *id006
76
102
  - !ruby/object:Gem::Dependency
77
103
  name: rcov
78
- requirement: &id005 !ruby/object:Gem::Requirement
104
+ requirement: &id007 !ruby/object:Gem::Requirement
79
105
  none: false
80
106
  requirements:
81
107
  - - ">="
@@ -85,10 +111,10 @@ dependencies:
85
111
  version: "0"
86
112
  type: :development
87
113
  prerelease: false
88
- version_requirements: *id005
114
+ version_requirements: *id007
89
115
  - !ruby/object:Gem::Dependency
90
116
  name: nokogiri
91
- requirement: &id006 !ruby/object:Gem::Requirement
117
+ requirement: &id008 !ruby/object:Gem::Requirement
92
118
  none: false
93
119
  requirements:
94
120
  - - ">="
@@ -98,7 +124,20 @@ dependencies:
98
124
  version: "0"
99
125
  type: :runtime
100
126
  prerelease: false
101
- version_requirements: *id006
127
+ version_requirements: *id008
128
+ - !ruby/object:Gem::Dependency
129
+ name: css_parser
130
+ requirement: &id009 !ruby/object:Gem::Requirement
131
+ none: false
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ segments:
136
+ - 0
137
+ version: "0"
138
+ type: :runtime
139
+ prerelease: false
140
+ version_requirements: *id009
102
141
  description: Simple utility to pull image urls from web page
103
142
  email: john.mcaliley@gmail.com
104
143
  executables: []
@@ -119,6 +158,7 @@ files:
119
158
  - image_scraper.gemspec
120
159
  - lib/image_scraper.rb
121
160
  - test/helper.rb
161
+ - test/resources/stylesheet_test.html
122
162
  - test/test_image_scraper.rb
123
163
  has_rdoc: true
124
164
  homepage: http://github.com/charlotte-ruby/image_scraper
@@ -134,7 +174,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
134
174
  requirements:
135
175
  - - ">="
136
176
  - !ruby/object:Gem::Version
137
- hash: -3054709322002535321
177
+ hash: 44385371017236818
138
178
  segments:
139
179
  - 0
140
180
  version: "0"