image_scraper 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -3,6 +3,8 @@ source "http://rubygems.org"
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
5
  gem "nokogiri"
6
+ gem "css_parser"
7
+ gem "rails"
6
8
  # Add dependencies to develop your gem here.
7
9
  # Include everything needed to run rake, tests, features, etc.
8
10
  group :development do
@@ -1,22 +1,86 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
+ abstract (1.0.0)
5
+ actionmailer (3.0.7)
6
+ actionpack (= 3.0.7)
7
+ mail (~> 2.2.15)
8
+ actionpack (3.0.7)
9
+ activemodel (= 3.0.7)
10
+ activesupport (= 3.0.7)
11
+ builder (~> 2.1.2)
12
+ erubis (~> 2.6.6)
13
+ i18n (~> 0.5.0)
14
+ rack (~> 1.2.1)
15
+ rack-mount (~> 0.6.14)
16
+ rack-test (~> 0.5.7)
17
+ tzinfo (~> 0.3.23)
18
+ activemodel (3.0.7)
19
+ activesupport (= 3.0.7)
20
+ builder (~> 2.1.2)
21
+ i18n (~> 0.5.0)
22
+ activerecord (3.0.7)
23
+ activemodel (= 3.0.7)
24
+ activesupport (= 3.0.7)
25
+ arel (~> 2.0.2)
26
+ tzinfo (~> 0.3.23)
27
+ activeresource (3.0.7)
28
+ activemodel (= 3.0.7)
29
+ activesupport (= 3.0.7)
30
+ activesupport (3.0.7)
31
+ arel (2.0.10)
32
+ builder (2.1.2)
33
+ css_parser (1.1.9)
34
+ erubis (2.6.6)
35
+ abstract (>= 1.0.0)
4
36
  git (1.2.5)
37
+ i18n (0.5.0)
5
38
  jeweler (1.5.2)
6
39
  bundler (~> 1.0.0)
7
40
  git (>= 1.2.5)
8
41
  rake
42
+ mail (2.2.19)
43
+ activesupport (>= 2.3.6)
44
+ i18n (>= 0.4.0)
45
+ mime-types (~> 1.16)
46
+ treetop (~> 1.4.8)
47
+ mime-types (1.16)
9
48
  nokogiri (1.4.4)
49
+ polyglot (0.3.1)
50
+ rack (1.2.3)
51
+ rack-mount (0.6.14)
52
+ rack (>= 1.0.0)
53
+ rack-test (0.5.7)
54
+ rack (>= 1.0)
55
+ rails (3.0.7)
56
+ actionmailer (= 3.0.7)
57
+ actionpack (= 3.0.7)
58
+ activerecord (= 3.0.7)
59
+ activeresource (= 3.0.7)
60
+ activesupport (= 3.0.7)
61
+ bundler (~> 1.0)
62
+ railties (= 3.0.7)
63
+ railties (3.0.7)
64
+ actionpack (= 3.0.7)
65
+ activesupport (= 3.0.7)
66
+ rake (>= 0.8.7)
67
+ thor (~> 0.14.4)
10
68
  rake (0.9.0)
11
69
  rcov (0.9.9)
12
70
  shoulda (2.11.3)
71
+ thor (0.14.6)
72
+ treetop (1.4.9)
73
+ polyglot (>= 0.3.1)
74
+ tzinfo (0.3.27)
13
75
 
14
76
  PLATFORMS
15
77
  ruby
16
78
 
17
79
  DEPENDENCIES
18
80
  bundler (~> 1.0.0)
81
+ css_parser
19
82
  jeweler (~> 1.5.2)
20
83
  nokogiri
84
+ rails
21
85
  rcov
22
86
  shoulda
data/README.md CHANGED
@@ -14,17 +14,23 @@ Install w/ Bundler
14
14
 
15
15
  ## USAGE
16
16
 
17
- Pull all the image URLs from a specified URL and convert all images URLs to absolute paths
17
+ Initialize the image scraper client
18
18
 
19
- ImageScraper.image_urls("http://www.rubygems.org")
19
+ image_scraper = ImageScraper::Client.new("http://www.rubygems.org")
20
20
 
21
- Pull all images URLs from specified URL and use relative URLs
21
+ You can also pass an options hash to the client when you initialize it:
22
22
 
23
- ImageScraper.image_urls("http://www.rubygems.org",false)
23
+ image_scraper = ImageScraper::Client.new("http://www.rubygems.org", options)
24
+ # OPTIONS - If you don't pass an option, :convert_to_absolute_url and :include_css_images default to true, and :include_css_data_images defaults to false
25
+ # :convert_to_absolute_url - If there are relative image URLs, it will convert them to absolute URLs.
26
+ # :include_css_images - If there are stylesheets on the page, it will pull images out of the stylesheet. For example: background: url(/images/some-image.png).
27
+ # :include_css_data_images - Will include data images from CSS. For example: background: url(data:image/png;base64,...).
28
+
29
+ Get the images from the url specified when you initialized the client:
24
30
 
25
- ## TODO
31
+ image_scraper.image_urls
26
32
 
27
- 1. Parse CSS files for images
33
+ This will return an array of strings.
28
34
 
29
35
  ## Contributing to image_scraper
30
36
 
data/Rakefile CHANGED
@@ -20,6 +20,7 @@ Jeweler::Tasks.new do |gem|
20
20
  gem.email = "john.mcaliley@gmail.com"
21
21
  gem.authors = ["John McAliley"]
22
22
  gem.add_dependency "nokogiri"
23
+ gem.add_dependency "css_parser"
23
24
  end
24
25
  Jeweler::RubygemsDotOrgTasks.new
25
26
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.1.0
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{image_scraper}
8
- s.version = "0.0.2"
8
+ s.version = "0.1.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John McAliley"]
12
- s.date = %q{2011-05-27}
12
+ s.date = %q{2011-06-02}
13
13
  s.description = %q{Simple utility to pull image urls from web page}
14
14
  s.email = %q{john.mcaliley@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -27,6 +27,7 @@ Gem::Specification.new do |s|
27
27
  "image_scraper.gemspec",
28
28
  "lib/image_scraper.rb",
29
29
  "test/helper.rb",
30
+ "test/resources/stylesheet_test.html",
30
31
  "test/test_image_scraper.rb"
31
32
  ]
32
33
  s.homepage = %q{http://github.com/charlotte-ruby/image_scraper}
@@ -45,26 +46,35 @@ Gem::Specification.new do |s|
45
46
 
46
47
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
47
48
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
49
+ s.add_runtime_dependency(%q<css_parser>, [">= 0"])
50
+ s.add_runtime_dependency(%q<rails>, [">= 0"])
48
51
  s.add_development_dependency(%q<shoulda>, [">= 0"])
49
52
  s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
50
53
  s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
51
54
  s.add_development_dependency(%q<rcov>, [">= 0"])
52
55
  s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
56
+ s.add_runtime_dependency(%q<css_parser>, [">= 0"])
53
57
  else
54
58
  s.add_dependency(%q<nokogiri>, [">= 0"])
59
+ s.add_dependency(%q<css_parser>, [">= 0"])
60
+ s.add_dependency(%q<rails>, [">= 0"])
55
61
  s.add_dependency(%q<shoulda>, [">= 0"])
56
62
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
57
63
  s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
58
64
  s.add_dependency(%q<rcov>, [">= 0"])
59
65
  s.add_dependency(%q<nokogiri>, [">= 0"])
66
+ s.add_dependency(%q<css_parser>, [">= 0"])
60
67
  end
61
68
  else
62
69
  s.add_dependency(%q<nokogiri>, [">= 0"])
70
+ s.add_dependency(%q<css_parser>, [">= 0"])
71
+ s.add_dependency(%q<rails>, [">= 0"])
63
72
  s.add_dependency(%q<shoulda>, [">= 0"])
64
73
  s.add_dependency(%q<bundler>, ["~> 1.0.0"])
65
74
  s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
66
75
  s.add_dependency(%q<rcov>, [">= 0"])
67
76
  s.add_dependency(%q<nokogiri>, [">= 0"])
77
+ s.add_dependency(%q<css_parser>, [">= 0"])
68
78
  end
69
79
  end
70
80
 
@@ -1,20 +1,80 @@
1
+ require 'pp'
2
+ require 'rails'
1
3
  require 'open-uri'
2
4
  require 'nokogiri'
3
5
 
4
6
  module ImageScraper
5
- class Railtie < Rails::Railtie
7
class Client
  # Values used for any option the caller does not pass to #initialize.
  DEFAULT_OPTIONS = {
    :convert_to_absolute_url => true,
    :include_css_images      => true,
    :include_css_data_images => false
  }.freeze

  attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc

  # Fetches +url+ and parses it into a Nokogiri HTML document (stored in #doc).
  #
  # url     - String location of the page to scrape.
  # options - Hash; recognised keys are the ones in DEFAULT_OPTIONS. The
  #           caller's hash is not modified.
  def initialize(url, options = {})
    settings = DEFAULT_OPTIONS.merge(options)
    @url = url
    @convert_to_absolute_url = settings[:convert_to_absolute_url]
    @include_css_images = settings[:include_css_images]
    @include_css_data_images = settings[:include_css_data_images]
    # Parse inside the block so the underlying handle is closed afterwards.
    @doc = open_resource(url) { |io| Nokogiri::HTML(io) }
  end

  # Returns an Array of image URL Strings: the page's <img> sources, plus
  # images referenced from linked stylesheets when include_css_images is set.
  def image_urls
    images = page_images
    images += stylesheet_images if include_css_images
    images
  end

  # Returns the src of every <img> in the document, made absolute when
  # convert_to_absolute_url is set. <img> tags without a src are skipped.
  def page_images
    sources = doc.xpath("//img").map { |img| img["src"] }.compact
    return sources unless convert_to_absolute_url
    sources.map { |src| ImageScraper::Util.absolute_url(url, src) }
  end

  # Downloads every stylesheet returned by #stylesheets and returns the
  # url(...) references found in them as an Array of Strings.
  def stylesheet_images
    images = []
    stylesheets.each do |stylesheet|
      css = open_resource(stylesheet) { |io| io.read }
      images.concat(css_image_urls(css))
    end
    images
  end

  # Returns the absolute URL of every <link rel="stylesheet"> that has an
  # href attribute.
  def stylesheets
    hrefs = doc.xpath('//link[@rel="stylesheet"]').map { |link| link["href"] }.compact
    hrefs.map { |href| ImageScraper::Util.absolute_url(url, href) }
  end

  private

  # Opens +location+ and yields the IO to the block, returning the block's
  # value. Kernel#open stopped opening URLs in Ruby 3.0, so URI.open is used
  # for URL-looking locations when open-uri provides it; anything else goes
  # through Kernel#open as before.
  def open_resource(location, &block)
    if location.to_s.include?("://") && URI.respond_to?(:open)
      URI.open(location, &block)
    else
      open(location, &block)
    end
  end

  # Extracts the url(...) references from a CSS String.
  #
  # Surrounding whitespace and quotes are stripped and empty references are
  # dropped. data:image URIs are returned verbatim when
  # include_css_data_images is set and omitted otherwise; every other
  # reference is made absolute when convert_to_absolute_url is set.
  def css_image_urls(css)
    found = css.scan(/url\((.*?)\)/).map do |match|
      # scan with a capture group yields one-element Arrays.
      reference = match.first.strip.gsub(/\A['"]|['"]\z/, "")
      next if reference.empty?

      if reference.start_with?("data:image")
        include_css_data_images ? reference : nil
      elsif convert_to_absolute_url
        ImageScraper::Util.absolute_url(url, reference)
      else
        reference
      end
    end
    found.compact
  end
end
7
-
8
- def self.image_urls(url, convert_to_absolute_url=true)
9
- uri = URI.parse(url)
10
- domain = "#{uri.scheme}://#{uri.host}"
11
- doc = Nokogiri::HTML(open url)
12
- urls = []
13
- doc.xpath("//img").each do |img|
14
- image = img["src"]
15
- image = domain + image if convert_to_absolute_url and !image.include?("://")
16
- urls << image
17
- end
18
- urls
58
+
59
# Stateless helpers for turning asset references into absolute URLs.
module Util
  # Resolves +asset+ against the page at +url+.
  #
  # url   - String URL of the page the asset was found on.
  # asset - String reference taken from the page, or nil.
  #
  # Returns a String:
  # * nil asset              -> the page's own scheme://host plus its path
  # * asset containing "://" -> the asset unchanged
  # * asset starting "//"    -> the asset prefixed with the page's scheme
  # * asset starting "/"     -> domain(url) followed by the asset
  # * anything else          -> domain(url), a "/", then the asset
  def self.absolute_url(url, asset = nil)
    return domain(url) + path(url) if asset.nil?
    return asset if asset.include?("://")
    return "#{URI.parse(url).scheme}:#{asset}" if asset.start_with?("//")
    return domain(url) + asset if asset.start_with?("/")
    "#{domain(url)}/#{asset}"
  end

  # Returns "scheme://host" for +url+, with ":port" appended when the URL
  # carries a port other than the scheme's default.
  def self.domain(url)
    uri = URI.parse(url)
    origin = "#{uri.scheme}://#{uri.host}"
    explicit_port = uri.port && uri.port != uri.default_port
    explicit_port ? "#{origin}:#{uri.port}" : origin
  end

  # Returns the path component of +url+ as reported by URI.parse.
  def self.path(url)
    URI.parse(url).path
  end
end
77
+
78
+ class Railtie < Rails::Railtie
19
79
  end
20
80
  end
@@ -0,0 +1,17 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
+ "http://www.w3.org/TR/html4/strict.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>stylesheet_test</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="John McAliley">
10
+ <link rel="stylesheet" href="/css/master.css" type="text/css" media="screen" title="no title" charset="utf-8">
11
+ <link rel="stylesheet" href="/css/master2.css" type="text/css" media="screen" title="no title" charset="utf-8">
12
+ <!-- Date: 2011-06-01 -->
13
+ </head>
14
+ <body>
15
+
16
+ </body>
17
+ </html>
@@ -1,13 +1,47 @@
1
+ require 'pp'
1
2
  require 'helper'
2
3
 
4
+
5
+ #TODO: these tests will not work forever. Try to test against a static web page instead of external URLs
3
6
  class TestImageScraper < Test::Unit::TestCase
4
7
  should "return list of all image urls on a web page with absolute paths" do
5
- images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png", "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png", "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-2", "http://en.wikipedia.org/images/wikimedia-button.png", "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
6
- assert_equal ImageScraper.image_urls("http://en.wikipedia.org/wiki/Standard_test_image"), images
8
+ images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png",
9
+ "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png",
10
+ "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-2",
11
+ "http://en.wikipedia.org/images/wikimedia-button.png",
12
+ "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
13
+ scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)
14
+ assert_equal images, scraper.image_urls
7
15
  end
8
-
16
+
9
17
  should "return list of all image urls on a web page with relative paths" do
10
- images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png","http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png","http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-2","/images/wikimedia-button.png","http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
11
- assert_equal ImageScraper.image_urls("http://en.wikipedia.org/wiki/Standard_test_image",false), images
12
- end
13
- end
18
+ images = ["http://upload.wikimedia.org/wikipedia/en/thumb/2/24/Lenna.png/200px-Lenna.png",
19
+ "http://bits.wikimedia.org/skins-1.17/common/images/magnify-clip.png",
20
+ "http://bits.wikimedia.org/skins-1.17/vector/images/search-ltr.png?301-2",
21
+ "/images/wikimedia-button.png",
22
+ "http://bits.wikimedia.org/skins-1.17/common/images/poweredby_mediawiki_88x31.png"]
23
+ scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:convert_to_absolute_url=>false,:include_css_images=>false)
24
+ assert_equal images, scraper.image_urls
25
+ end
26
+
27
+ should "return list of stylesheets contained in html page (relative path)" do
28
+ doc = Nokogiri::HTML(IO.read(File.dirname(__FILE__)+"/resources/stylesheet_test.html"))
29
+ domain = "http://test.com"
30
+ assert_equal ["http://test.com/phoenix/testcentral.css"], ImageScraper::Client.new("http://test.com").stylesheets
31
+ end
32
+
33
+ should "return proper absolute url for a page and asset" do
34
+ assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
35
+ assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
36
+ assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","/images/image.gif")
37
+ assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","image.gif")
38
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","/images/image.gif")
39
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","images/image.gif")
40
+ assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","/images/image.gif")
41
+ end
42
+
43
+ should "return images from a stylesheet" do
44
+ scraper = ImageScraper::Client.new("http://local.couponshack.com")
45
+ assert scraper.stylesheet_images.include? ("http://local.couponshack.com/images/bg.png")
46
+ end
47
+ end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
+ - 1
7
8
  - 0
8
- - 2
9
- version: 0.0.2
9
+ version: 0.1.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - John McAliley
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-27 00:00:00 -04:00
17
+ date: 2011-06-02 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -31,7 +31,7 @@ dependencies:
31
31
  prerelease: false
32
32
  version_requirements: *id001
33
33
  - !ruby/object:Gem::Dependency
34
- name: shoulda
34
+ name: css_parser
35
35
  requirement: &id002 !ruby/object:Gem::Requirement
36
36
  none: false
37
37
  requirements:
@@ -40,12 +40,38 @@ dependencies:
40
40
  segments:
41
41
  - 0
42
42
  version: "0"
43
- type: :development
43
+ type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: *id002
46
46
  - !ruby/object:Gem::Dependency
47
- name: bundler
47
+ name: rails
48
48
  requirement: &id003 !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ type: :runtime
57
+ prerelease: false
58
+ version_requirements: *id003
59
+ - !ruby/object:Gem::Dependency
60
+ name: shoulda
61
+ requirement: &id004 !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ segments:
67
+ - 0
68
+ version: "0"
69
+ type: :development
70
+ prerelease: false
71
+ version_requirements: *id004
72
+ - !ruby/object:Gem::Dependency
73
+ name: bundler
74
+ requirement: &id005 !ruby/object:Gem::Requirement
49
75
  none: false
50
76
  requirements:
51
77
  - - ~>
@@ -57,10 +83,10 @@ dependencies:
57
83
  version: 1.0.0
58
84
  type: :development
59
85
  prerelease: false
60
- version_requirements: *id003
86
+ version_requirements: *id005
61
87
  - !ruby/object:Gem::Dependency
62
88
  name: jeweler
63
- requirement: &id004 !ruby/object:Gem::Requirement
89
+ requirement: &id006 !ruby/object:Gem::Requirement
64
90
  none: false
65
91
  requirements:
66
92
  - - ~>
@@ -72,10 +98,10 @@ dependencies:
72
98
  version: 1.5.2
73
99
  type: :development
74
100
  prerelease: false
75
- version_requirements: *id004
101
+ version_requirements: *id006
76
102
  - !ruby/object:Gem::Dependency
77
103
  name: rcov
78
- requirement: &id005 !ruby/object:Gem::Requirement
104
+ requirement: &id007 !ruby/object:Gem::Requirement
79
105
  none: false
80
106
  requirements:
81
107
  - - ">="
@@ -85,10 +111,10 @@ dependencies:
85
111
  version: "0"
86
112
  type: :development
87
113
  prerelease: false
88
- version_requirements: *id005
114
+ version_requirements: *id007
89
115
  - !ruby/object:Gem::Dependency
90
116
  name: nokogiri
91
- requirement: &id006 !ruby/object:Gem::Requirement
117
+ requirement: &id008 !ruby/object:Gem::Requirement
92
118
  none: false
93
119
  requirements:
94
120
  - - ">="
@@ -98,7 +124,20 @@ dependencies:
98
124
  version: "0"
99
125
  type: :runtime
100
126
  prerelease: false
101
- version_requirements: *id006
127
+ version_requirements: *id008
128
+ - !ruby/object:Gem::Dependency
129
+ name: css_parser
130
+ requirement: &id009 !ruby/object:Gem::Requirement
131
+ none: false
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ segments:
136
+ - 0
137
+ version: "0"
138
+ type: :runtime
139
+ prerelease: false
140
+ version_requirements: *id009
102
141
  description: Simple utility to pull image urls from web page
103
142
  email: john.mcaliley@gmail.com
104
143
  executables: []
@@ -119,6 +158,7 @@ files:
119
158
  - image_scraper.gemspec
120
159
  - lib/image_scraper.rb
121
160
  - test/helper.rb
161
+ - test/resources/stylesheet_test.html
122
162
  - test/test_image_scraper.rb
123
163
  has_rdoc: true
124
164
  homepage: http://github.com/charlotte-ruby/image_scraper
@@ -134,7 +174,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
134
174
  requirements:
135
175
  - - ">="
136
176
  - !ruby/object:Gem::Version
137
- hash: -3054709322002535321
177
+ hash: 44385371017236818
138
178
  segments:
139
179
  - 0
140
180
  version: "0"