rcarvalho-image_scraper 0.1.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +20 -0
- data/README.md +51 -0
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/image_scraper.gemspec +78 -0
- data/lib/image_scraper/client.rb +62 -0
- data/lib/image_scraper/railtie.rb +7 -0
- data/lib/image_scraper/util.rb +32 -0
- data/lib/image_scraper.rb +7 -0
- data/test/helper.rb +18 -0
- data/test/test_image_scraper.rb +140 -0
- metadata +186 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 73a8c72ad130d838e31a907af63dbbbf4fea23a2
|
4
|
+
data.tar.gz: 875cfa02ec74260cd446e59e00dd50e9100796fc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0d0f8de281e7a86b6d8ce0353c2feff71dc37ac50ee6735fa2a1cfd045dce4037b2470d6103ec720c97089cb1b8d6927358cee122fc47e14f72b6c1b5b457980
|
7
|
+
data.tar.gz: 84e2767fb1d89f94e5ef65f816b2bd93d249b3fca72e0cf641a246a272fcbe6b913beac1265603e4332afab5610f43db81758063b619233ce3023d8680c75764
|
data/.document
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Fetch gems over HTTPS so downloads cannot be altered in transit.
source "https://rubygems.org"
# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Runtime dependencies of the library.
gem "nokogiri"
gem "css_parser"
gem "rails"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "bundler", "~> 1.2"
  gem "jeweler", "~> 1.5"
  #gem "rcov", ">= 0"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 John McAliley
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# image_scraper
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/charlotte-ruby/image_scraper.png?branch=master)](http://travis-ci.org/charlotte-ruby/image_scraper)
|
4
|
+
|
5
|
+
Simple utility that pulls image URLs from a web page.
|
6
|
+
|
7
|
+
## INSTALL
|
8
|
+
|
9
|
+
Add to your Gemfile:
|
10
|
+
|
11
|
+
gem "image_scraper"
|
12
|
+
|
13
|
+
Install w/ Bundler
|
14
|
+
|
15
|
+
bundle install
|
16
|
+
|
17
|
+
## USAGE
|
18
|
+
|
19
|
+
Initialize the image scraper client
|
20
|
+
|
21
|
+
image_scraper = ImageScraper::Client.new("http://www.rubygems.org")
|
22
|
+
|
23
|
+
You can also pass an options hash to the client when you initialize it:
|
24
|
+
|
25
|
+
image_scraper = ImageScraper::Client.new("http://www.rubygems.org", options)
|
26
|
+
# OPTIONS - :convert_to_absolute_url and :include_css_images default to true; :include_css_data_images defaults to false
|
27
|
+
# :convert_to_absolute_url - If there are relative image URLs, it will convert them to absolute URLs.
|
28
|
+
# :include_css_images - If there are stylesheets on the page, it will pull images out of the stylesheet. For example: background: url(/images/some-image.png).
|
29
|
+
# :include_css_data_images - Will include data images from CSS. For example: data:image/gif;base64,R0lGODlhEAAOALMAAOazToeH............
|
30
|
+
|
31
|
+
Get the images from the url specified when you initialized the client:
|
32
|
+
|
33
|
+
image_scraper.image_urls
|
34
|
+
|
35
|
+
This will return an array of strings.
|
36
|
+
|
37
|
+
## Contributing to image_scraper
|
38
|
+
|
39
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
40
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
41
|
+
* Fork the project
|
42
|
+
* Start a feature/bugfix branch
|
43
|
+
* Commit and push until you are happy with your contribution
|
44
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
45
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
|
46
|
+
|
47
|
+
## Copyright
|
48
|
+
|
49
|
+
Copyright (c) 2011 John McAliley. See LICENSE.txt for
|
50
|
+
further details.
|
51
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rubygems'
require 'bundler'

# Activate the gems from the Gemfile before loading any task library; abort
# with Bundler's own exit status when the bundle is incomplete.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "image_scraper"
  gem.homepage = "http://github.com/charlotte-ruby/image_scraper"
  gem.license = "MIT"
  gem.summary = %Q{Simple utility to pull image urls from web page}
  gem.description = %Q{Simple utility to pull image urls from web page}
  gem.email = "john.mcaliley@gmail.com"
  gem.authors = ["John McAliley"]
  # Dependencies are declared once, in the Gemfile; Jeweler copies them into
  # the generated gemspec. Repeating nokogiri/css_parser here made the gemspec
  # list each of them twice.
  gem.files.exclude "test/**/*"
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

#require 'rcov/rcovtask'
#Rcov::RcovTask.new do |test|
#  test.libs << 'test'
#  test.pattern = 'test/**/test_*.rb'
#  test.verbose = true
#end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # strip: the VERSION file ends with a newline that would otherwise end up
  # inside the generated documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "image_scraper #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.8
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
  s.name = "rcarvalho-image_scraper"
  s.version = "0.1.8.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["John McAliley"]
  s.date = "2013-10-23"
  s.description = "Simple utility to pull image urls from web page"
  s.email = "john.mcaliley@gmail.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]
  s.files = [
    ".document",
    "Gemfile",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "image_scraper.gemspec",
    "lib/image_scraper.rb",
    "lib/image_scraper/client.rb",
    "lib/image_scraper/railtie.rb",
    "lib/image_scraper/util.rb"
  ]
  s.homepage = "http://github.com/charlotte-ruby/image_scraper"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.24"
  s.summary = "Simple utility to pull image urls from web page"
  s.test_files = [
    "test/helper.rb",
    "test/test_image_scraper.rb"
  ]

  # Each dependency is declared exactly once per branch: declaring nokogiri
  # and css_parser twice produces duplicate entries in the built gem's
  # metadata.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
      s.add_runtime_dependency(%q<css_parser>, [">= 0"])
      s.add_runtime_dependency(%q<rails>, [">= 0"])
      s.add_development_dependency(%q<shoulda>, [">= 0"])
      s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
      s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
      s.add_development_dependency(%q<rcov>, [">= 0"])
    else
      s.add_dependency(%q<nokogiri>, [">= 0"])
      s.add_dependency(%q<css_parser>, [">= 0"])
      s.add_dependency(%q<rails>, [">= 0"])
      s.add_dependency(%q<shoulda>, [">= 0"])
      s.add_dependency(%q<bundler>, ["~> 1.0.0"])
      s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
      s.add_dependency(%q<rcov>, [">= 0"])
    end
  else
    s.add_dependency(%q<nokogiri>, [">= 0"])
    s.add_dependency(%q<css_parser>, [">= 0"])
    s.add_dependency(%q<rails>, [">= 0"])
    s.add_dependency(%q<shoulda>, [">= 0"])
    s.add_dependency(%q<bundler>, ["~> 1.0.0"])
    s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
    s.add_dependency(%q<rcov>, [">= 0"])
  end
end
|
78
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'cgi'
require 'open-uri'
require 'uri'
|
2
|
+
module ImageScraper
  # Collects the image URLs referenced by a web page: the +src+ of every
  # <img> element and, optionally, every url(...) reference found in the
  # stylesheets the page links to.
  #
  # Options (second constructor argument):
  #   :convert_to_absolute_url - resolve relative URLs against the page or
  #                              stylesheet they came from (default: true)
  #   :include_css_images      - also scan linked stylesheets (default: true)
  #   :include_css_data_images - keep "data:image" URLs found in stylesheets
  #                              (default: false)
  class Client
    # Characters that RFC 2396 escaping leaves (or may leave) untouched and
    # that are percent-encoded in a second pass.
    EXTRA_UNSAFE = /([{}|\^\[\]\@`])/

    # URI.escape was removed in Ruby 3.0; the RFC 2396 parser still provides
    # the same escaping. Older Rubies only know it as URI::Parser.
    URL_ESCAPER = defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI::Parser.new

    attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc

    # url     - address of the page to scrape; it is escaped before use.
    # options - see the class comment. The hash is only read, never modified.
    #
    # The page is downloaded immediately. When it cannot be downloaded (no
    # fetchable scheme, malformed URL, network or HTTP error) +doc+ is nil and
    # every query method returns an empty array.
    def initialize(url, options = {})
      @url = URL_ESCAPER.escape(url)
      # fetch (rather than ||) so that an explicitly passed false/nil wins
      # over the default.
      @convert_to_absolute_url = options.fetch(:convert_to_absolute_url, true)
      @include_css_images = options.fetch(:include_css_images, true)
      @include_css_data_images = options.fetch(:include_css_data_images, false)
      html = fetch_body(@url)
      @doc = html ? Nokogiri::HTML(html) : nil
    end

    # Returns an array of strings: the page images followed by the stylesheet
    # images (the latter only when include_css_images is set).
    def image_urls
      images = page_images
      images += stylesheet_images if include_css_images
      images
    end

    # Returns the escaped +src+ of every <img> element that has a non-blank
    # +src+, resolved against the page URL when convert_to_absolute_url is set.
    def page_images
      urls = []
      return urls unless doc
      doc.xpath("//img").each do |img|
        src = img["src"].to_s.strip
        next if src.empty?
        image = escape_url(src)
        image = ImageScraper::Util.absolute_url(url, image) if convert_to_absolute_url
        urls << image
      end
      urls
    end

    # Downloads every linked stylesheet and returns the URLs found in its
    # url(...) references. Stylesheets that cannot be downloaded are skipped.
    # "data:image" URLs are returned only when include_css_data_images is set.
    def stylesheet_images
      images = []
      stylesheets.each do |stylesheet|
        css = fetch_body(stylesheet)
        next if css.nil?

        css.scan(/url\((.*?)\)/).each do |match|
          image_url = escape_url(match[0].strip)
          if image_url.include?("data:image")
            next unless @include_css_data_images
            # Only the wrapping quotes are removed: quotes inside the payload
            # belong to the data itself.
            images << image_url.gsub(/\A(?:%22|')|(?:%22|')\z/, '')
          else
            image_url = ImageScraper::Util.strip_quotes(image_url)
            images << (@convert_to_absolute_url ? ImageScraper::Util.absolute_url(stylesheet, image_url) : image_url)
          end
        end
      end
      images
    end

    # Returns the absolute URL of every <link rel="stylesheet"> on the page.
    # Links without an href are ignored.
    def stylesheets
      return [] unless doc
      hrefs = doc.xpath('//link[@rel="stylesheet"]').map { |link| link['href'].to_s.strip }
      hrefs.reject { |href| href.empty? }.map do |href|
        ImageScraper::Util.absolute_url url, URL_ESCAPER.escape(href)
      end
    end

    private

    # Percent-encodes +raw+, including the characters in EXTRA_UNSAFE.
    def escape_url(raw)
      URL_ESCAPER.escape(raw).gsub(EXTRA_UNSAFE) { |char| CGI.escape(char) }
    end

    # Downloads +location+ and returns its body, or nil when it cannot be
    # retrieved. Only URIs that open-uri can open (http, https, ftp) are
    # fetched; anything else - e.g. a string without a scheme - is treated as
    # not retrievable instead of being handed to Kernel#open.
    def fetch_body(location)
      uri = URI.parse(location)
      return nil unless uri.respond_to?(:open)
      uri.open { |io| io.read }
    rescue StandardError
      nil
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'uri'

module ImageScraper
  # Stateless URL helpers used by the scraper.
  module Util
    # Resolves +asset+ against +url+ and returns the result as a string.
    # A nil +asset+ is treated as an empty reference.
    # Raises URI::InvalidURIError when either argument cannot be parsed.
    def self.absolute_url(url, asset = nil)
      # TODO - what happens when an index redirect occurs?
      # Example: 'http://example.com/about' specified as url
      #          'style.css' specified as asset
      #          url redirects to 'http://example.com/about/'
      #          and serves http://example.com/about/index.html
      #          which then links to the relative asset path 'style.css'
      #          based on original url (http://example.com/about),
      #          self.absolute_url gives
      #          'http://example.com/style.css
      #          but should get:
      #          'http://example.com/about/style.css
      URI.parse(url).merge(URI.parse(asset.to_s)).to_s
    end

    # Returns "scheme://host" for +url+. Port, user info, path and query are
    # not part of the result.
    def self.domain(url)
      uri = URI.parse(url)
      "#{uri.scheme}://#{uri.host}"
    end

    # Returns the path component of +url+.
    def self.path(url)
      URI.parse(url).path
    end

    # Removes an escaped double quote (%22) from the very start and the very
    # end of +image_url+, plus every literal single or double quote.
    #
    # \A and \z anchor to the whole string; ^ and $ would also match next to
    # an embedded line break.
    def self.strip_quotes(image_url)
      image_url.sub(/\A%22/, '').sub(/%22\z/, '').gsub("'", "").gsub('"', '')
    end
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
require 'bundler'

# Activate the default and development gem groups. If the bundle is
# incomplete, report why on stderr and exit with Bundler's status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  [error.message, "Run `bundle install` to install missing gems"].each do |line|
    $stderr.puts line
  end
  exit error.status_code
end

require 'test/unit'
require 'shoulda'

# Put the test directory first on the load path, followed by the library
# under test.
test_root = File.dirname(__FILE__)
$LOAD_PATH.unshift(test_root, File.join(test_root, '..', 'lib'))
require 'image_scraper'

# Reopened so shared test helpers can be added in one place.
class Test::Unit::TestCase
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'helper'
|
3
|
+
|
4
|
+
|
5
|
+
#TODO: these tests will not work forever. Try to test against a static web page instead of external URLs
|
6
|
+
# Consider using https://raw.github.com/charlotte-ruby/image_scraper urls
|
7
|
+
|
8
|
+
# End-to-end and unit tests for ImageScraper::Client and ImageScraper::Util.
# Several tests download live pages and therefore need network access.
class TestImageScraper < Test::Unit::TestCase
  should "parse urls even with escaped (%22) double quotes in them" do
    scraper = ImageScraper::Client.new "http://newscorp.com/careers/"
    # URL parsing happens while the images are collected, not in the
    # constructor, so the collection itself has to be exercised.
    assert_nothing_raised { scraper.image_urls }
  end

  should "return list of all image urls on a web page with absolute paths" do
    images = ["http://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/SIPI_Jelly_Beans_4.1.07.tiff/lossy-page1-220px-SIPI_Jelly_Beans_4.1.07.tiff.jpg",
              "http://bits.wikimedia.org/static-1.21wmf9/skins/common/images/magnify-clip.png",
              "http://bits.wikimedia.org/static-1.21wmf9/skins/vector/images/search-ltr.png?303-4",
              "http://bits.wikimedia.org/images/wikimedia-button.png",
              "http://bits.wikimedia.org/static-1.21wmf9/skins/common/images/poweredby_mediawiki_88x31.png"]
    scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:include_css_images=>false)

    assert_equal images.size, scraper.image_urls.size

    assert_equal images, scraper.image_urls
  end

  should "return a list of images with whitespace stripped from the src" do
    client = ImageScraper::Client.new("http://www.google.com")
    html = IO.read(File.dirname(__FILE__)+"/resources/extra_whitespace.html")
    client.doc = Nokogiri::HTML(html)
    images = ["http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg","http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg"]

    assert_equal images, client.image_urls
  end

  should "return list of all image urls on a web page with relative paths" do
    images = ["//upload.wikimedia.org/wikipedia/commons/thumb/b/b6/SIPI_Jelly_Beans_4.1.07.tiff/lossy-page1-220px-SIPI_Jelly_Beans_4.1.07.tiff.jpg",
              "//bits.wikimedia.org/static-1.21wmf9/skins/common/images/magnify-clip.png",
              "//bits.wikimedia.org/static-1.21wmf9/skins/vector/images/search-ltr.png?303-4",
              "//bits.wikimedia.org/images/wikimedia-button.png",
              "//bits.wikimedia.org/static-1.21wmf9/skins/common/images/poweredby_mediawiki_88x31.png"]
    scraper = ImageScraper::Client.new("http://en.wikipedia.org/wiki/Standard_test_image",:convert_to_absolute_url=>false,:include_css_images=>false)

    assert_equal images.size, scraper.image_urls.size
    assert_equal images, scraper.image_urls
  end

  should "return list of stylesheets contained in html page (relative path)" do
    scraper = ImageScraper::Client.new ""
    scraper.doc = Nokogiri::HTML(IO.read(File.dirname(__FILE__)+"/resources/stylesheet_test.html"))
    scraper.url = "http://test.com"

    assert_equal ["http://test.com/css/master.css", "http://test.com/css/master2.css"], scraper.stylesheets
  end

  should "return proper absolute url for a page and asset" do
    assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com","image.gif")
    assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","images/image.gif")
    assert_equal "http://www.test.com/images/image.gif",ImageScraper::Util.absolute_url("http://www.test.com","/images/image.gif")
    assert_equal "http://www.test.com/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","image.gif")
    assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","/images/image.gif")
    assert_equal "http://www.test.com/images/image.gif", ImageScraper::Util.absolute_url("http://www.test.com/","images/image.gif")
    assert_equal "http://www.test.com/", ImageScraper::Util.absolute_url("http://www.test.com/")
    assert_equal "http://www.test.com/123/test.html", ImageScraper::Util.absolute_url("http://www.test.com/123/test.html")
  end

  should "return images from a stylesheet" do
    scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
    assert scraper.stylesheet_images.include? 'https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'
  end

  should "strip quotes from a url" do
    assert_equal "/images/test.png", ImageScraper::Util.strip_quotes("'/images/test.png'")
    assert_equal "http://www.somsite.com/images/test.png", ImageScraper::Util.strip_quotes("'http://www.somsite.com/images/test.png'")
    assert_equal "/images/test.png", ImageScraper::Util.strip_quotes('"/images/test.png"')
  end

  should "return domain section from a url" do
    assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this.html")
    assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what/is/this/")
    assert_equal "http://ug.ly", ImageScraper::Util.domain("http://ug.ly/what")
    assert_equal "http://www.ug.ly", ImageScraper::Util.domain("http://www.ug.ly/what/is/this/")
  end

  should "return nil for doc if URL is invalid" do
    scraper = ImageScraper::Client.new("couponshack.com")
    assert scraper.doc.nil?
  end

  should "return empty arrays if URL is invalid" do
    scraper = ImageScraper::Client.new("couponshack.com")
    assert_equal [], scraper.image_urls
    assert_equal [], scraper.stylesheets
    assert_equal [], scraper.stylesheet_images
    assert_equal [], scraper.page_images
  end

  should "Handle a URL with unescaped spaces" do
    images = ["https://raw.github.com/syoder/image_scraper/stylesheet_fix/test/resources/image1.png"]
    scraper = ImageScraper::Client.new 'https://raw.github.com/syoder/image_scraper/stylesheet_fix/test/resources/space in url.html', :include_css_images => false
    assert_equal images, scraper.image_urls
  end

  should "Handle a page image with an unescaped url" do
    scraper = ImageScraper::Client.new ''
    scraper.doc = Nokogiri::HTML("<img src='http://test.com/unescaped path'>")
    assert_equal ['http://test.com/unescaped%20path'], scraper.page_images
  end

  should "Handle a stylesheet with an unescaped url" do
    scraper = ImageScraper::Client.new ''
    scraper.url = 'http://test.com'
    scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://test.com/unescaped path.css'>")
    assert_equal ['http://test.com/unescaped%20path.css'], scraper.stylesheets
  end

  should "Handle a stylesheet image with an unescaped url" do
    scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/stylesheet_unescaped_image.html', :include_css_images => true
    assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/some%20image.png'], scraper.stylesheet_images
  end

  should "Handle a stylesheet image with a relative url" do
    scraper = ImageScraper::Client.new 'https://raw.github.com/charlotte-ruby/image_scraper/master/test/resources/relative_image_url.html', :include_css_images => true
    assert_equal ['https://raw.github.com/charlotte-ruby/image_scraper/master/test/images/some_image.png'], scraper.stylesheet_images
  end

  should "Handle cases where a stylesheet returns a 404" do
    scraper = ImageScraper::Client.new ''
    scraper.url = 'http://google.com'
    scraper.doc = Nokogiri::HTML("<link rel='stylesheet' href='http://google.com/does_not_exist.css'>")
    assert_equal [], scraper.stylesheet_images
  end

  should "not crash when it encounters image URLs that include square brackets" do
    scraper = ImageScraper::Client.new ''
    scraper.url = 'http://google.com'
    scraper.doc = Nokogiri::HTML("<img src='image[1].jpg' >")
    assert_equal ["http://google.com/image%5B1%5D.jpg"], scraper.page_images
  end
end
|
metadata
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rcarvalho-image_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.8.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- John McAliley
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-10-23 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: css_parser
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rails
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: shoulda
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 1.0.0
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ~>
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.0.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: jeweler
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ~>
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 1.5.2
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ~>
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 1.5.2
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rcov
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: nokogiri
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: css_parser
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
description: Simple utility to pull image urls from web page
|
140
|
+
email: john.mcaliley@gmail.com
|
141
|
+
executables: []
|
142
|
+
extensions: []
|
143
|
+
extra_rdoc_files:
|
144
|
+
- LICENSE.txt
|
145
|
+
- README.md
|
146
|
+
files:
|
147
|
+
- .document
|
148
|
+
- Gemfile
|
149
|
+
- LICENSE.txt
|
150
|
+
- README.md
|
151
|
+
- Rakefile
|
152
|
+
- VERSION
|
153
|
+
- image_scraper.gemspec
|
154
|
+
- lib/image_scraper.rb
|
155
|
+
- lib/image_scraper/client.rb
|
156
|
+
- lib/image_scraper/railtie.rb
|
157
|
+
- lib/image_scraper/util.rb
|
158
|
+
- test/helper.rb
|
159
|
+
- test/test_image_scraper.rb
|
160
|
+
homepage: http://github.com/charlotte-ruby/image_scraper
|
161
|
+
licenses:
|
162
|
+
- MIT
|
163
|
+
metadata: {}
|
164
|
+
post_install_message:
|
165
|
+
rdoc_options: []
|
166
|
+
require_paths:
|
167
|
+
- lib
|
168
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - '>='
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0'
|
173
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
174
|
+
requirements:
|
175
|
+
- - '>='
|
176
|
+
- !ruby/object:Gem::Version
|
177
|
+
version: '0'
|
178
|
+
requirements: []
|
179
|
+
rubyforge_project:
|
180
|
+
rubygems_version: 2.0.4
|
181
|
+
signing_key:
|
182
|
+
specification_version: 3
|
183
|
+
summary: Simple utility to pull image urls from web page
|
184
|
+
test_files:
|
185
|
+
- test/helper.rb
|
186
|
+
- test/test_image_scraper.rb
|