image_scraper 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -0
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/image_scraper.gemspec +4 -5
- data/lib/image_scraper.rb +3 -75
- data/lib/image_scraper/client.rb +53 -0
- data/lib/image_scraper/railtie.rb +7 -0
- data/lib/image_scraper/util.rb +20 -0
- metadata +6 -5
- data/Gemfile.lock +0 -86
- data/test/resources/stylesheet_test.html +0 -17
data/Gemfile
CHANGED
@@ -2,9 +2,11 @@ source "http://rubygems.org"
|
|
2
2
|
# Add dependencies required to use your gem here.
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
5
6
|
gem "nokogiri"
|
6
7
|
gem "css_parser"
|
7
8
|
gem "rails"
|
9
|
+
|
8
10
|
# Add dependencies to develop your gem here.
|
9
11
|
# Include everything needed to run rake, tests, features, etc.
|
10
12
|
group :development do
|
data/Rakefile
CHANGED
@@ -20,7 +20,8 @@ Jeweler::Tasks.new do |gem|
|
|
20
20
|
gem.email = "john.mcaliley@gmail.com"
|
21
21
|
gem.authors = ["John McAliley"]
|
22
22
|
gem.add_dependency "nokogiri"
|
23
|
-
gem.add_dependency "css_parser"
|
23
|
+
gem.add_dependency "css_parser"
|
24
|
+
gem.files.exclude "test/**/*"
|
24
25
|
end
|
25
26
|
Jeweler::RubygemsDotOrgTasks.new
|
26
27
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/image_scraper.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{image_scraper}
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John McAliley"]
|
@@ -19,16 +19,15 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
"Gemfile",
|
22
|
-
"Gemfile.lock",
|
23
22
|
"LICENSE.txt",
|
24
23
|
"README.md",
|
25
24
|
"Rakefile",
|
26
25
|
"VERSION",
|
27
26
|
"image_scraper.gemspec",
|
28
27
|
"lib/image_scraper.rb",
|
29
|
-
"
|
30
|
-
"
|
31
|
-
"
|
28
|
+
"lib/image_scraper/client.rb",
|
29
|
+
"lib/image_scraper/railtie.rb",
|
30
|
+
"lib/image_scraper/util.rb"
|
32
31
|
]
|
33
32
|
s.homepage = %q{http://github.com/charlotte-ruby/image_scraper}
|
34
33
|
s.licenses = ["MIT"]
|
data/lib/image_scraper.rb
CHANGED
@@ -3,78 +3,6 @@ require 'rails'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
def initialize(url,options={})
|
11
|
-
options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
|
12
|
-
@url = url
|
13
|
-
@convert_to_absolute_url = options[:convert_to_absolute_url]
|
14
|
-
@include_css_images = options[:include_css_images]
|
15
|
-
@include_css_data_images = options[:include_css_data_images]
|
16
|
-
@doc = Nokogiri::HTML(open url)
|
17
|
-
end
|
18
|
-
|
19
|
-
def image_urls
|
20
|
-
images = page_images
|
21
|
-
images += stylesheet_images if include_css_images
|
22
|
-
images
|
23
|
-
end
|
24
|
-
|
25
|
-
def page_images
|
26
|
-
urls = []
|
27
|
-
doc.xpath("//img").each do |img|
|
28
|
-
image = img["src"]
|
29
|
-
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
30
|
-
urls << image
|
31
|
-
end
|
32
|
-
urls
|
33
|
-
end
|
34
|
-
|
35
|
-
def stylesheet_images
|
36
|
-
images = []
|
37
|
-
stylesheets.each do |stylesheet|
|
38
|
-
file = open(stylesheet)
|
39
|
-
css = file.string rescue IO.read(file)
|
40
|
-
|
41
|
-
images += css.scan(/url\((.*?)\)/).collect do |image_url|
|
42
|
-
if image_url.include?("data:image") and @include_css_data_images
|
43
|
-
image_url[0]
|
44
|
-
else
|
45
|
-
@convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url[0]) : image_url
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
images
|
50
|
-
end
|
51
|
-
|
52
|
-
def stylesheets
|
53
|
-
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
54
|
-
ImageScraper::Util.absolute_url(url,stylesheet['href'])
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
module Util
|
60
|
-
def self.absolute_url(url,asset=nil)
|
61
|
-
return domain(url) + path(url) if asset.nil? and asset.include("://")
|
62
|
-
return asset if asset.include?("://")
|
63
|
-
return domain(url)+asset if asset[0]=="/"
|
64
|
-
return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
|
65
|
-
end
|
66
|
-
|
67
|
-
def self.domain(url)
|
68
|
-
uri = URI.parse(url)
|
69
|
-
"#{uri.scheme}://#{uri.host}"
|
70
|
-
end
|
71
|
-
|
72
|
-
def self.path(url)
|
73
|
-
uri = URI.parse(url)
|
74
|
-
uri.path
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
class Railtie < Rails::Railtie
|
79
|
-
end
|
80
|
-
end
|
6
|
+
require 'image_scraper/railtie'
|
7
|
+
require 'image_scraper/util'
|
8
|
+
require 'image_scraper/client'
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module ImageScraper
  # Collects image URLs from an HTML page and, optionally, from the
  # stylesheets that page links to.
  class Client
    attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc

    # Defaults applied to the options hash given to #initialize.
    DEFAULT_OPTIONS = {
      :convert_to_absolute_url => true,
      :include_css_images      => true,
      :include_css_data_images => false
    }.freeze

    # Fetches and parses the page at +url+.
    #
    # url     - location of the HTML page, handed to `open`.
    # options - Hash overriding DEFAULT_OPTIONS:
    #           :convert_to_absolute_url - resolve URLs through ImageScraper::Util
    #           :include_css_images      - also scan linked stylesheets
    #           :include_css_data_images - keep inline "data:image" URIs found in CSS
    #
    # NOTE(review): `open` is Kernel#open (extended by open-uri for URLs); it
    # also opens local paths and "|command" pipes, so +url+ must be trusted.
    def initialize(url, options = {})
      # Merge into a fresh hash so the caller's options are not mutated.
      settings = DEFAULT_OPTIONS.merge(options)
      @url = url
      @convert_to_absolute_url = settings[:convert_to_absolute_url]
      @include_css_images = settings[:include_css_images]
      @include_css_data_images = settings[:include_css_data_images]
      # Block form closes the underlying handle once the document is parsed.
      @doc = open(url) { |page| Nokogiri::HTML(page) }
    end

    # Returns every image URL found in the page, followed by those found in
    # its stylesheets when include_css_images is set.
    def image_urls
      images = page_images
      images += stylesheet_images if include_css_images
      images
    end

    # Returns the src of each <img> element in the document, made absolute
    # when convert_to_absolute_url is set. Elements without a src are skipped.
    def page_images
      sources = doc.xpath("//img").map { |img| img["src"] }.compact
      return sources unless convert_to_absolute_url

      sources.map { |src| ImageScraper::Util.absolute_url(url, src) }
    end

    # Returns the url(...) references found in every linked stylesheet.
    def stylesheet_images
      images = []
      stylesheets.each do |stylesheet|
        # Read inside the block so the handle is closed afterwards.
        css = open(stylesheet) { |file| file.read }
        images.concat(css_image_urls(css))
      end
      images
    end

    # Returns the absolute location of each <link rel="stylesheet"> in the
    # document. Links without an href are skipped.
    def stylesheets
      hrefs = doc.xpath('//link[@rel="stylesheet"]').map { |stylesheet| stylesheet['href'] }
      hrefs.compact.map { |href| ImageScraper::Util.absolute_url(url, href) }
    end

    private

    # Extracts url(...) references from a CSS string as plain strings.
    # Inline "data:image" URIs are kept verbatim only when
    # include_css_data_images is set and are dropped otherwise; all other
    # references are made absolute when convert_to_absolute_url is set.
    def css_image_urls(css)
      urls = []
      css.scan(/url\((.*?)\)/).each do |match|
        # scan yields one-element capture arrays; unwrap and drop optional quotes.
        reference = match.first.strip.gsub(/\A['"]|['"]\z/, "")
        next if reference.empty?

        if reference.start_with?("data:image")
          urls << reference if include_css_data_images
        elsif convert_to_absolute_url
          urls << ImageScraper::Util.absolute_url(url, reference)
        else
          urls << reference
        end
      end
      urls
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module ImageScraper
  # Stateless helpers for turning asset references into absolute URLs.
  module Util
    # Resolves +asset+ against the page at +url+.
    #
    # url   - absolute URL String of the page the asset was found on.
    # asset - reference String taken from the page, or nil.
    #
    # Returns a String:
    # * nil asset           -> the page's own origin + path
    # * contains "://"      -> the asset unchanged (already absolute)
    # * starts with "//"    -> the asset prefixed with the page's scheme
    # * anything else       -> the asset appended to the page's origin
    #
    # Relative references are resolved against the site root rather than the
    # page's directory.
    def self.absolute_url(url, asset = nil)
      return domain(url) + path(url) if asset.nil?
      return asset if asset.include?("://")
      # Scheme-relative reference: only the scheme comes from the page.
      return "#{URI.parse(url).scheme}:#{asset}" if asset.start_with?("//")
      # start_with? avoids String#[] returning an Integer on Ruby 1.8.
      return domain(url) + asset if asset.start_with?("/")

      "#{domain(url)}/#{asset}"
    end

    # Returns the "scheme://host" origin of +url+, including the port when it
    # is not the scheme's default.
    def self.domain(url)
      uri = URI.parse(url)
      origin = "#{uri.scheme}://#{uri.host}"
      origin += ":#{uri.port}" if uri.port && uri.port != uri.default_port
      origin
    end

    # Returns the path component of +url+.
    def self.path(url)
      URI.parse(url).path
    end
  end
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 0
|
9
|
-
version: 0.1.0
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- John McAliley
|
@@ -150,15 +150,16 @@ extra_rdoc_files:
|
|
150
150
|
files:
|
151
151
|
- .document
|
152
152
|
- Gemfile
|
153
|
-
- Gemfile.lock
|
154
153
|
- LICENSE.txt
|
155
154
|
- README.md
|
156
155
|
- Rakefile
|
157
156
|
- VERSION
|
158
157
|
- image_scraper.gemspec
|
159
158
|
- lib/image_scraper.rb
|
159
|
+
- lib/image_scraper/client.rb
|
160
|
+
- lib/image_scraper/railtie.rb
|
161
|
+
- lib/image_scraper/util.rb
|
160
162
|
- test/helper.rb
|
161
|
-
- test/resources/stylesheet_test.html
|
162
163
|
- test/test_image_scraper.rb
|
163
164
|
has_rdoc: true
|
164
165
|
homepage: http://github.com/charlotte-ruby/image_scraper
|
@@ -174,7 +175,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
174
175
|
requirements:
|
175
176
|
- - ">="
|
176
177
|
- !ruby/object:Gem::Version
|
177
|
-
hash:
|
178
|
+
hash: 1275395306475000737
|
178
179
|
segments:
|
179
180
|
- 0
|
180
181
|
version: "0"
|
data/Gemfile.lock
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
GEM
|
2
|
-
remote: http://rubygems.org/
|
3
|
-
specs:
|
4
|
-
abstract (1.0.0)
|
5
|
-
actionmailer (3.0.7)
|
6
|
-
actionpack (= 3.0.7)
|
7
|
-
mail (~> 2.2.15)
|
8
|
-
actionpack (3.0.7)
|
9
|
-
activemodel (= 3.0.7)
|
10
|
-
activesupport (= 3.0.7)
|
11
|
-
builder (~> 2.1.2)
|
12
|
-
erubis (~> 2.6.6)
|
13
|
-
i18n (~> 0.5.0)
|
14
|
-
rack (~> 1.2.1)
|
15
|
-
rack-mount (~> 0.6.14)
|
16
|
-
rack-test (~> 0.5.7)
|
17
|
-
tzinfo (~> 0.3.23)
|
18
|
-
activemodel (3.0.7)
|
19
|
-
activesupport (= 3.0.7)
|
20
|
-
builder (~> 2.1.2)
|
21
|
-
i18n (~> 0.5.0)
|
22
|
-
activerecord (3.0.7)
|
23
|
-
activemodel (= 3.0.7)
|
24
|
-
activesupport (= 3.0.7)
|
25
|
-
arel (~> 2.0.2)
|
26
|
-
tzinfo (~> 0.3.23)
|
27
|
-
activeresource (3.0.7)
|
28
|
-
activemodel (= 3.0.7)
|
29
|
-
activesupport (= 3.0.7)
|
30
|
-
activesupport (3.0.7)
|
31
|
-
arel (2.0.10)
|
32
|
-
builder (2.1.2)
|
33
|
-
css_parser (1.1.9)
|
34
|
-
erubis (2.6.6)
|
35
|
-
abstract (>= 1.0.0)
|
36
|
-
git (1.2.5)
|
37
|
-
i18n (0.5.0)
|
38
|
-
jeweler (1.5.2)
|
39
|
-
bundler (~> 1.0.0)
|
40
|
-
git (>= 1.2.5)
|
41
|
-
rake
|
42
|
-
mail (2.2.19)
|
43
|
-
activesupport (>= 2.3.6)
|
44
|
-
i18n (>= 0.4.0)
|
45
|
-
mime-types (~> 1.16)
|
46
|
-
treetop (~> 1.4.8)
|
47
|
-
mime-types (1.16)
|
48
|
-
nokogiri (1.4.4)
|
49
|
-
polyglot (0.3.1)
|
50
|
-
rack (1.2.3)
|
51
|
-
rack-mount (0.6.14)
|
52
|
-
rack (>= 1.0.0)
|
53
|
-
rack-test (0.5.7)
|
54
|
-
rack (>= 1.0)
|
55
|
-
rails (3.0.7)
|
56
|
-
actionmailer (= 3.0.7)
|
57
|
-
actionpack (= 3.0.7)
|
58
|
-
activerecord (= 3.0.7)
|
59
|
-
activeresource (= 3.0.7)
|
60
|
-
activesupport (= 3.0.7)
|
61
|
-
bundler (~> 1.0)
|
62
|
-
railties (= 3.0.7)
|
63
|
-
railties (3.0.7)
|
64
|
-
actionpack (= 3.0.7)
|
65
|
-
activesupport (= 3.0.7)
|
66
|
-
rake (>= 0.8.7)
|
67
|
-
thor (~> 0.14.4)
|
68
|
-
rake (0.9.0)
|
69
|
-
rcov (0.9.9)
|
70
|
-
shoulda (2.11.3)
|
71
|
-
thor (0.14.6)
|
72
|
-
treetop (1.4.9)
|
73
|
-
polyglot (>= 0.3.1)
|
74
|
-
tzinfo (0.3.27)
|
75
|
-
|
76
|
-
PLATFORMS
|
77
|
-
ruby
|
78
|
-
|
79
|
-
DEPENDENCIES
|
80
|
-
bundler (~> 1.0.0)
|
81
|
-
css_parser
|
82
|
-
jeweler (~> 1.5.2)
|
83
|
-
nokogiri
|
84
|
-
rails
|
85
|
-
rcov
|
86
|
-
shoulda
|
@@ -1,17 +0,0 @@
|
|
1
|
-
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
-
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
-
|
4
|
-
<html lang="en">
|
5
|
-
<head>
|
6
|
-
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
-
<title>stylesheet_test</title>
|
8
|
-
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
-
<meta name="author" content="John McAliley">
|
10
|
-
<link rel="stylesheet" href="/css/master.css" type="text/css" media="screen" title="no title" charset="utf-8">
|
11
|
-
<link rel="stylesheet" href="/css/master2.css" type="text/css" media="screen" title="no title" charset="utf-8">
|
12
|
-
<!-- Date: 2011-06-01 -->
|
13
|
-
</head>
|
14
|
-
<body>
|
15
|
-
|
16
|
-
</body>
|
17
|
-
</html>
|