image_scraper 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/image_scraper.gemspec +4 -5
- data/lib/image_scraper.rb +3 -75
- data/lib/image_scraper/client.rb +53 -0
- data/lib/image_scraper/railtie.rb +7 -0
- data/lib/image_scraper/util.rb +20 -0
- metadata +6 -5
- data/Gemfile.lock +0 -86
- data/test/resources/stylesheet_test.html +0 -17
data/Gemfile
CHANGED
@@ -2,9 +2,11 @@ source "http://rubygems.org"
|
|
2
2
|
# Add dependencies required to use your gem here.
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
5
6
|
gem "nokogiri"
|
6
7
|
gem "css_parser"
|
7
8
|
gem "rails"
|
9
|
+
|
8
10
|
# Add dependencies to develop your gem here.
|
9
11
|
# Include everything needed to run rake, tests, features, etc.
|
10
12
|
group :development do
|
data/Rakefile
CHANGED
@@ -20,7 +20,8 @@ Jeweler::Tasks.new do |gem|
|
|
20
20
|
gem.email = "john.mcaliley@gmail.com"
|
21
21
|
gem.authors = ["John McAliley"]
|
22
22
|
gem.add_dependency "nokogiri"
|
23
|
-
gem.add_dependency "css_parser"
|
23
|
+
gem.add_dependency "css_parser"
|
24
|
+
gem.files.exclude "test/**/*"
|
24
25
|
end
|
25
26
|
Jeweler::RubygemsDotOrgTasks.new
|
26
27
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/image_scraper.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{image_scraper}
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["John McAliley"]
|
@@ -19,16 +19,15 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
"Gemfile",
|
22
|
-
"Gemfile.lock",
|
23
22
|
"LICENSE.txt",
|
24
23
|
"README.md",
|
25
24
|
"Rakefile",
|
26
25
|
"VERSION",
|
27
26
|
"image_scraper.gemspec",
|
28
27
|
"lib/image_scraper.rb",
|
29
|
-
"
|
30
|
-
"
|
31
|
-
"
|
28
|
+
"lib/image_scraper/client.rb",
|
29
|
+
"lib/image_scraper/railtie.rb",
|
30
|
+
"lib/image_scraper/util.rb"
|
32
31
|
]
|
33
32
|
s.homepage = %q{http://github.com/charlotte-ruby/image_scraper}
|
34
33
|
s.licenses = ["MIT"]
|
data/lib/image_scraper.rb
CHANGED
@@ -3,78 +3,6 @@ require 'rails'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
def initialize(url,options={})
|
11
|
-
options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
|
12
|
-
@url = url
|
13
|
-
@convert_to_absolute_url = options[:convert_to_absolute_url]
|
14
|
-
@include_css_images = options[:include_css_images]
|
15
|
-
@include_css_data_images = options[:include_css_data_images]
|
16
|
-
@doc = Nokogiri::HTML(open url)
|
17
|
-
end
|
18
|
-
|
19
|
-
def image_urls
|
20
|
-
images = page_images
|
21
|
-
images += stylesheet_images if include_css_images
|
22
|
-
images
|
23
|
-
end
|
24
|
-
|
25
|
-
def page_images
|
26
|
-
urls = []
|
27
|
-
doc.xpath("//img").each do |img|
|
28
|
-
image = img["src"]
|
29
|
-
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
30
|
-
urls << image
|
31
|
-
end
|
32
|
-
urls
|
33
|
-
end
|
34
|
-
|
35
|
-
def stylesheet_images
|
36
|
-
images = []
|
37
|
-
stylesheets.each do |stylesheet|
|
38
|
-
file = open(stylesheet)
|
39
|
-
css = file.string rescue IO.read(file)
|
40
|
-
|
41
|
-
images += css.scan(/url\((.*?)\)/).collect do |image_url|
|
42
|
-
if image_url.include?("data:image") and @include_css_data_images
|
43
|
-
image_url[0]
|
44
|
-
else
|
45
|
-
@convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url[0]) : image_url
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
images
|
50
|
-
end
|
51
|
-
|
52
|
-
def stylesheets
|
53
|
-
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
54
|
-
ImageScraper::Util.absolute_url(url,stylesheet['href'])
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
module Util
|
60
|
-
def self.absolute_url(url,asset=nil)
|
61
|
-
return domain(url) + path(url) if asset.nil? and asset.include("://")
|
62
|
-
return asset if asset.include?("://")
|
63
|
-
return domain(url)+asset if asset[0]=="/"
|
64
|
-
return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
|
65
|
-
end
|
66
|
-
|
67
|
-
def self.domain(url)
|
68
|
-
uri = URI.parse(url)
|
69
|
-
"#{uri.scheme}://#{uri.host}"
|
70
|
-
end
|
71
|
-
|
72
|
-
def self.path(url)
|
73
|
-
uri = URI.parse(url)
|
74
|
-
uri.path
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
class Railtie < Rails::Railtie
|
79
|
-
end
|
80
|
-
end
|
6
|
+
require 'image_scraper/railtie'
|
7
|
+
require 'image_scraper/util'
|
8
|
+
require 'image_scraper/client'
|
data/lib/image_scraper/client.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
module ImageScraper
|
2
|
+
class Client
|
3
|
+
attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
|
4
|
+
|
5
|
+
def initialize(url,options={})
|
6
|
+
options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
|
7
|
+
@url = url
|
8
|
+
@convert_to_absolute_url = options[:convert_to_absolute_url]
|
9
|
+
@include_css_images = options[:include_css_images]
|
10
|
+
@include_css_data_images = options[:include_css_data_images]
|
11
|
+
@doc = Nokogiri::HTML(open url)
|
12
|
+
end
|
13
|
+
|
14
|
+
def image_urls
|
15
|
+
images = page_images
|
16
|
+
images += stylesheet_images if include_css_images
|
17
|
+
images
|
18
|
+
end
|
19
|
+
|
20
|
+
def page_images
|
21
|
+
urls = []
|
22
|
+
doc.xpath("//img").each do |img|
|
23
|
+
image = img["src"]
|
24
|
+
image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
|
25
|
+
urls << image
|
26
|
+
end
|
27
|
+
urls
|
28
|
+
end
|
29
|
+
|
30
|
+
def stylesheet_images
|
31
|
+
images = []
|
32
|
+
stylesheets.each do |stylesheet|
|
33
|
+
file = open(stylesheet)
|
34
|
+
css = file.string rescue IO.read(file)
|
35
|
+
|
36
|
+
images += css.scan(/url\((.*?)\)/).collect do |image_url|
|
37
|
+
if image_url.include?("data:image") and @include_css_data_images
|
38
|
+
image_url[0]
|
39
|
+
else
|
40
|
+
@convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url[0]) : image_url
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
images
|
45
|
+
end
|
46
|
+
|
47
|
+
def stylesheets
|
48
|
+
doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
|
49
|
+
ImageScraper::Util.absolute_url(url,stylesheet['href'])
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
data/lib/image_scraper/util.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module ImageScraper
|
2
|
+
module Util
|
3
|
+
def self.absolute_url(url,asset=nil)
|
4
|
+
return domain(url) + path(url) if asset.nil? and asset.include("://")
|
5
|
+
return asset if asset.include?("://")
|
6
|
+
return domain(url)+asset if asset[0]=="/"
|
7
|
+
return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.domain(url)
|
11
|
+
uri = URI.parse(url)
|
12
|
+
"#{uri.scheme}://#{uri.host}"
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.path(url)
|
16
|
+
uri = URI.parse(url)
|
17
|
+
uri.path
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 0
|
9
|
-
version: 0.1.0
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- John McAliley
|
@@ -150,15 +150,16 @@ extra_rdoc_files:
|
|
150
150
|
files:
|
151
151
|
- .document
|
152
152
|
- Gemfile
|
153
|
-
- Gemfile.lock
|
154
153
|
- LICENSE.txt
|
155
154
|
- README.md
|
156
155
|
- Rakefile
|
157
156
|
- VERSION
|
158
157
|
- image_scraper.gemspec
|
159
158
|
- lib/image_scraper.rb
|
159
|
+
- lib/image_scraper/client.rb
|
160
|
+
- lib/image_scraper/railtie.rb
|
161
|
+
- lib/image_scraper/util.rb
|
160
162
|
- test/helper.rb
|
161
|
-
- test/resources/stylesheet_test.html
|
162
163
|
- test/test_image_scraper.rb
|
163
164
|
has_rdoc: true
|
164
165
|
homepage: http://github.com/charlotte-ruby/image_scraper
|
@@ -174,7 +175,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
174
175
|
requirements:
|
175
176
|
- - ">="
|
176
177
|
- !ruby/object:Gem::Version
|
177
|
-
hash:
|
178
|
+
hash: 1275395306475000737
|
178
179
|
segments:
|
179
180
|
- 0
|
180
181
|
version: "0"
|
data/Gemfile.lock
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
GEM
|
2
|
-
remote: http://rubygems.org/
|
3
|
-
specs:
|
4
|
-
abstract (1.0.0)
|
5
|
-
actionmailer (3.0.7)
|
6
|
-
actionpack (= 3.0.7)
|
7
|
-
mail (~> 2.2.15)
|
8
|
-
actionpack (3.0.7)
|
9
|
-
activemodel (= 3.0.7)
|
10
|
-
activesupport (= 3.0.7)
|
11
|
-
builder (~> 2.1.2)
|
12
|
-
erubis (~> 2.6.6)
|
13
|
-
i18n (~> 0.5.0)
|
14
|
-
rack (~> 1.2.1)
|
15
|
-
rack-mount (~> 0.6.14)
|
16
|
-
rack-test (~> 0.5.7)
|
17
|
-
tzinfo (~> 0.3.23)
|
18
|
-
activemodel (3.0.7)
|
19
|
-
activesupport (= 3.0.7)
|
20
|
-
builder (~> 2.1.2)
|
21
|
-
i18n (~> 0.5.0)
|
22
|
-
activerecord (3.0.7)
|
23
|
-
activemodel (= 3.0.7)
|
24
|
-
activesupport (= 3.0.7)
|
25
|
-
arel (~> 2.0.2)
|
26
|
-
tzinfo (~> 0.3.23)
|
27
|
-
activeresource (3.0.7)
|
28
|
-
activemodel (= 3.0.7)
|
29
|
-
activesupport (= 3.0.7)
|
30
|
-
activesupport (3.0.7)
|
31
|
-
arel (2.0.10)
|
32
|
-
builder (2.1.2)
|
33
|
-
css_parser (1.1.9)
|
34
|
-
erubis (2.6.6)
|
35
|
-
abstract (>= 1.0.0)
|
36
|
-
git (1.2.5)
|
37
|
-
i18n (0.5.0)
|
38
|
-
jeweler (1.5.2)
|
39
|
-
bundler (~> 1.0.0)
|
40
|
-
git (>= 1.2.5)
|
41
|
-
rake
|
42
|
-
mail (2.2.19)
|
43
|
-
activesupport (>= 2.3.6)
|
44
|
-
i18n (>= 0.4.0)
|
45
|
-
mime-types (~> 1.16)
|
46
|
-
treetop (~> 1.4.8)
|
47
|
-
mime-types (1.16)
|
48
|
-
nokogiri (1.4.4)
|
49
|
-
polyglot (0.3.1)
|
50
|
-
rack (1.2.3)
|
51
|
-
rack-mount (0.6.14)
|
52
|
-
rack (>= 1.0.0)
|
53
|
-
rack-test (0.5.7)
|
54
|
-
rack (>= 1.0)
|
55
|
-
rails (3.0.7)
|
56
|
-
actionmailer (= 3.0.7)
|
57
|
-
actionpack (= 3.0.7)
|
58
|
-
activerecord (= 3.0.7)
|
59
|
-
activeresource (= 3.0.7)
|
60
|
-
activesupport (= 3.0.7)
|
61
|
-
bundler (~> 1.0)
|
62
|
-
railties (= 3.0.7)
|
63
|
-
railties (3.0.7)
|
64
|
-
actionpack (= 3.0.7)
|
65
|
-
activesupport (= 3.0.7)
|
66
|
-
rake (>= 0.8.7)
|
67
|
-
thor (~> 0.14.4)
|
68
|
-
rake (0.9.0)
|
69
|
-
rcov (0.9.9)
|
70
|
-
shoulda (2.11.3)
|
71
|
-
thor (0.14.6)
|
72
|
-
treetop (1.4.9)
|
73
|
-
polyglot (>= 0.3.1)
|
74
|
-
tzinfo (0.3.27)
|
75
|
-
|
76
|
-
PLATFORMS
|
77
|
-
ruby
|
78
|
-
|
79
|
-
DEPENDENCIES
|
80
|
-
bundler (~> 1.0.0)
|
81
|
-
css_parser
|
82
|
-
jeweler (~> 1.5.2)
|
83
|
-
nokogiri
|
84
|
-
rails
|
85
|
-
rcov
|
86
|
-
shoulda
|
data/test/resources/stylesheet_test.html
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
2
|
-
"http://www.w3.org/TR/html4/strict.dtd">
|
3
|
-
|
4
|
-
<html lang="en">
|
5
|
-
<head>
|
6
|
-
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
7
|
-
<title>stylesheet_test</title>
|
8
|
-
<meta name="generator" content="TextMate http://macromates.com/">
|
9
|
-
<meta name="author" content="John McAliley">
|
10
|
-
<link rel="stylesheet" href="/css/master.css" type="text/css" media="screen" title="no title" charset="utf-8">
|
11
|
-
<link rel="stylesheet" href="/css/master2.css" type="text/css" media="screen" title="no title" charset="utf-8">
|
12
|
-
<!-- Date: 2011-06-01 -->
|
13
|
-
</head>
|
14
|
-
<body>
|
15
|
-
|
16
|
-
</body>
|
17
|
-
</html>
|