image_scraper 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -2,9 +2,11 @@ source "http://rubygems.org"
2
2
  # Add dependencies required to use your gem here.
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
+
5
6
  gem "nokogiri"
6
7
  gem "css_parser"
7
8
  gem "rails"
9
+
8
10
  # Add dependencies to develop your gem here.
9
11
  # Include everything needed to run rake, tests, features, etc.
10
12
  group :development do
data/Rakefile CHANGED
@@ -20,7 +20,8 @@ Jeweler::Tasks.new do |gem|
20
20
  gem.email = "john.mcaliley@gmail.com"
21
21
  gem.authors = ["John McAliley"]
22
22
  gem.add_dependency "nokogiri"
23
- gem.add_dependency "css_parser"
23
+ gem.add_dependency "css_parser"
24
+ gem.files.exclude "test/**/*"
24
25
  end
25
26
  Jeweler::RubygemsDotOrgTasks.new
26
27
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.1.1
data/image_scraper.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{image_scraper}
8
- s.version = "0.1.0"
8
+ s.version = "0.1.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John McAliley"]
@@ -19,16 +19,15 @@ Gem::Specification.new do |s|
19
19
  s.files = [
20
20
  ".document",
21
21
  "Gemfile",
22
- "Gemfile.lock",
23
22
  "LICENSE.txt",
24
23
  "README.md",
25
24
  "Rakefile",
26
25
  "VERSION",
27
26
  "image_scraper.gemspec",
28
27
  "lib/image_scraper.rb",
29
- "test/helper.rb",
30
- "test/resources/stylesheet_test.html",
31
- "test/test_image_scraper.rb"
28
+ "lib/image_scraper/client.rb",
29
+ "lib/image_scraper/railtie.rb",
30
+ "lib/image_scraper/util.rb"
32
31
  ]
33
32
  s.homepage = %q{http://github.com/charlotte-ruby/image_scraper}
34
33
  s.licenses = ["MIT"]
data/lib/image_scraper.rb CHANGED
@@ -3,78 +3,6 @@ require 'rails'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
5
 
6
- module ImageScraper
7
- class Client
8
- attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
9
-
10
- def initialize(url,options={})
11
- options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
12
- @url = url
13
- @convert_to_absolute_url = options[:convert_to_absolute_url]
14
- @include_css_images = options[:include_css_images]
15
- @include_css_data_images = options[:include_css_data_images]
16
- @doc = Nokogiri::HTML(open url)
17
- end
18
-
19
- def image_urls
20
- images = page_images
21
- images += stylesheet_images if include_css_images
22
- images
23
- end
24
-
25
- def page_images
26
- urls = []
27
- doc.xpath("//img").each do |img|
28
- image = img["src"]
29
- image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
30
- urls << image
31
- end
32
- urls
33
- end
34
-
35
- def stylesheet_images
36
- images = []
37
- stylesheets.each do |stylesheet|
38
- file = open(stylesheet)
39
- css = file.string rescue IO.read(file)
40
-
41
- images += css.scan(/url\((.*?)\)/).collect do |image_url|
42
- if image_url.include?("data:image") and @include_css_data_images
43
- image_url[0]
44
- else
45
- @convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url[0]) : image_url
46
- end
47
- end
48
- end
49
- images
50
- end
51
-
52
- def stylesheets
53
- doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
54
- ImageScraper::Util.absolute_url(url,stylesheet['href'])
55
- end
56
- end
57
- end
58
-
59
- module Util
60
- def self.absolute_url(url,asset=nil)
61
- return domain(url) + path(url) if asset.nil? and asset.include("://")
62
- return asset if asset.include?("://")
63
- return domain(url)+asset if asset[0]=="/"
64
- return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
65
- end
66
-
67
- def self.domain(url)
68
- uri = URI.parse(url)
69
- "#{uri.scheme}://#{uri.host}"
70
- end
71
-
72
- def self.path(url)
73
- uri = URI.parse(url)
74
- uri.path
75
- end
76
- end
77
-
78
- class Railtie < Rails::Railtie
79
- end
80
- end
6
+ require 'image_scraper/railtie'
7
+ require 'image_scraper/util'
8
+ require 'image_scraper/client'
data/lib/image_scraper/client.rb ADDED
@@ -0,0 +1,53 @@
1
+ module ImageScraper
2
+ class Client
3
+ attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
4
+
5
+ def initialize(url,options={})
6
+ options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
7
+ @url = url
8
+ @convert_to_absolute_url = options[:convert_to_absolute_url]
9
+ @include_css_images = options[:include_css_images]
10
+ @include_css_data_images = options[:include_css_data_images]
11
+ @doc = Nokogiri::HTML(open url)
12
+ end
13
+
14
+ def image_urls
15
+ images = page_images
16
+ images += stylesheet_images if include_css_images
17
+ images
18
+ end
19
+
20
+ def page_images
21
+ urls = []
22
+ doc.xpath("//img").each do |img|
23
+ image = img["src"]
24
+ image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
25
+ urls << image
26
+ end
27
+ urls
28
+ end
29
+
30
+ def stylesheet_images
31
+ images = []
32
+ stylesheets.each do |stylesheet|
33
+ file = open(stylesheet)
34
+ css = file.string rescue IO.read(file)
35
+
36
+ images += css.scan(/url\((.*?)\)/).collect do |image_url|
37
+ if image_url.include?("data:image") and @include_css_data_images
38
+ image_url[0]
39
+ else
40
+ @convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url[0]) : image_url
41
+ end
42
+ end
43
+ end
44
+ images
45
+ end
46
+
47
+ def stylesheets
48
+ doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
49
+ ImageScraper::Util.absolute_url(url,stylesheet['href'])
50
+ end
51
+ end
52
+ end
53
+ end
data/lib/image_scraper/railtie.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'rails'
2
+ require 'image_scraper'
3
+
4
+ module ImageScraper
5
+ class Railtie < Rails::Railtie
6
+ end
7
+ end
data/lib/image_scraper/util.rb ADDED
@@ -0,0 +1,20 @@
1
+ module ImageScraper
2
+ module Util
3
+ def self.absolute_url(url,asset=nil)
4
+ return domain(url) + path(url) if asset.nil? and asset.include("://")
5
+ return asset if asset.include?("://")
6
+ return domain(url)+asset if asset[0]=="/"
7
+ return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
8
+ end
9
+
10
+ def self.domain(url)
11
+ uri = URI.parse(url)
12
+ "#{uri.scheme}://#{uri.host}"
13
+ end
14
+
15
+ def self.path(url)
16
+ uri = URI.parse(url)
17
+ uri.path
18
+ end
19
+ end
20
+ end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 0
9
- version: 0.1.0
8
+ - 1
9
+ version: 0.1.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - John McAliley
@@ -150,15 +150,16 @@ extra_rdoc_files:
150
150
  files:
151
151
  - .document
152
152
  - Gemfile
153
- - Gemfile.lock
154
153
  - LICENSE.txt
155
154
  - README.md
156
155
  - Rakefile
157
156
  - VERSION
158
157
  - image_scraper.gemspec
159
158
  - lib/image_scraper.rb
159
+ - lib/image_scraper/client.rb
160
+ - lib/image_scraper/railtie.rb
161
+ - lib/image_scraper/util.rb
160
162
  - test/helper.rb
161
- - test/resources/stylesheet_test.html
162
163
  - test/test_image_scraper.rb
163
164
  has_rdoc: true
164
165
  homepage: http://github.com/charlotte-ruby/image_scraper
@@ -174,7 +175,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
174
175
  requirements:
175
176
  - - ">="
176
177
  - !ruby/object:Gem::Version
177
- hash: 44385371017236818
178
+ hash: 1275395306475000737
178
179
  segments:
179
180
  - 0
180
181
  version: "0"
data/Gemfile.lock DELETED
@@ -1,86 +0,0 @@
1
- GEM
2
- remote: http://rubygems.org/
3
- specs:
4
- abstract (1.0.0)
5
- actionmailer (3.0.7)
6
- actionpack (= 3.0.7)
7
- mail (~> 2.2.15)
8
- actionpack (3.0.7)
9
- activemodel (= 3.0.7)
10
- activesupport (= 3.0.7)
11
- builder (~> 2.1.2)
12
- erubis (~> 2.6.6)
13
- i18n (~> 0.5.0)
14
- rack (~> 1.2.1)
15
- rack-mount (~> 0.6.14)
16
- rack-test (~> 0.5.7)
17
- tzinfo (~> 0.3.23)
18
- activemodel (3.0.7)
19
- activesupport (= 3.0.7)
20
- builder (~> 2.1.2)
21
- i18n (~> 0.5.0)
22
- activerecord (3.0.7)
23
- activemodel (= 3.0.7)
24
- activesupport (= 3.0.7)
25
- arel (~> 2.0.2)
26
- tzinfo (~> 0.3.23)
27
- activeresource (3.0.7)
28
- activemodel (= 3.0.7)
29
- activesupport (= 3.0.7)
30
- activesupport (3.0.7)
31
- arel (2.0.10)
32
- builder (2.1.2)
33
- css_parser (1.1.9)
34
- erubis (2.6.6)
35
- abstract (>= 1.0.0)
36
- git (1.2.5)
37
- i18n (0.5.0)
38
- jeweler (1.5.2)
39
- bundler (~> 1.0.0)
40
- git (>= 1.2.5)
41
- rake
42
- mail (2.2.19)
43
- activesupport (>= 2.3.6)
44
- i18n (>= 0.4.0)
45
- mime-types (~> 1.16)
46
- treetop (~> 1.4.8)
47
- mime-types (1.16)
48
- nokogiri (1.4.4)
49
- polyglot (0.3.1)
50
- rack (1.2.3)
51
- rack-mount (0.6.14)
52
- rack (>= 1.0.0)
53
- rack-test (0.5.7)
54
- rack (>= 1.0)
55
- rails (3.0.7)
56
- actionmailer (= 3.0.7)
57
- actionpack (= 3.0.7)
58
- activerecord (= 3.0.7)
59
- activeresource (= 3.0.7)
60
- activesupport (= 3.0.7)
61
- bundler (~> 1.0)
62
- railties (= 3.0.7)
63
- railties (3.0.7)
64
- actionpack (= 3.0.7)
65
- activesupport (= 3.0.7)
66
- rake (>= 0.8.7)
67
- thor (~> 0.14.4)
68
- rake (0.9.0)
69
- rcov (0.9.9)
70
- shoulda (2.11.3)
71
- thor (0.14.6)
72
- treetop (1.4.9)
73
- polyglot (>= 0.3.1)
74
- tzinfo (0.3.27)
75
-
76
- PLATFORMS
77
- ruby
78
-
79
- DEPENDENCIES
80
- bundler (~> 1.0.0)
81
- css_parser
82
- jeweler (~> 1.5.2)
83
- nokogiri
84
- rails
85
- rcov
86
- shoulda
data/test/resources/stylesheet_test.html DELETED
@@ -1,17 +0,0 @@
1
- <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
- "http://www.w3.org/TR/html4/strict.dtd">
3
-
4
- <html lang="en">
5
- <head>
6
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
- <title>stylesheet_test</title>
8
- <meta name="generator" content="TextMate http://macromates.com/">
9
- <meta name="author" content="John McAliley">
10
- <link rel="stylesheet" href="/css/master.css" type="text/css" media="screen" title="no title" charset="utf-8">
11
- <link rel="stylesheet" href="/css/master2.css" type="text/css" media="screen" title="no title" charset="utf-8">
12
- <!-- Date: 2011-06-01 -->
13
- </head>
14
- <body>
15
-
16
- </body>
17
- </html>