image_scraper 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -2,9 +2,11 @@ source "http://rubygems.org"
2
2
  # Add dependencies required to use your gem here.
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
+
5
6
  gem "nokogiri"
6
7
  gem "css_parser"
7
8
  gem "rails"
9
+
8
10
  # Add dependencies to develop your gem here.
9
11
  # Include everything needed to run rake, tests, features, etc.
10
12
  group :development do
data/Rakefile CHANGED
@@ -20,7 +20,8 @@ Jeweler::Tasks.new do |gem|
20
20
  gem.email = "john.mcaliley@gmail.com"
21
21
  gem.authors = ["John McAliley"]
22
22
  gem.add_dependency "nokogiri"
23
- gem.add_dependency "css_parser"
23
+ gem.add_dependency "css_parser"
24
+ gem.files.exclude "test/**/*"
24
25
  end
25
26
  Jeweler::RubygemsDotOrgTasks.new
26
27
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.1.1
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{image_scraper}
8
- s.version = "0.1.0"
8
+ s.version = "0.1.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["John McAliley"]
@@ -19,16 +19,15 @@ Gem::Specification.new do |s|
19
19
  s.files = [
20
20
  ".document",
21
21
  "Gemfile",
22
- "Gemfile.lock",
23
22
  "LICENSE.txt",
24
23
  "README.md",
25
24
  "Rakefile",
26
25
  "VERSION",
27
26
  "image_scraper.gemspec",
28
27
  "lib/image_scraper.rb",
29
- "test/helper.rb",
30
- "test/resources/stylesheet_test.html",
31
- "test/test_image_scraper.rb"
28
+ "lib/image_scraper/client.rb",
29
+ "lib/image_scraper/railtie.rb",
30
+ "lib/image_scraper/util.rb"
32
31
  ]
33
32
  s.homepage = %q{http://github.com/charlotte-ruby/image_scraper}
34
33
  s.licenses = ["MIT"]
data/lib/image_scraper.rb CHANGED
@@ -3,78 +3,6 @@ require 'rails'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
5
 
6
- module ImageScraper
7
- class Client
8
- attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
9
-
10
- def initialize(url,options={})
11
- options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
12
- @url = url
13
- @convert_to_absolute_url = options[:convert_to_absolute_url]
14
- @include_css_images = options[:include_css_images]
15
- @include_css_data_images = options[:include_css_data_images]
16
- @doc = Nokogiri::HTML(open url)
17
- end
18
-
19
- def image_urls
20
- images = page_images
21
- images += stylesheet_images if include_css_images
22
- images
23
- end
24
-
25
- def page_images
26
- urls = []
27
- doc.xpath("//img").each do |img|
28
- image = img["src"]
29
- image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
30
- urls << image
31
- end
32
- urls
33
- end
34
-
35
- def stylesheet_images
36
- images = []
37
- stylesheets.each do |stylesheet|
38
- file = open(stylesheet)
39
- css = file.string rescue IO.read(file)
40
-
41
- images += css.scan(/url\((.*?)\)/).collect do |image_url|
42
- if image_url.include?("data:image") and @include_css_data_images
43
- image_url[0]
44
- else
45
- @convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url[0]) : image_url
46
- end
47
- end
48
- end
49
- images
50
- end
51
-
52
- def stylesheets
53
- doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
54
- ImageScraper::Util.absolute_url(url,stylesheet['href'])
55
- end
56
- end
57
- end
58
-
59
- module Util
60
- def self.absolute_url(url,asset=nil)
61
- return domain(url) + path(url) if asset.nil? and asset.include("://")
62
- return asset if asset.include?("://")
63
- return domain(url)+asset if asset[0]=="/"
64
- return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
65
- end
66
-
67
- def self.domain(url)
68
- uri = URI.parse(url)
69
- "#{uri.scheme}://#{uri.host}"
70
- end
71
-
72
- def self.path(url)
73
- uri = URI.parse(url)
74
- uri.path
75
- end
76
- end
77
-
78
- class Railtie < Rails::Railtie
79
- end
80
- end
6
+ require 'image_scraper/railtie'
7
+ require 'image_scraper/util'
8
+ require 'image_scraper/client'
@@ -0,0 +1,53 @@
1
+ module ImageScraper
2
+ class Client
3
+ attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
4
+
5
+ def initialize(url,options={})
6
+ options.reverse_merge!(:convert_to_absolute_url=>true,:include_css_images=>true, :include_css_data_images=>false)
7
+ @url = url
8
+ @convert_to_absolute_url = options[:convert_to_absolute_url]
9
+ @include_css_images = options[:include_css_images]
10
+ @include_css_data_images = options[:include_css_data_images]
11
+ @doc = Nokogiri::HTML(open url)
12
+ end
13
+
14
+ def image_urls
15
+ images = page_images
16
+ images += stylesheet_images if include_css_images
17
+ images
18
+ end
19
+
20
+ def page_images
21
+ urls = []
22
+ doc.xpath("//img").each do |img|
23
+ image = img["src"]
24
+ image = ImageScraper::Util.absolute_url(url,image) if convert_to_absolute_url
25
+ urls << image
26
+ end
27
+ urls
28
+ end
29
+
30
+ def stylesheet_images
31
+ images = []
32
+ stylesheets.each do |stylesheet|
33
+ file = open(stylesheet)
34
+ css = file.string rescue IO.read(file)
35
+
36
+ images += css.scan(/url\((.*?)\)/).collect do |image_url|
37
+ if image_url.include?("data:image") and @include_css_data_images
38
+ image_url[0]
39
+ else
40
+ @convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url[0]) : image_url
41
+ end
42
+ end
43
+ end
44
+ images
45
+ end
46
+
47
+ def stylesheets
48
+ doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet|
49
+ ImageScraper::Util.absolute_url(url,stylesheet['href'])
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,7 @@
1
+ require 'rails'
2
+ require 'image_scraper'
3
+
4
+ module ImageScraper
5
+ class Railtie < Rails::Railtie
6
+ end
7
+ end
@@ -0,0 +1,20 @@
1
+ module ImageScraper
2
+ module Util
3
+ def self.absolute_url(url,asset=nil)
4
+ return domain(url) + path(url) if asset.nil? and asset.include("://")
5
+ return asset if asset.include?("://")
6
+ return domain(url)+asset if asset[0]=="/"
7
+ return domain(url) =~ /\/$/ ? domain(url)+asset : domain(url)+"/"+asset
8
+ end
9
+
10
+ def self.domain(url)
11
+ uri = URI.parse(url)
12
+ "#{uri.scheme}://#{uri.host}"
13
+ end
14
+
15
+ def self.path(url)
16
+ uri = URI.parse(url)
17
+ uri.path
18
+ end
19
+ end
20
+ end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 0
9
- version: 0.1.0
8
+ - 1
9
+ version: 0.1.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - John McAliley
@@ -150,15 +150,16 @@ extra_rdoc_files:
150
150
  files:
151
151
  - .document
152
152
  - Gemfile
153
- - Gemfile.lock
154
153
  - LICENSE.txt
155
154
  - README.md
156
155
  - Rakefile
157
156
  - VERSION
158
157
  - image_scraper.gemspec
159
158
  - lib/image_scraper.rb
159
+ - lib/image_scraper/client.rb
160
+ - lib/image_scraper/railtie.rb
161
+ - lib/image_scraper/util.rb
160
162
  - test/helper.rb
161
- - test/resources/stylesheet_test.html
162
163
  - test/test_image_scraper.rb
163
164
  has_rdoc: true
164
165
  homepage: http://github.com/charlotte-ruby/image_scraper
@@ -174,7 +175,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
174
175
  requirements:
175
176
  - - ">="
176
177
  - !ruby/object:Gem::Version
177
- hash: 44385371017236818
178
+ hash: 1275395306475000737
178
179
  segments:
179
180
  - 0
180
181
  version: "0"
data/Gemfile.lock DELETED
@@ -1,86 +0,0 @@
1
- GEM
2
- remote: http://rubygems.org/
3
- specs:
4
- abstract (1.0.0)
5
- actionmailer (3.0.7)
6
- actionpack (= 3.0.7)
7
- mail (~> 2.2.15)
8
- actionpack (3.0.7)
9
- activemodel (= 3.0.7)
10
- activesupport (= 3.0.7)
11
- builder (~> 2.1.2)
12
- erubis (~> 2.6.6)
13
- i18n (~> 0.5.0)
14
- rack (~> 1.2.1)
15
- rack-mount (~> 0.6.14)
16
- rack-test (~> 0.5.7)
17
- tzinfo (~> 0.3.23)
18
- activemodel (3.0.7)
19
- activesupport (= 3.0.7)
20
- builder (~> 2.1.2)
21
- i18n (~> 0.5.0)
22
- activerecord (3.0.7)
23
- activemodel (= 3.0.7)
24
- activesupport (= 3.0.7)
25
- arel (~> 2.0.2)
26
- tzinfo (~> 0.3.23)
27
- activeresource (3.0.7)
28
- activemodel (= 3.0.7)
29
- activesupport (= 3.0.7)
30
- activesupport (3.0.7)
31
- arel (2.0.10)
32
- builder (2.1.2)
33
- css_parser (1.1.9)
34
- erubis (2.6.6)
35
- abstract (>= 1.0.0)
36
- git (1.2.5)
37
- i18n (0.5.0)
38
- jeweler (1.5.2)
39
- bundler (~> 1.0.0)
40
- git (>= 1.2.5)
41
- rake
42
- mail (2.2.19)
43
- activesupport (>= 2.3.6)
44
- i18n (>= 0.4.0)
45
- mime-types (~> 1.16)
46
- treetop (~> 1.4.8)
47
- mime-types (1.16)
48
- nokogiri (1.4.4)
49
- polyglot (0.3.1)
50
- rack (1.2.3)
51
- rack-mount (0.6.14)
52
- rack (>= 1.0.0)
53
- rack-test (0.5.7)
54
- rack (>= 1.0)
55
- rails (3.0.7)
56
- actionmailer (= 3.0.7)
57
- actionpack (= 3.0.7)
58
- activerecord (= 3.0.7)
59
- activeresource (= 3.0.7)
60
- activesupport (= 3.0.7)
61
- bundler (~> 1.0)
62
- railties (= 3.0.7)
63
- railties (3.0.7)
64
- actionpack (= 3.0.7)
65
- activesupport (= 3.0.7)
66
- rake (>= 0.8.7)
67
- thor (~> 0.14.4)
68
- rake (0.9.0)
69
- rcov (0.9.9)
70
- shoulda (2.11.3)
71
- thor (0.14.6)
72
- treetop (1.4.9)
73
- polyglot (>= 0.3.1)
74
- tzinfo (0.3.27)
75
-
76
- PLATFORMS
77
- ruby
78
-
79
- DEPENDENCIES
80
- bundler (~> 1.0.0)
81
- css_parser
82
- jeweler (~> 1.5.2)
83
- nokogiri
84
- rails
85
- rcov
86
- shoulda
@@ -1,17 +0,0 @@
1
- <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
2
- "http://www.w3.org/TR/html4/strict.dtd">
3
-
4
- <html lang="en">
5
- <head>
6
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
- <title>stylesheet_test</title>
8
- <meta name="generator" content="TextMate http://macromates.com/">
9
- <meta name="author" content="John McAliley">
10
- <link rel="stylesheet" href="/css/master.css" type="text/css" media="screen" title="no title" charset="utf-8">
11
- <link rel="stylesheet" href="/css/master2.css" type="text/css" media="screen" title="no title" charset="utf-8">
12
- <!-- Date: 2011-06-01 -->
13
- </head>
14
- <body>
15
-
16
- </body>
17
- </html>