image_downloader 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Malykh Oleg
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = image_downloader
2
+
3
+ Parses a web page, finds images in the specified locations and downloads them in parallel or sequentially.
4
+
5
+ == Contributing to image_downloader
6
+
7
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
8
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
9
+ * Fork the project
10
+ * Start a feature/bugfix branch
11
+ * Commit and push until you are happy with your contribution
12
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2011 Malykh Oleg. See LICENSE.txt for
18
+ further details.
19
+
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby
# Command-line entry point (presumably bin/download_any_images -- the file
# header is not visible here, confirm against the gemspec).
# ARGV[0] is the page URL and ARGV[1] the directory to save into. Anything in
# the page that looks like an image reference is collected and downloaded.

require 'image_downloader'

page_url, target_dir = ARGV[0], ARGV[1]

job = ImageDownloader::Process.new(page_url, target_dir)
job.parse(:any_looks_like_image => true)
job.download
data/bin/download_icon ADDED
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby
# Command-line entry point: downloads only the icon referenced by the page's
# <link> element. ARGV[0] is the page URL, ARGV[1] the directory to save into.

require 'image_downloader'

page_url, target_dir = ARGV[0], ARGV[1]

job = ImageDownloader::Process.new(page_url, target_dir)
job.parse(:collect => {:link_icon => true})
job.download
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby
# Command-line entry point (presumably bin/download_images -- the file header
# is not visible here, confirm against the gemspec).
# ARGV[0] is the page URL and ARGV[1] the directory to save into. With no
# options, parse collects images from every supported place in the page.

require 'image_downloader'

page_url, target_dir = ARGV[0], ARGV[1]

job = ImageDownloader::Process.new(page_url, target_dir)
job.parse
job.download
@@ -0,0 +1,82 @@
1
+ # gems
2
+ require 'nokogiri'
3
+ # core lib
4
+ require 'open-uri'
5
+ require 'thread'
6
+ require 'uri'
7
+ require 'net/http'
8
+ require 'optparse'
9
+ # local lib
10
+ require 'image_downloader/images'
11
+ require 'image_downloader/url'
12
+ require 'image_downloader/arguments'
13
+ require 'image_downloader/parser'
14
+ require 'image_downloader/download'
15
+
16
module ImageDownloader

  # Command-line flags are consumed as soon as the library is loaded.
  # -d / --debug turns on progress output through the $debug_option global.
  # parse! removes the flags it recognises from ARGV, so the executables only
  # see the positional URL and path arguments afterwards.
  OptionParser.new do |opts|
    opts.on("-d", "--debug", "Run debug mode") do |_flag|
      $debug_option = true
    end
  end.parse!

  # Ties argument handling, page parsing and downloading together.
  class Process
    attr_accessor :argument, :images

    # url  - address of the page to scan
    # path - directory the images are written to
    # Both values are validated and normalized by Arguments.
    def initialize(url, path)
      @argument = Arguments.new(url, path)
      @argument.check
      @argument.normalize
      @images = []
    end

    # Fetches the page and fills #images.
    #
    # Supported options:
    #   :any_looks_like_image => true
    #   :ignore_without => {:(extension|image_extension) => true}
    # Nokogiri gem is required:
    #   :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
    #
    # The hash passed in is never modified; the collect settings are rebuilt
    # on a private copy.
    def parse(h={:collect => {}, :ignore_without => {}})
      h = h.merge(:collect => (h[:collect] || {}).dup)
      self.rebuild_collect_hash(h)

      parser = Parser.new(self.argument.url)
      if h[:any_looks_like_image]
        parser.get_content_raw
        parser.get_images_raw(self.argument.path, h[:collect])
      else
        parser.get_content
        parser.get_images(self.argument.path, h[:collect])
      end

      parser.ignore_file_without(h[:ignore_without])

      self.images = parser.images
    end

    # Downloads the collected images.
    # Pass :parallel (the default) or :consequentially; any other value is
    # ignored and nothing is downloaded.
    def download(*args)
      case args.first
      when nil, false, :parallel
        Download.parallel(self.images)
      when :consequentially
        Download.consequentially(self.images)
      end
    end

    protected

    # Rewrites h[:collect] in place so that its keys are the names of the
    # Parser collector methods. A missing or empty hash, or one with :all
    # set, selects every collector. Otherwise each key gets the collector
    # prefix and keys that do not name a collector are dropped.
    def rebuild_collect_hash(h={})
      requested = h[:collect]
      if !requested || requested.empty? || requested[:all]
        h[:collect] = Parser.all_image_places
      else
        known = Parser.all_image_places
        # keys returns a snapshot, so adding entries while iterating is safe
        requested.keys.each do |k|
          requested[(Parser::COLLECT_METHODS_PREFIX + k.to_s).to_sym] = true
        end
        requested.delete_if { |k, _v| !known.has_key?(k) }
      end
    end

  end
end
@@ -0,0 +1,27 @@
1
module ImageDownloader
  # Holds the two positional arguments (page URL and target directory) and
  # knows how to validate and normalize them.
  class Arguments
    attr_accessor :url, :path

    def initialize(url, path)
      @url = url
      @path = path
    end

    # Terminates the process when either argument is missing.
    # The message goes to stderr and the exit status is non-zero, so shells
    # and calling scripts can tell a usage error from a successful run.
    def check
      abort "Not specified url" unless self.url
      abort "Not specified path" unless self.path
    end

    # Adds a scheme to the URL when it has none (via URL.normalize) and
    # strips trailing slashes from the path.
    def normalize
      self.url = URL.normalize(self.url)
      self.path = self.path.gsub(/\/+\z/, '')
    end

  end
end
@@ -0,0 +1,23 @@
1
module ImageDownloader
  # Download strategies for a list of image objects. Each image must respond
  # to #download, #absolute_src and #file_name.
  class Download

    # Starts one thread per image and waits for all of them to finish.
    # Returns the array of threads.
    def self.parallel(images)
      threads = images.map do |image|
        # the image is passed as a thread argument so every thread works on
        # its own local reference
        Thread.new(image) do |local_image|
          p "download from url #{local_image.absolute_src} to file #{local_image.file_name}" if $debug_option
          local_image.download
        end
      end
      threads.each { |thread| thread.join }
    end

    # Downloads the images one after another, in list order.
    # Returns the images collection.
    def self.consequentially(images)
      images.each do |image|
        p "download from url #{image.absolute_src} to file #{image.file_name}" if $debug_option
        image.download
      end
    end

  end
end
@@ -0,0 +1,61 @@
1
module ImageDownloader
  # One image reference found on a page, together with the local file it is
  # saved to.
  class Images
    attr_accessor :src, :file_name, :page_host, :absolute_src, :file_path_name

    MAX_FILE_NAME_LENGTH_ALLOWED = 200
    IMAGE_EXTENSIONS = ["jpg","jpeg","png","gif","ico","svg","bmp"].freeze
    EMPTY_FILE_NAME = 'EMPTY_'

    # page_host - host name of the page the reference was found on
    # src       - the reference as written in the page (absolute or relative)
    # h         - :file_name_prefix (default: none) and
    #             :catalog_path (default: current directory)
    def initialize(page_host,src,h = {})
      @page_host = page_host
      @src = src

      # The local name is whatever follows the last slash. An empty name gets
      # a random placeholder, and an over-long name keeps only its tail so
      # that opening the file does not fail with Errno::ENAMETOOLONG.
      file_name_suffix = @src.sub(/.*\//,'')
      file_name_suffix = EMPTY_FILE_NAME + rand(100000).to_s if file_name_suffix.empty?
      if file_name_suffix.size > MAX_FILE_NAME_LENGTH_ALLOWED
        file_name_suffix = file_name_suffix[-MAX_FILE_NAME_LENGTH_ALLOWED..-1]
      end

      @file_name = h[:file_name_prefix].to_s + file_name_suffix
      @file_path_name = (h[:catalog_path] || '.').to_s + '/' + @file_name
      @absolute_src = build_absolute_src
    end

    # Fetches the image and writes it to file_path_name.
    def download
      url = URI.parse(self.absolute_src)
      # request_uri keeps the query string and is "/" for an empty path
      request = Net::HTTP::Get.new(url.request_uri)
      Net::HTTP.start(url.host, url.port, :use_ssl => (url.scheme == 'https')) {|http|
        # Some web servers answer 403/404 when they detect a script client.
        # Alternatives considered:
        # - watir (with js support and more), but very slow
        # - mechanize (full web client), slow
        # - wget, quick, but cannot cope with some 403/404 responses
        # - sockets, independent request, quick, but low-level (many lines of code)
        self.download_by_segment(http,request)
        # self.download_simple(http,request)
      }
    end

    # Streams the response body to the file segment by segment, so the whole
    # image never has to be held in memory.
    def download_by_segment(http,request)
      File.open(self.file_path_name, "wb") do |file|
        http.request_get(request.path, "User-Agent"=> "Mozilla/5.0") do |response|
          response.read_body do |segment|
            file.write(segment)
          end
        end
      end
    end

    # Reads the whole response body first and then writes it in one go.
    def download_simple(http,request)
      response = http.get(request.path, "User-Agent"=> "Mozilla/5.0")
      File.open(self.file_path_name, "wb") { |file|
        file.write(response.body)
      }
    end

    private

    # Turns src into a full URL.
    # - a src that starts with http:// or https:// is used as is
    # - a protocol-relative src (//host/path) only gets the scheme added
    # - anything else is appended to the page host, without doubling the slash
    # NOTE(review): relative references are resolved against the host root,
    # not against the directory of the page -- the page path is not known here.
    def build_absolute_src
      case @src
      when /\Ahttps?:\/\//i
        @src
      when /\A\/\//
        'http:' + @src
      else
        'http://' + @page_host + '/' + @src.sub(/\A\/+/, '')
      end
    end

  end
end
@@ -0,0 +1,111 @@
1
class Array
  # Builds a hash whose keys are the elements and whose values are the
  # block results: [a, b] => {a => block(a), b => block(b)}.
  # The hash is filled pair by pair, so elements or block results that are
  # arrays themselves are stored intact.
  def to_hash_keys(&block)
    inject({}) do |hash, v|
      hash[v] = block.call(v)
      hash
    end
  end

  # Builds a hash whose keys are the block results and whose values are the
  # elements: [a, b] => {block(a) => a, block(b) => b}.
  # When two elements produce the same key the later element wins.
  def to_hash_values(&block)
    inject({}) do |hash, v|
      hash[block.call(v)] = v
      hash
    end
  end
end
14
+
15
module ImageDownloader
  # Fetches a page and collects the image references found in it as Images
  # objects (see #images).
  class Parser
    attr_accessor :url, :argument_url, :content, :images, :images_hash

    A_HREF_IMAGE_PREFIX = '_a_href_'
    STYLE_URL_IMAGE_PREFIX = '_style_url_'
    LINK_ICON_IMAGE_PREFIX = '_link_icon_'
    COLLECT_METHODS_PREFIX = 'collect_from_'

    # url - address of the page, as a string
    def initialize(url)
      @argument_url = url
      @url = URI.parse(url)
      @images = []
      @images_hash = {}
    end

    # Loads the page as plain text, with line breaks and tabs turned into
    # spaces. The page is opened through URI#open (open-uri) rather than
    # Kernel#open, which no longer accepts URLs on current Ruby versions.
    def get_content_raw
      @content = URI.parse(self.argument_url).open { |io| io.read }
      @content.gsub!(/[\n\r\t]+/,' ')
    end

    # Collects every quoted string in the raw page text that ends with a
    # known image extension. h is accepted for symmetry with #get_images
    # and is not used.
    def get_images_raw(path,h={})
      quoted_image = /['"]+[^'"]+\.(?:#{Images::IMAGE_EXTENSIONS.join('|')})['"]+/
      self.content.scan(quoted_image).each do |src|
        src.gsub!(/['"]/,'')
        self.push_to_images(path,src)
      end
    end

    # Loads the page as a Nokogiri HTML document.
    def get_content
      @content = Nokogiri::HTML(URI.parse(self.argument_url).open)
    end

    # Calls each collector method named by the keys of h.
    def get_images(path,h={})
      h.each_key{|key| self.send(key, path)}
    end

    # <img src="...">
    def collect_from_img_src(path)
      self.content.xpath('//img').each do |img|
        src = img[:src]
        URL.remove_new_line_symbols!(src)
        self.push_to_images(path,src)
      end
    end

    # <a href="..."> whose target has an image extension
    def collect_from_a_href(path)
      image_link = /\.(?:#{Images::IMAGE_EXTENSIONS.join('|')})/i
      self.content.xpath('//a').each do |a|
        href = a[:href]
        URL.remove_new_line_symbols!(href)
        next if !href || href !~ image_link
        self.push_to_images(path,href,{:file_name_prefix => A_HREF_IMAGE_PREFIX})
      end
    end

    # style="background: url(...)" / style="background-image: url(...)"
    def collect_from_style_url(path)
      self.content.xpath("//*[@style]").each do |element|
        style = element[:style]
        next if style !~ /(?:background|background-image):\s*url\(['"]?(.*?)['"]?\)/i
        src = $1
        next if !src
        URL.remove_new_line_symbols!(src)
        self.push_to_images(path,src,{:file_name_prefix => STYLE_URL_IMAGE_PREFIX})
      end
    end

    # <link rel="shortcut icon" href="...">
    def collect_from_link_icon(path)
      self.content.xpath('//link[@rel="shortcut icon"]').each do |link|
        src = link[:href]
        URL.remove_new_line_symbols!(src)
        self.push_to_images(path,src,{:file_name_prefix => LINK_ICON_IMAGE_PREFIX})
      end
    end

    # Registers src as an image unless it was seen before.
    # A missing or empty src (for example an <img> tag without a src
    # attribute) is skipped instead of raising.
    def push_to_images(path,src,h={})
      return if src.nil? || src.empty?
      if !self.images_hash.has_key?(src)
        self.images_hash[src] = 1
        self.images.push Images.new(self.url.host,escape_src(src), {
          :catalog_path => path,
          :file_name_prefix => (h[:file_name_prefix] || '')})
      end
    end

    # Drops collected images by file name:
    #   :extension       - names without any extension
    #   :image_extension - names without a known image extension
    def ignore_file_without(h={})
      return if !h
      self.images.delete_if {|image| image.file_name !~ /\.[a-z]{0,5}$/i } if h[:extension]
      if h[:image_extension]
        image_name = /\.(?:#{Images::IMAGE_EXTENSIONS.join('|')})$/i
        self.images.delete_if {|image| image.file_name !~ image_name }
      end
    end

    # Hash of {collector_method_name => true} for every collect_from_*
    # instance method of Parser.
    def self.all_collect_from_methods
      collectors = Parser.instance_methods.select{|m| m.to_s =~ /\A#{COLLECT_METHODS_PREFIX}/}
      collectors.inject({}) do |places, m|
        places[m.to_sym] = true
        places
      end
    end

    class << self
      alias all_image_places all_collect_from_methods
    end

    private

    # Percent-encodes characters that are not allowed in a URI.
    # URI.escape no longer exists on current Ruby versions, so the parser
    # object is used when the uri library provides one.
    def escape_src(src)
      if URI.const_defined?(:DEFAULT_PARSER)
        URI::DEFAULT_PARSER.escape(src)
      else
        URI.escape(src)
      end
    end

  end
end
@@ -0,0 +1,18 @@
1
module ImageDownloader
  # Small string helpers for URLs.
  class URL

    # True when url starts with an http:// or https:// scheme (any case).
    # The scheme separator is part of the check, so a bare host name that
    # merely begins with "http" (e.g. httpbin.org) is not mistaken for a
    # full URL.
    def self.contain_http?(url)
      url =~ /\Ahttps?:\/\//i ? true : false
    end

    # Returns url unchanged when it already has an http(s) scheme, otherwise
    # prefixes it with http://.
    def self.normalize(url)
      contain_http?(url) ? url : 'http://' + url
    end

    # Removes carriage returns and line feeds from str in place.
    # Does nothing when str is nil.
    def self.remove_new_line_symbols!(str)
      str.delete!("\r\n") if str
    end

  end
end
metadata ADDED
@@ -0,0 +1,155 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: image_downloader
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Malykh Oleg
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-07-15 00:00:00 +04:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ type: :development
23
+ prerelease: false
24
+ name: shoulda
25
+ version_requirements: &id001 !ruby/object:Gem::Requirement
26
+ none: false
27
+ requirements:
28
+ - - ">="
29
+ - !ruby/object:Gem::Version
30
+ hash: 3
31
+ segments:
32
+ - 0
33
+ version: "0"
34
+ requirement: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ type: :development
37
+ prerelease: false
38
+ name: bundler
39
+ version_requirements: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ hash: 23
45
+ segments:
46
+ - 1
47
+ - 0
48
+ - 0
49
+ version: 1.0.0
50
+ requirement: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ type: :development
53
+ prerelease: false
54
+ name: jeweler
55
+ version_requirements: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ hash: 7
61
+ segments:
62
+ - 1
63
+ - 6
64
+ - 4
65
+ version: 1.6.4
66
+ requirement: *id003
67
+ - !ruby/object:Gem::Dependency
68
+ type: :development
69
+ prerelease: false
70
+ name: rcov
71
+ version_requirements: &id004 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ hash: 3
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ requirement: *id004
81
+ - !ruby/object:Gem::Dependency
82
+ type: :development
83
+ prerelease: false
84
+ name: nokogiri
85
+ version_requirements: &id005 !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ hash: 15
91
+ segments:
92
+ - 1
93
+ - 4
94
+ - 4
95
+ version: 1.4.4
96
+ requirement: *id005
97
+ description: Detailed description for image-downloader
98
+ email: malykholeg@gmail.com
99
+ executables:
100
+ - download_any_images
101
+ - download_images
102
+ - download_icon
103
+ extensions: []
104
+
105
+ extra_rdoc_files:
106
+ - LICENSE.txt
107
+ - README.rdoc
108
+ files:
109
+ - lib/image_downloader.rb
110
+ - lib/image_downloader/arguments.rb
111
+ - lib/image_downloader/download.rb
112
+ - lib/image_downloader/images.rb
113
+ - lib/image_downloader/parser.rb
114
+ - lib/image_downloader/url.rb
115
+ - LICENSE.txt
116
+ - README.rdoc
117
+ - bin/download_any_images
118
+ - bin/download_images
119
+ - bin/download_icon
120
+ has_rdoc: true
121
+ homepage: http://github.com/Fotom/image_downloader
122
+ licenses:
123
+ - MIT
124
+ post_install_message:
125
+ rdoc_options: []
126
+
127
+ require_paths:
128
+ - lib
129
+ required_ruby_version: !ruby/object:Gem::Requirement
130
+ none: false
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ hash: 3
135
+ segments:
136
+ - 0
137
+ version: "0"
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ none: false
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ hash: 3
144
+ segments:
145
+ - 0
146
+ version: "0"
147
+ requirements: []
148
+
149
+ rubyforge_project:
150
+ rubygems_version: 1.3.7
151
+ signing_key:
152
+ specification_version: 3
153
+ summary: Parsing web page, finding images in specified locations and downloading them simultaneously or sequentially
154
+ test_files: []
155
+