img_dl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ *.swp
4
+ .bundle
5
+ .config
6
+ .yardoc
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
# Development bundle for img_dl.
source 'https://rubygems.org'

gem 'eventmachine'
# Provides `require 'em-http'`, which lib/img_dl.rb loads.
gem 'em-http-request'
gem 'mechanize'
# The ActiveSupport gem is published as "activesupport" (no underscore);
# `require 'active_support/core_ext'` is served by that gem.
gem 'activesupport'
# Specify your gem's dependencies in img_dl.gemspec
gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 jjy
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,41 @@
1
+ # ImgDl
2
+
3
+ A tool for downloading images.
4
+ Supports recursive search, regex filters, asynchronous downloads, etc.
5
+
6
+ ## Installation
7
+
8
+ $ gem install img_dl
9
+
10
+ ## Usage
11
+
12
+ Usage: img_dl [OPTION]... URL SAVEPATH
13
+
14
+ Search for images at URL and save them to SAVEPATH. You can specify a regular expression as a filter, set a limit on the number of downloads, etc.
15
+
16
+ Example: img_dl http://google.com /home/me/download/icons
17
+
18
+ OPTION:
19
+ -r Enable recursive search. When this option is given you should also pass -ul or -il to limit the search; otherwise the program may never stop
20
+ -ul Limit the number of URLs followed recursively (only applies together with -r)
21
+ -il Limit the number of images downloaded
22
+ -ur Regex filter for recursively followed URLs. Example: -ur www.foo.bai\/\?page=\d+
23
+ -ir Regex filter for images. Example: -ir .gif$
24
+ -pf Prefix for saved file names
25
+ -in Interval in seconds between page requests, default value is 0
26
+ -h Print this help
27
+ -version Print version
28
+
29
+ Examples :
30
+
31
+ Download 30 pictures from http://sample.tv/image and save to ./images
32
+ -r means recursive search and -ur means only URLs matching the regex are followed
33
+ `$ img_dl -r -il 30 -ur sample.tv/image/page=\d+ http://sample.tv/image ./images`
34
+
35
+ ## Contributing
36
+
37
+ 1. Fork it
38
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
39
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
40
+ 4. Push to the branch (`git push origin my-new-feature`)
41
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+ require 'img_dl'
3
+ require 'img_dl/cli'
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'img_dl/version'
5
+
6
# Gem specification for img_dl.
Gem::Specification.new do |gem|
  gem.name          = "img_dl"
  gem.version       = ImgDl::VERSION
  gem.authors       = ["jjy"]
  gem.email         = ["jjyruby@gmail.com"]
  gem.description   = %q{Download images from web,support regexp and recursive. more args type "img_dl -h"}
  gem.summary       = %q{Download images from web,support regexp and recursive,use eventmachine}
  gem.homepage      = "http://github.com/jjyr/img_dl"

  # Package exactly what git tracks.
  gem.files         = `git ls-files`.split($/)
  # A list of executable names (the original passed a bare string).
  gem.executables   = %w[img_dl]
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies: the library requires these at load time
  # (mechanize, eventmachine, em-http, active_support/core_ext), so they must
  # be installed together with the gem.
  gem.add_dependency "activesupport"
  gem.add_dependency "em-http-request"
  gem.add_dependency "eventmachine"
  gem.add_dependency "mechanize"
end
@@ -0,0 +1,10 @@
1
+ require "img_dl/version"
2
+
3
+ require 'mechanize'
4
+ require 'uri'
5
+ require 'eventmachine'
6
+ require 'em-http'
7
+ require_relative 'img_dl/parser'
8
+
9
# Top-level namespace of the img_dl gem. It is intentionally empty in this
# file, which only loads the dependencies, the version and the parser.
+ module ImgDl
10
+ end
@@ -0,0 +1,121 @@
1
+ require 'img_dl'
2
+ require 'fileutils'
3
+ require 'uri'
4
+
5
module ImgDl
  # Command-line front end: turns the argument list into parser options,
  # starts the parser in a background thread and renders a status screen.
  module Cli
    # Picks the shell command used to clear the terminal before each status
    # redraw.
    #
    # The original inline check was /win/i, which also matches "darwin"
    # (macOS) and therefore selected "cls" there; darwin is ruled out first.
    #
    # @param platform [String] platform identifier, RUBY_PLATFORM by default
    # @return [String] "cls" for Windows-like platforms, "clear" otherwise
    def self.clear_command(platform = RUBY_PLATFORM)
      return "clear" if platform =~ /darwin/i
      platform =~ /win|ming/i ? "cls" : "clear"
    end

    CLEAR = clear_command

    # Single-quoted heredoc: the help text must be printed verbatim. With a
    # double-quoted heredoc the backslashes of the -ur example were consumed
    # and the help showed "page=d+" instead of "page=\d+".
    HELP = <<-'HELP'
Usage: img_dl [OPTION]... URL SAVEPATH

Search Images from URL and save to SAVEPATH.You can specified regular expression as a filter or set a limit num,etc.

Example: img_dl http://google.com /home/me/download/icons

OPTION:
-r Enable recursive search, when this option parsed you should option -ul or -il to limit search,otherwise the program maybe can't stop
-ul Limit recursive urls count, Only if you specified -r
-il Limit download images count
-ur Regex filter for recursive url, Example -ur www.foo.bai\/\?page=\d+
-ir Regex filter for images,Example -ir .gif$
-pf Save the file prefix
-in Interval, default value is 0
-h print this help
-version Print version
    HELP
    NOT_MATCH = "Arguments not match!"

    class << self
      # Prints the "arguments do not match" message followed by the help text
      # and terminates the process.
      def not_match
        puts NOT_MATCH
        puts HELP
        exit
      end

      # Creates +path+ (and any missing parents).
      def valid_save_path(path)
        FileUtils.mkdir_p path
      end

      # Converts +url+ with Kernel#URI.
      def valid_url(url)
        URI url
      end

      # Interprets the command line and, for a download request, starts the
      # parser in a background thread.
      #
      # With zero or one argument only "-h" and "-version" are accepted;
      # anything else prints the help and exits. Otherwise the last two
      # arguments are URL and SAVEPATH and everything before them is "-r"
      # and/or "<option> <value>" pairs.
      #
      # NOTE: +args+ is modified in place (URL, SAVEPATH and "-r" are removed).
      #
      # @param args [Array<String>] command-line arguments
      # @return [ImgDl::Parser, nil] the started parser, or nil when only
      #   help/version was printed
      def parse_to_options(args)
        case args.size
        when 0, 1
          case args.first
          when '-h'
            puts HELP
          when '-version'
            puts VERSION
          else
            not_match
          end
          nil
        else
          save_path = args.pop
          url = args.pop
          options = {}
          options[:recursive] = args.delete '-r'
          # What is left must be complete "<option> <value>" pairs.
          not_match if args.size.odd?
          args.each_slice(2) do |opt, arg|
            case opt
            when '-ul'
              options[:url_limit_count] = arg.to_i
            when '-il'
              options[:image_limit_count] = arg.to_i
            when '-ur'
              options[:url_reg] = Regexp.new arg
            when '-ir'
              options[:image_reg] = Regexp.new arg
            when '-pf'
              options[:prefix] = arg
            when '-in'
              options[:interval] = arg.to_i
            else
              puts "option '#{opt}' not support! please check out img_dl -h"
              exit
            end
          end
          parser = ImgDl::Parser.new(url, save_path, options)
          # Run the crawl in the background; the thread ends the process
          # once the parser's #start returns.
          Thread.start { parser.start; puts 'All done.'; exit }
          parser
        end
      end

      # Clears the terminal and prints the parser's current counters.
      #
      # @param parser [ImgDl::Parser]
      def prompt(parser)
        system CLEAR
        puts "url parser status: #{parser.status}"
        puts "downloader status: #{parser.dl_status}"
        puts "recursive urls: #{parser.url_count}"
        puts "images download queue: #{parser.image_count}"
        puts "downloaded images: #{parser.downloaded_image_count}"
        puts "successes: #{parser.success_download}"
        puts "errors: #{parser.error_urls.size}"
      end

      # Entry point: parses ARGV.
      #
      # @return [ImgDl::Parser, nil] see #parse_to_options
      def run
        parse_to_options ARGV
      end
    end
  end
end
106
+
107
# Script body: start the CLI and, when a parser was created, keep redrawing
# its status once per second until the process exits.
active_parser = ImgDl::Cli.run
if active_parser
  # On exit, dump every recorded error (each followed by a blank line) and
  # draw the status screen one last time.
  at_exit do
    failures = active_parser.error_urls
    loop do
      break if failures.empty?
      puts failures.shift
      puts
    end
    ImgDl::Cli.prompt active_parser
  end

  $stdout.sync = true

  loop do
    ImgDl::Cli.prompt active_parser
    sleep 1
  end
end
@@ -0,0 +1,17 @@
1
+ require 'active_support/core_ext'
2
+
3
module ImgDl
  # Mixin that exposes the entries of an options hash as methods on the
  # including object.
  module Helper
    # Defines, for every key of +options+, a reader named after the key and a
    # predicate named "<key>?" (true when the value is present?).
    #
    # The methods are defined on the receiver itself (singleton methods). The
    # original defined them on the receiver's class, so each new object
    # redefined the methods for all instances and every instance ended up
    # reading the options of the most recently created one.
    #
    # The methods look the value up in +options+ on each call, so later
    # changes to the hash are visible.
    #
    # @param options [Hash] option name => value
    # @return [Hash] +options+
    def define_options_helper(options)
      options.each_key do |key|
        define_singleton_method(key) { options[key] }
        # present? is supplied by active_support/core_ext.
        define_singleton_method("#{key}?") { options[key].present? }
      end
    end
  end
end
@@ -0,0 +1,201 @@
1
+ require 'thread'
2
+ require 'fileutils'
3
+ require 'active_support/core_ext'
4
+ require 'securerandom'
5
+ require_relative 'helper'
6
+
7
+ module ImgDl
8
+ class Parser
9
+ include Helper
10
+
11
+ Default_Options = {url_limit_count: nil,url_reg: nil,image_limit_count: nil,image_reg: nil,recursive: false,prefix: nil,interval: 0}
12
+
13
+ attr_reader :agent,:origin_url,:options,:image_count,:url_count,:running,:error_urls,:downloaded_image_count,:success_download,:status,:dl_status
14
+ alias running? running
15
+
16
+ def initialize url,save_path,options = {}
17
+ @agent = Mechanize.new
18
+ @agent.user_agent_alias = 'Linux Mozilla'
19
+ @origin_url = URI url
20
+ @current_url = URI url
21
+ @_urls = Hash.new 0
22
+ @_imgs = Hash.new 0
23
+ @save_path = save_path
24
+ FileUtils.mkdir_p save_path
25
+ @image_count = 0
26
+ @url_count = 0
27
+ @urls = Queue.new
28
+ @error_urls = Queue.new
29
+ enq_urls url
30
+ @images = Queue.new
31
+ @options = Default_Options.merge options
32
+ define_options_helper @options
33
+ @downloaded_image_count = 0
34
+ @running = true
35
+ @downloading = true
36
+ @success_download = 0
37
+ @status = "start"
38
+ @dl_status = "ready"
39
+ end
40
+
41
+ def start
42
+ Thread.start{parse}
43
+ download
44
+ end
45
+
46
+ def parse
47
+ loop do
48
+ break unless next_parse?
49
+ sleep interval
50
+ @status = "get url"
51
+ url = @urls.shift
52
+ url = URI.escape url if url.respond_to? :gsub
53
+ @current_url = URI url
54
+ begin
55
+ page = @agent.get url
56
+ rescue StandardError => e
57
+ @error_urls << [url,e]
58
+ puts e
59
+ next
60
+ end
61
+ parse_images page
62
+ if continue?
63
+ parse_links page
64
+ end
65
+ end
66
+ @running = false
67
+ @status = "parser complete"
68
+ end
69
+
70
+ def default_head
71
+ @_default_head ||= {"USER-AGENT"=>"Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17", "ACCEPT-ENCODING"=>"gzip,deflate,sdch","ACCEPT" => '*/*', "ACCEPT-CHARSET"=>"UTF-8,*;q=0.5", "ACCEPT-LANGUAGE"=>"zh-CN,zh;q=0.8","connection" => "close"}
72
+ end
73
+
74
+ def download
75
+ @dl_status = "start"
76
+ @_download_image = 0
77
+ EM.run do
78
+ loop do
79
+ if !running? && (@images.empty? || (image_limit_count && @_download_image >= image_limit_count))
80
+ @dl_status = "all done"
81
+ break
82
+ end
83
+ if @images.empty?
84
+ if running?
85
+ @dl_status = "wait parser"
86
+ sleep 3
87
+ redo
88
+ else
89
+ next
90
+ end
91
+ end
92
+ @_download_image += 1
93
+ @dl_status = "shift image url"
94
+ image_uri = @images.shift
95
+ @dl_status = "download image #{image_uri}"
96
+ http = EventMachine::HttpRequest.new(image_uri).get head: default_head
97
+ http.callback { |res|
98
+ res.response_header["CONTENT_TYPE"] =~ /^image\/(\w+)/
99
+ type = $1
100
+ if type
101
+ @success_download += 1
102
+ save_image type,res.response
103
+ else
104
+ @error_urls << [image_uri,"image download error"]
105
+ end
106
+ @downloaded_image_count += 1
107
+ @dl_status = "success: download image #{image_uri}"
108
+ download_complete? and EM.stop
109
+ }
110
+ http.errback { |res|
111
+ @error_urls << [image_uri,"image download error"]
112
+ @downloaded_image_count += 1
113
+ @dl_status = "failed: download image #{image_uri}"
114
+ download_complete? and EM.stop
115
+ }
116
+ end
117
+ end
118
+ @dl_status = "download complete"
119
+ @downloading = false
120
+ end
121
+
122
+ protected
123
+ def download_complete?
124
+ !running? && (@images.empty? || (@downloaded_image_count >= @_download_image))
125
+ end
126
+
127
+ def random_file_name
128
+ SecureRandom.uuid
129
+ end
130
+
131
+ def save_image name = random_file_name,type,content
132
+ file_name = File.join @save_path,"#{prefix}#{name}.#{type}"
133
+ File.open(file_name,"w+") do |io|
134
+ io.binmode
135
+ io.write content
136
+ end
137
+ end
138
+
139
+ def valid_url? url
140
+ URI url
141
+ rescue StandardError => e
142
+ @error_urls << [url,e]
143
+ false
144
+ end
145
+
146
+ def enq_urls link
147
+ if !link_dup?(link) && valid_url?(link)
148
+ @_urls[link] += 1
149
+ @urls << link
150
+ @url_count += 1
151
+ end
152
+ end
153
+
154
+ def enq_images src
155
+ if !image_dup?(src) && valid_url?(src)
156
+ @_imgs[src] += 1
157
+ @images << src
158
+ @image_count += 1
159
+ end
160
+ end
161
+
162
+ def link_dup? link
163
+ @_urls.has_key? link
164
+ end
165
+
166
+ def image_dup? src
167
+ @_imgs.has_key? src
168
+ end
169
+
170
+ def valid_link? link
171
+ if url_reg?
172
+ link.to_s =~ url_reg && !link_dup?(link)
173
+ else
174
+ !link_dup?(link)
175
+ end
176
+ end
177
+
178
+ def parse_links page
179
+ @status = "parse urls"
180
+ links = page.links.map{|link| link.href.present? and URI.join @current_url,URI.escape(link.href) rescue nil}
181
+ links.select!{|link| link.present? and valid_link?(link)}
182
+ links.each{|link| enq_urls link}
183
+ end
184
+
185
+ def parse_images page
186
+ @status = "parse images"
187
+ images = page.images.map{|img| img.src.present? and URI.join @current_url,URI.escape(img.src)}
188
+ images.select!{|img| img.to_s =~ img_reg} if image_reg?
189
+ images.each{|img| enq_images img}
190
+ end
191
+
192
+ def continue?
193
+ recursive? && (image_limit_count? ? @image_count < image_limit_count : true) && (url_limit_count? ? @url_count < url_limit_count : true)
194
+ end
195
+
196
+ def next_parse?
197
+ (image_limit_count? ? @image_count < image_limit_count : true) && (url_limit_count? ? @url_count < url_limit_count : true) && !@urls.empty?
198
+ end
199
+
200
+ end
201
+ end
@@ -0,0 +1,3 @@
1
# Version of the img_dl gem; the gemspec uses it as the gem version and the
# CLI prints it for -version.
+ module ImgDl
2
+ VERSION = "0.0.1"
3
+ end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: img_dl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - jjy
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Download images from web,support regexp and recursive. more args type
15
+ "img_dl -h"
16
+ email:
17
+ - jjyruby@gmail.com
18
+ executables:
19
+ - img_dl
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - .gitignore
24
+ - Gemfile
25
+ - LICENSE.txt
26
+ - README.md
27
+ - Rakefile
28
+ - bin/img_dl
29
+ - img_dl.gemspec
30
+ - lib/img_dl.rb
31
+ - lib/img_dl/cli.rb
32
+ - lib/img_dl/helper.rb
33
+ - lib/img_dl/parser.rb
34
+ - lib/img_dl/version.rb
35
+ homepage: http://github.com/jjyr/img_dl
36
+ licenses: []
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ required_rubygems_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ! '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubyforge_project:
55
+ rubygems_version: 1.8.24
56
+ signing_key:
57
+ specification_version: 3
58
+ summary: Download images from web,support regexp and recursive,use eventmachine
59
+ test_files: []