img_dl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +41 -0
- data/Rakefile +1 -0
- data/bin/img_dl +3 -0
- data/img_dl.gemspec +19 -0
- data/lib/img_dl.rb +10 -0
- data/lib/img_dl/cli.rb +121 -0
- data/lib/img_dl/helper.rb +17 -0
- data/lib/img_dl/parser.rb +201 -0
- data/lib/img_dl/version.rb +3 -0
- metadata +59 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 jjy
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# ImgDl
|
2
|
+
|
3
|
+
A tool for downloading images from web pages.
|
4
|
+
Supports recursive search, regex filters, asynchronous downloads, etc.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
$ gem install img_dl
|
9
|
+
|
10
|
+
## Usage
|
11
|
+
|
12
|
+
Usage: img_dl [OPTION]... URL SAVEPATH
|
13
|
+
|
14
|
+
Search for images at URL and save them to SAVEPATH. You can specify a regular expression as a filter, set a download limit, etc.
|
15
|
+
|
16
|
+
Example: img_dl http://google.com /home/me/download/icons
|
17
|
+
|
18
|
+
OPTION:
|
19
|
+
-r Enable recursive search. When this option is given you should also pass -ul or -il to limit the search, otherwise the program may never stop
|
20
|
+
-ul Limit recursive urls count, Only if you specified -r
|
21
|
+
-il Limit download images count
|
22
|
+
-ur Regex filter for recursive url, Example -ur www.foo.bai\/\?page=\d+
|
23
|
+
-ir Regex filter for images,Example -ir .gif$
|
24
|
+
-pf Save the file prefix
|
25
|
+
-in Interval, default value is 0
|
26
|
+
-h print this help
|
27
|
+
-version Print version
|
28
|
+
|
29
|
+
Examples :
|
30
|
+
|
31
|
+
Download 30 pictures from http://sample.tv/image and save to ./images
|
32
|
+
-r means recursive search and -ur means only URLs matching the pattern are followed
|
33
|
+
`$ img_dl -r -il 30 -ur sample.tv/image/page=\d+ http://sample.tv/image ./images`
|
34
|
+
|
35
|
+
## Contributing
|
36
|
+
|
37
|
+
1. Fork it
|
38
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
39
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
40
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
41
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/img_dl
ADDED
data/img_dl.gemspec
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'img_dl/version'
|
5
|
+
|
6
|
+
# Packaging metadata for the img_dl gem.
Gem::Specification.new do |gem|
  gem.name          = "img_dl"
  gem.version       = ImgDl::VERSION
  gem.authors       = ["jjy"]
  gem.email         = ["jjyruby@gmail.com"]
  gem.description   = %q{Download images from web,support regexp and recursive. more args type "img_dl -h"}
  gem.summary       = %q{Download images from web,support regexp and recursive,use eventmachine}
  gem.homepage      = "http://github.com/jjyr/img_dl"
  # The package ships LICENSE.txt with the MIT license text.
  gem.license       = "MIT"

  # The file list comes from git, so the gem must be built inside a checkout.
  gem.files         = `git ls-files`.split($/)
  # RubyGems expects a list of executable names here, not a bare string.
  gem.executables   = ["img_dl"]
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Libraries referenced by lib/img_dl/parser.rb and lib/img_dl/helper.rb
  # (Mechanize, EM / EventMachine::HttpRequest, active_support/core_ext).
  # NOTE(review): no version constraints are known from the sources; add
  # them once the supported versions have been confirmed.
  gem.add_runtime_dependency "mechanize"
  gem.add_runtime_dependency "eventmachine"
  gem.add_runtime_dependency "em-http-request"
  gem.add_runtime_dependency "activesupport"
end
|
data/lib/img_dl.rb
ADDED
data/lib/img_dl/cli.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'img_dl'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
module ImgDl
  # Command-line front end: parses ARGV-style arguments, builds an
  # ImgDl::Parser and renders a textual status screen for it.
  module Cli
    # Returns the shell command that clears the terminal on +platform+.
    #
    # Only native Windows builds (mswin / mingw) get "cls". A bare /win/
    # test would also match "darwin" and pick the wrong command on macOS.
    def self.clear_command(platform = RUBY_PLATFORM)
      platform =~ /mswin|mingw/i ? "cls" : "clear"
    end

    CLEAR = clear_command

    # Quoted heredoc: the backslashes in the -ur example must be printed
    # literally instead of being consumed as string escapes.
    HELP = <<-'HELP'
Usage: img_dl [OPTION]... URL SAVEPATH

Search Images from URL and save to SAVEPATH.You can specified regular expression as a filter or set a limit num,etc.

Example: img_dl http://google.com /home/me/download/icons

OPTION:
-r Enable recursive search, when this option parsed you should option -ul or -il to limit search,otherwise the program maybe can't stop
-ul Limit recursive urls count, Only if you specified -r
-il Limit download images count
-ur Regex filter for recursive url, Example -ur www.foo.bai\/\?page=\d+
-ir Regex filter for images,Example -ir .gif$
-pf Save the file prefix
-in Interval, default value is 0
-h print this help
-version Print version
    HELP

    NOT_MATCH = "Arguments not match!"

    class << self
      # Prints the usage error followed by the help text and terminates the
      # process with a non-zero status so callers can detect the failure.
      def not_match
        puts NOT_MATCH
        puts HELP
        exit 1
      end

      # Creates +path+ (including missing parent directories).
      def valid_save_path(path)
        FileUtils.mkdir_p path
      end

      # Parses +url+ into a URI object; raises if it cannot be parsed.
      def valid_url(url)
        URI url
      end

      # Converts the option switches in +args+ into an options hash.
      #
      # +args+ must contain only switches and their values (URL and
      # SAVEPATH already removed). "-r" is a bare flag and is removed from
      # +args+; every other switch consumes the following element.
      # An odd number of remaining elements or an unknown switch ends the
      # process with a message.
      #
      # Returns a Hash with :recursive plus any of :url_limit_count,
      # :image_limit_count, :url_reg, :image_reg, :prefix, :interval.
      def extract_options(args)
        options = {}
        options[:recursive] = !args.delete('-r').nil?
        not_match if args.size.odd?
        args.each_slice(2) do |opt, arg|
          case opt
          when '-ul' then options[:url_limit_count] = arg.to_i
          when '-il' then options[:image_limit_count] = arg.to_i
          when '-ur' then options[:url_reg] = Regexp.new(arg)
          when '-ir' then options[:image_reg] = Regexp.new(arg)
          when '-pf' then options[:prefix] = arg
          when '-in' then options[:interval] = arg.to_i
          else
            puts "option '#{opt}' not support! please check out img_dl -h"
            exit 1
          end
        end
        options
      end

      # Interprets +args+ (modified in place).
      #
      # With fewer than two arguments only "-h" and "-version" are accepted
      # and nil is returned. Otherwise the last two arguments are taken as
      # URL and SAVEPATH, the rest as options; a parser is created, started
      # on a background thread (which exits the process when it finishes)
      # and returned.
      def parse_to_options(args)
        if args.size < 2
          case args.first
          when '-h' then puts HELP
          when '-version' then puts VERSION
          else not_match
          end
          return nil
        end

        save_path = args.pop
        url = args.pop
        parser = ImgDl::Parser.new(url, save_path, extract_options(args))
        Thread.start do
          parser.start
          puts 'All done.'
          exit
        end
        parser
      end

      # Clears the terminal and prints the current counters of +parser+.
      def prompt(parser)
        system CLEAR
        puts "url parser status: #{parser.status}"
        puts "downloader status: #{parser.dl_status}"
        puts "recursive urls: #{parser.url_count}"
        puts "images download queue: #{parser.image_count}"
        puts "downloaded images: #{parser.downloaded_image_count}"
        puts "successes: #{parser.success_download}"
        puts "errors: #{parser.error_urls.size}"
      end

      # Entry point: runs the command line given in ARGV.
      def run
        parse_to_options ARGV
      end
    end
  end
end
|
106
|
+
|
107
|
+
# Script entry point: run the CLI and, when a parser was started, keep
# redrawing its status once per second until the worker thread exits the
# process.
parser = ImgDl::Cli.run
if parser
  at_exit do
    # Draw the final status first: prompt clears the terminal, so listing
    # the errors before it would wipe them from the screen immediately.
    ImgDl::Cli.prompt parser
    until parser.error_urls.empty?
      puts parser.error_urls.shift
      puts
    end
  end

  $stdout.sync = true
  loop do
    ImgDl::Cli.prompt parser
    sleep 1
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'active_support/core_ext'
|
2
|
+
|
3
|
+
module ImgDl
  # Mixin that exposes the entries of an options hash as methods.
  module Helper
    # For every key +k+ in +options+ defines two methods on the receiver:
    # +k+ returns options[k] and +k?+ returns options[k].present?.
    #
    # The methods are defined on the receiving object only. Defining them
    # on its class would make every instance read the options hash of
    # whichever instance called this method last.
    def define_options_helper(options)
      options.each_key do |key|
        define_singleton_method(key) { options[key] }
        define_singleton_method("#{key}?") { options[key].present? }
      end
    end
  end
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
require 'thread'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'active_support/core_ext'
|
4
|
+
require 'securerandom'
|
5
|
+
require_relative 'helper'
|
6
|
+
|
7
|
+
module ImgDl
  # Crawls pages starting at a URL, collects the image URLs found on them
  # and downloads the images into a directory.
  #
  # Pages are fetched with a Mechanize agent (#parse); images are fetched
  # with EventMachine::HttpRequest (#download). #start runs the page parser
  # on a background thread and the downloader on the calling thread.
  class Parser
    include Helper

    # Defaults merged under the options passed to #initialize.
    Default_Options = {url_limit_count: nil,url_reg: nil,image_limit_count: nil,image_reg: nil,recursive: false,prefix: nil,interval: 0}.freeze

    attr_reader :agent,:origin_url,:options,:image_count,:url_count,:running,:error_urls,:downloaded_image_count,:success_download,:status,:dl_status
    alias running? running

    # url       - start page (String).
    # save_path - directory for downloaded files; created if missing.
    # options   - Hash overriding Default_Options. Each key becomes a reader
    #             and a predicate method through define_options_helper.
    def initialize(url, save_path, options = {})
      @agent = Mechanize.new
      @agent.user_agent_alias = 'Linux Mozilla'
      @origin_url = URI url
      @current_url = URI url
      @_urls = Hash.new 0
      @_imgs = Hash.new 0
      @save_path = save_path
      FileUtils.mkdir_p save_path
      @image_count = 0
      @url_count = 0
      @urls = Queue.new
      @error_urls = Queue.new
      enq_urls url
      @images = Queue.new
      @options = Default_Options.merge options
      define_options_helper @options
      @downloaded_image_count = 0
      @running = true
      @downloading = true
      @success_download = 0
      @status = "start"
      @dl_status = "ready"
    end

    # Starts the page parser on a new thread and runs the downloader on the
    # current thread; returns when #download returns.
    def start
      Thread.start { parse }
      download
    end

    # Fetches queued page URLs one at a time, queueing the images found on
    # each page and, while #continue? holds, the links as well. Failed
    # fetches are recorded in error_urls.
    def parse
      loop do
        break unless next_parse?
        sleep interval
        @status = "get url"
        url = @urls.shift
        url = escape_url(url) if url.respond_to? :gsub
        begin
          @current_url = URI url
          page = @agent.get url
        rescue StandardError => e
          @error_urls << [url,e]
          puts e
          next
        end
        # Skip responses that offer no image list to scan.
        # NOTE(review): presumably these are Mechanize's non-HTML result
        # objects — confirm against the Mechanize documentation.
        next unless page.respond_to?(:images)
        parse_images page
        parse_links page if continue?
      end
      @status = "parser complete"
    ensure
      # Always clear the flag: the downloader keeps waiting while running?
      # is true, so an exception in this thread must not leave it set.
      @running = false
    end

    # Request headers sent with every image download (memoized).
    def default_head
      @_default_head ||= {"USER-AGENT"=>"Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17", "ACCEPT-ENCODING"=>"gzip,deflate,sdch","ACCEPT" => '*/*', "ACCEPT-CHARSET"=>"UTF-8,*;q=0.5", "ACCEPT-LANGUAGE"=>"zh-CN,zh;q=0.8","connection" => "close"}
    end

    # Issues one HTTP request per queued image URL and saves every response
    # whose Content-Type is image/*. Returns after the reactor has stopped.
    #
    # NOTE(review): the dispatch loop runs inside the EM.run block and
    # sleeps while waiting for the parser, so the requests presumably only
    # make progress once the loop has ended — confirm against EventMachine
    # and consider a timer-driven dispatcher.
    def download
      @dl_status = "start"
      @_download_image = 0
      EM.run do
        loop do
          if !running? && (@images.empty? || (image_limit_count && @_download_image >= image_limit_count))
            @dl_status = "all done"
            break
          end
          if @images.empty?
            if running?
              @dl_status = "wait parser"
              sleep 3
              redo
            else
              next
            end
          end
          @_download_image += 1
          @dl_status = "shift image url"
          image_uri = @images.shift
          @dl_status = "download image #{image_uri}"
          http = EventMachine::HttpRequest.new(image_uri).get head: default_head
          http.callback do |res|
            # Take the capture from this header value only; a global match
            # variable could still hold the result of an earlier match when
            # the header is missing.
            type = res.response_header["CONTENT_TYPE"].to_s[/\Aimage\/(\w+)/, 1]
            if type
              begin
                save_image type, res.response
                @success_download += 1
              rescue StandardError => e
                # A failed write is recorded instead of escaping from the
                # callback.
                @error_urls << [image_uri,e]
              end
            else
              @error_urls << [image_uri,"image download error"]
            end
            finish_request "success: download image #{image_uri}"
          end
          http.errback do |_res|
            @error_urls << [image_uri,"image download error"]
            finish_request "failed: download image #{image_uri}"
          end
        end
        # With nothing dispatched no callback will ever run, so stop the
        # reactor here; otherwise EM.run would never return.
        EM.stop if @_download_image.zero?
      end
      @dl_status = "download complete"
      @downloading = false
    end

    protected

    # Bookkeeping shared by the success and failure callbacks: counts the
    # finished request, publishes +message+ as downloader status and stops
    # the reactor once every dispatched request has finished.
    def finish_request(message)
      @downloaded_image_count += 1
      @dl_status = message
      EM.stop if download_complete?
    end

    # True when the parser has stopped and every dispatched request has
    # reported back. An empty queue alone is not sufficient: requests may
    # still be in flight after the last URL was taken from it.
    def download_complete?
      !running? && @downloaded_image_count >= @_download_image
    end

    # Base name used for saved files when none is given.
    def random_file_name
      SecureRandom.uuid
    end

    # Writes +content+ to "<save_path>/<prefix><name>.<type>" in binary mode.
    def save_image(name = random_file_name, type, content)
      file_name = File.join @save_path,"#{prefix}#{name}.#{type}"
      File.open(file_name, "wb") { |io| io.write content }
    end

    # Percent-encodes characters that are not allowed in a URI.
    # URI.escape is obsolete and absent from current Ruby versions; the
    # default parser object provides the same operation.
    def escape_url(text)
      URI::DEFAULT_PARSER.escape(text)
    end

    # Resolves +ref+ against the page currently being parsed. Returns a URI,
    # or nil when +ref+ is blank or cannot be turned into a URI.
    def absolute_uri(ref)
      return nil unless ref.present?
      URI.join @current_url, escape_url(ref)
    rescue StandardError
      nil
    end

    # Returns the parsed URI (truthy) when +url+ is parseable; otherwise
    # records the failure in error_urls and returns false.
    def valid_url?(url)
      URI url
    rescue StandardError => e
      @error_urls << [url,e]
      false
    end

    # Queues +link+ for parsing unless it was queued before or is invalid.
    def enq_urls(link)
      return if link_dup?(link) || !valid_url?(link)
      @_urls[link.to_s] += 1
      @urls << link
      @url_count += 1
    end

    # Queues +src+ for download unless it was queued before or is invalid.
    def enq_images(src)
      return if image_dup?(src) || !valid_url?(src)
      @_imgs[src.to_s] += 1
      @images << src
      @image_count += 1
    end

    # Seen-tables are keyed by the string form so that the initial String
    # URL and URI objects built later compare equal.
    def link_dup?(link)
      @_urls.key? link.to_s
    end

    def image_dup?(src)
      @_imgs.key? src.to_s
    end

    # A link is followed when it is new and, if a URL filter was given,
    # matches that filter.
    def valid_link?(link)
      return false if link_dup?(link)
      url_reg? ? !!(link.to_s =~ url_reg) : true
    end

    # Queues every acceptable link of +page+; links that cannot be resolved
    # to a URI are skipped.
    def parse_links(page)
      @status = "parse urls"
      page.links.each do |link|
        target = absolute_uri(link.href)
        enq_urls target if target && valid_link?(target)
      end
    end

    # Queues every image of +page+ (restricted by the image filter when one
    # was given); sources that cannot be resolved to a URI are skipped.
    def parse_images(page)
      @status = "parse images"
      images = page.images.map { |img| absolute_uri(img.src) }.compact
      images.select! { |img| img.to_s =~ image_reg } if image_reg?
      images.each { |img| enq_images img }
    end

    # True while neither the image limit nor the URL limit has been reached;
    # a limit that is not set never stops the crawl.
    def within_limits?
      (!image_limit_count? || @image_count < image_limit_count) &&
        (!url_limit_count? || @url_count < url_limit_count)
    end

    # Whether links of the current page should be followed.
    def continue?
      recursive? && within_limits?
    end

    # Whether another queued page should be fetched.
    def next_parse?
      within_limits? && !@urls.empty?
    end

  end
end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: img_dl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- jjy
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-01-22 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Download images from web,support regexp and recursive. more args type
|
15
|
+
"img_dl -h"
|
16
|
+
email:
|
17
|
+
- jjyruby@gmail.com
|
18
|
+
executables:
|
19
|
+
- img_dl
|
20
|
+
extensions: []
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- .gitignore
|
24
|
+
- Gemfile
|
25
|
+
- LICENSE.txt
|
26
|
+
- README.md
|
27
|
+
- Rakefile
|
28
|
+
- bin/img_dl
|
29
|
+
- img_dl.gemspec
|
30
|
+
- lib/img_dl.rb
|
31
|
+
- lib/img_dl/cli.rb
|
32
|
+
- lib/img_dl/helper.rb
|
33
|
+
- lib/img_dl/parser.rb
|
34
|
+
- lib/img_dl/version.rb
|
35
|
+
homepage: http://github.com/jjyr/img_dl
|
36
|
+
licenses: []
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options: []
|
39
|
+
require_paths:
|
40
|
+
- lib
|
41
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ! '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubyforge_project:
|
55
|
+
rubygems_version: 1.8.24
|
56
|
+
signing_key:
|
57
|
+
specification_version: 3
|
58
|
+
summary: Download images from web,support regexp and recursive,use eventmachine
|
59
|
+
test_files: []
|