img_dl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ *.swp
4
+ .bundle
5
+ .config
6
+ .yardoc
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
source 'https://rubygems.org'

# Runtime libraries required by lib/img_dl.rb and lib/img_dl/parser.rb.
gem 'eventmachine'
gem 'em-http-request' # provides `require 'em-http'`
gem 'mechanize'
gem 'activesupport'   # provides `require 'active_support/core_ext'`

# Specify your gem's dependencies in img_dl.gemspec
gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 TODO: Write your name
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,41 @@
1
+ # ImgDl
2
+
3
+ A tool to download images.
4
+ Supports recursive search, regex filters, asynchronous downloads, etc.
5
+
6
+ ## Installation
7
+
8
+ $ gem install img_dl
9
+
10
+ ## Usage
11
+
12
+ Usage: img_dl [OPTION]... URL SAVEPATH
13
+
14
+ Search for images at URL and save them to SAVEPATH. You can specify a regular expression as a filter, set a download limit, etc.
15
+
16
+ Example: img_dl http://google.com /home/me/download/icons
17
+
18
+ OPTION:
19
+ -r Enable recursive search. When this option is given you should also pass -ul or -il to limit the search; otherwise the program may never stop
20
+ -ul Limit the number of URLs collected recursively; only used together with -r
21
+ -il Limit download images count
22
+ -ur Regex filter for recursive URLs, Example: -ur www.foo.bai\/\?page=\d+
23
+ -ir Regex filter for images,Example -ir .gif$
24
+ -pf Prefix for the saved file names
25
+ -in Interval in seconds to wait before each page request, default value is 0
26
+ -h print this help
27
+ -version Print version
28
+
29
+ Examples :
30
+
31
+ Download 30 pictures from http://sample.tv/image and save to ./images
32
+ -r means recursive search and -ur means only URLs matching the regex are searched
33
+ `$ img_dl -r -il 30 -ur sample.tv/image/page=\d+ http://sample.tv/image ./images`
34
+
35
+ ## Contributing
36
+
37
+ 1. Fork it
38
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
39
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
40
+ 4. Push to the branch (`git push origin my-new-feature`)
41
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+ require 'img_dl'
3
+ require 'img_dl/cli'
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'img_dl/version'
5
+
6
# Gem specification for img_dl.
#
# The runtime dependencies are declared here so that `gem install img_dl`
# pulls in every library that lib/img_dl.rb and lib/img_dl/parser.rb require.
Gem::Specification.new do |gem|
  gem.name          = "img_dl"
  gem.version       = ImgDl::VERSION
  gem.authors       = ["jjy"]
  gem.email         = ["jjyruby@gmail.com"]
  gem.description   = %q{Download images from web,support regexp and recursive. more args type "img_dl -h"}
  gem.summary       = %q{Download images from web,support regexp and recursive,use eventmachine}
  gem.homepage      = "http://github.com/jjyr/img_dl"
  gem.licenses      = ["MIT"] # matches LICENSE.txt

  gem.files         = `git ls-files`.split($/)
  gem.executables   = ["img_dl"]
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "eventmachine"
  gem.add_dependency "em-http-request" # require 'em-http'
  gem.add_dependency "mechanize"
  gem.add_dependency "activesupport"   # require 'active_support/core_ext'
end
@@ -0,0 +1,10 @@
1
+ require "img_dl/version"
2
+
3
+ require 'mechanize'
4
+ require 'uri'
5
+ require 'eventmachine'
6
+ require 'em-http'
7
+ require_relative 'img_dl/parser'
8
+
9
+ module ImgDl
10
+ end
@@ -0,0 +1,121 @@
1
+ require 'img_dl'
2
+ require 'fileutils'
3
+ require 'uri'
4
+
5
module ImgDl
  # Command-line front end for img_dl.
  #
  # Turns an argument vector into an ImgDl::Parser, starts it on a background
  # thread and renders a status screen for it.
  module Cli
    # Returns the shell command that clears the terminal for +platform+.
    #
    # Only native Windows builds (mswin/mingw) get "cls". A bare /win/ test
    # would also match "darwin" and run a non-existent command on macOS.
    def self.clear_command(platform = RUBY_PLATFORM)
      platform =~ /mswin|mingw/i ? "cls" : "clear"
    end

    CLEAR = clear_command

    # Single-quoted heredoc: the backslashes in the -ur example must be
    # printed literally instead of being consumed as string escapes.
    HELP = <<-'HELP'
Usage: img_dl [OPTION]... URL SAVEPATH

Search Images from URL and save to SAVEPATH.You can specified regular expression as a filter or set a limit num,etc.

Example: img_dl http://google.com /home/me/download/icons

OPTION:
-r Enable recursive search, when this option parsed you should option -ul or -il to limit search,otherwise the program maybe can't stop
-ul Limit recursive urls count, Only if you specified -r
-il Limit download images count
-ur Regex filter for recursive url, Example -ur www.foo.bai\/\?page=\d+
-ir Regex filter for images,Example -ir .gif$
-pf Save the file prefix
-in Interval, default value is 0
-h print this help
-version Print version
    HELP
    NOT_MATCH = "Arguments not match!"

    class << self
      # Prints the usage error followed by the help text and terminates the
      # process with a non-zero status so callers can detect the failure.
      def not_match
        puts NOT_MATCH
        puts HELP
        exit 1
      end

      # Creates +path+ (and any missing parents).
      def valid_save_path(path)
        FileUtils.mkdir_p(path)
      end

      # Parses +url+ into a URI; raises when it is not a valid URI.
      def valid_url(url)
        URI(url)
      end

      # Interprets +args+ (normally ARGV, which is consumed in the process).
      #
      # With fewer than two arguments only -h and -version are accepted and
      # nil is returned. Otherwise the last two arguments are URL and
      # SAVEPATH, everything before them is parsed as options, and a started
      # ImgDl::Parser is returned.
      def parse_to_options(args)
        if args.size < 2
          case args.first
          when '-h'       then puts HELP
          when '-version' then puts VERSION
          else not_match
          end
          return nil
        end

        save_path = args.pop
        url = args.pop
        options = {}
        # -r is the only flag without a value; remove it before pairing.
        options[:recursive] = args.delete('-r')
        not_match if args.size.odd?

        args.each_slice(2) do |opt, arg|
          case opt
          when '-ul' then options[:url_limit_count] = arg.to_i
          when '-il' then options[:image_limit_count] = arg.to_i
          when '-ur' then options[:url_reg] = Regexp.new(arg)
          when '-ir' then options[:image_reg] = Regexp.new(arg)
          when '-pf' then options[:prefix] = arg
          when '-in' then options[:interval] = arg.to_i
          else
            puts "option '#{opt}' not support! please check out img_dl -h"
            exit 1
          end
        end

        parser = ImgDl::Parser.new(url, save_path, options)
        # Run the crawl in the background; exit the process when it is done.
        Thread.start do
          parser.start
          puts 'All done.'
          exit
        end
        parser
      end

      # Clears the terminal and prints the current counters of +parser+.
      def prompt(parser)
        system CLEAR
        puts "url parser status: #{parser.status}"
        puts "downloader status: #{parser.dl_status}"
        puts "recursive urls: #{parser.url_count}"
        puts "images download queue: #{parser.image_count}"
        puts "downloaded images: #{parser.downloaded_image_count}"
        puts "successes: #{parser.success_download}"
        puts "errors: #{parser.error_urls.size}"
      end

      # Entry point: parses the process arguments.
      def run
        parse_to_options ARGV
      end
    end
  end
end
106
+
107
# Script entry: build and start the parser from the command line. When only
# help/version was requested ImgDl::Cli.run yields nil and nothing else runs.
if (active = ImgDl::Cli.run)
  # On exit, dump every collected error and draw the status screen a final time.
  at_exit do
    failures = active.error_urls
    until failures.empty?
      puts failures.shift
      puts
    end
    ImgDl::Cli.prompt(active)
  end

  $stdout.sync = true

  # Redraw the status screen once per second until the process exits.
  loop do
    ImgDl::Cli.prompt(active)
    sleep 1
  end
end
@@ -0,0 +1,17 @@
1
+ require 'active_support/core_ext'
2
+
3
module ImgDl
  # Mixin that exposes the entries of an options hash as methods.
  module Helper
    # For every key in +options+ defines two methods on the receiver:
    # `key`, returning the current value, and `key?`, returning whether that
    # value is `present?` (ActiveSupport).
    #
    # The methods are defined on the receiver's singleton class. Defining
    # them on the shared class would make every instance read the options
    # hash of whichever instance called this method last.
    def define_options_helper(options)
      options.each_key do |key|
        define_singleton_method(key) { options[key] }
        define_singleton_method("#{key}?") { options[key].present? }
      end
    end
  end
end
@@ -0,0 +1,201 @@
1
+ require 'thread'
2
+ require 'fileutils'
3
+ require 'active_support/core_ext'
4
+ require 'securerandom'
5
+ require_relative 'helper'
6
+
7
module ImgDl
  # Crawls pages starting at a URL, collects image URLs and downloads them.
  #
  # #start runs #parse on a background thread (Mechanize fetches pages and
  # fills the image queue) and #download on the calling thread (em-http
  # requests inside an EventMachine reactor). Progress is exposed through the
  # public readers so a front end can display it.
  class Parser
    include Helper

    Default_Options = {url_limit_count: nil, url_reg: nil, image_limit_count: nil, image_reg: nil, recursive: false, prefix: nil, interval: 0}

    attr_reader :agent, :origin_url, :options, :image_count, :url_count, :running, :error_urls, :downloaded_image_count, :success_download, :status, :dl_status
    alias running? running

    # url       - page to start from (String).
    # save_path - directory for downloaded files; created if missing.
    # options   - overrides for Default_Options; each key becomes a reader
    #             and a predicate via Helper#define_options_helper.
    def initialize(url, save_path, options = {})
      @agent = Mechanize.new
      @agent.user_agent_alias = 'Linux Mozilla'
      @origin_url = URI(url)
      @current_url = URI(url)
      @_urls = Hash.new(0)
      @_imgs = Hash.new(0)
      @save_path = save_path
      FileUtils.mkdir_p(save_path)
      @image_count = 0
      @url_count = 0
      @urls = Queue.new
      @error_urls = Queue.new
      enq_urls(url)
      @images = Queue.new
      @options = Default_Options.merge(options)
      define_options_helper(@options)
      @downloaded_image_count = 0
      @running = true
      @downloading = true
      @success_download = 0
      @status = "start"
      @dl_status = "ready"
    end

    # Starts page parsing on a new thread and downloads on the current one.
    # Returns when the download reactor has stopped.
    def start
      Thread.start { parse }
      download
    end

    # Fetches queued pages until a limit is hit or the URL queue is empty,
    # queueing images (and, in recursive mode, links) found on each page.
    #
    # Any error while fetching or scanning a page is recorded in error_urls
    # and the crawl continues. `running` is cleared in an ensure block so the
    # downloader is never left waiting for a parser thread that has died.
    def parse
      while next_parse?
        sleep interval
        @status = "get url"
        url = @urls.shift
        # Only strings need escaping; URI objects come from parse_links,
        # which has already escaped them.
        url = escape_url(url) if url.respond_to?(:gsub)
        begin
          @current_url = URI(url)
          page = @agent.get(url)
          parse_images(page)
          parse_links(page) if continue?
        rescue StandardError => e
          @error_urls << [url, e]
          puts e
        end
      end
    ensure
      @running = false
      @status = "parser complete"
    end

    # Request headers sent with every image download.
    def default_head
      @_default_head ||= {"USER-AGENT"=>"Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17", "ACCEPT-ENCODING"=>"gzip,deflate,sdch","ACCEPT" => '*/*', "ACCEPT-CHARSET"=>"UTF-8,*;q=0.5", "ACCEPT-LANGUAGE"=>"zh-CN,zh;q=0.8","connection" => "close"}
    end

    # Issues one HTTP request per queued image and runs the reactor until
    # every issued request has completed or failed.
    #
    # NOTE: the queue-draining loop runs inside the EM.run block and sleeps
    # while waiting for the parser, so responses are only processed after the
    # loop has finished.
    def download
      @dl_status = "start"
      @_download_image = 0
      EM.run do
        loop do
          limit_reached = image_limit_count && @_download_image >= image_limit_count
          drained = @images.empty? || limit_reached

          if drained && !running?
            @dl_status = "all done"
            break
          end

          if drained
            # The parser is still working; never request more than the limit.
            @dl_status = "wait parser"
            sleep 3
            next
          end

          @_download_image += 1
          @dl_status = "shift image url"
          image_uri = @images.shift
          @dl_status = "download image #{image_uri}"
          request_image(image_uri)
        end
        # With no request issued no callback will ever fire, so stop here.
        EM.stop if download_complete?
      end
      @dl_status = "download complete"
      @downloading = false
    end

    protected

    # Starts an asynchronous GET for +image_uri+; saves the body when the
    # response declares an image content type, otherwise records an error.
    # Stops the reactor once the last outstanding request has finished.
    def request_image(image_uri)
      http = EventMachine::HttpRequest.new(image_uri).get head: default_head
      http.callback do |res|
        type = res.response_header["CONTENT_TYPE"].to_s[/^image\/(\w+)/, 1]
        if type
          @success_download += 1
          save_image type, res.response
        else
          @error_urls << [image_uri, "image download error"]
        end
        @downloaded_image_count += 1
        @dl_status = "success: download image #{image_uri}"
        EM.stop if download_complete?
      end
      http.errback do |_res|
        @error_urls << [image_uri, "image download error"]
        @downloaded_image_count += 1
        @dl_status = "failed: download image #{image_uri}"
        EM.stop if download_complete?
      end
    end

    # True once the parser has finished and every issued request has been
    # answered. An empty queue alone is not enough: requests may still be
    # in flight.
    def download_complete?
      !running? && @downloaded_image_count >= @_download_image
    end

    def random_file_name
      SecureRandom.uuid
    end

    # Writes +content+ to "<save_path>/<prefix><name>.<type>" in binary mode.
    def save_image(name = random_file_name, type, content)
      file_name = File.join(@save_path, "#{prefix}#{name}.#{type}")
      File.open(file_name, "wb") { |io| io.write content }
    end

    # Returns the parsed URI, or false (recording the error) when +url+
    # cannot be parsed.
    def valid_url?(url)
      URI(url)
    rescue StandardError => e
      @error_urls << [url, e]
      false
    end

    # Queues +link+ for parsing unless it was seen before or is invalid.
    def enq_urls(link)
      return if link_dup?(link) || !valid_url?(link)
      @_urls[link.to_s] += 1
      @urls << link
      @url_count += 1
    end

    # Queues +src+ for download unless it was seen before or is invalid.
    def enq_images(src)
      return if image_dup?(src) || !valid_url?(src)
      @_imgs[src.to_s] += 1
      @images << src
      @image_count += 1
    end

    # The seen-tables are keyed by string form so a String and a URI that
    # name the same address count as duplicates.
    def link_dup?(link)
      @_urls.key?(link.to_s)
    end

    def image_dup?(src)
      @_imgs.key?(src.to_s)
    end

    # A link qualifies when it is new and, if url_reg is set, matches it.
    def valid_link?(link)
      return false if link_dup?(link)
      url_reg? ? !!(link.to_s =~ url_reg) : true
    end

    # Queues every acceptable link of +page+ for later parsing.
    def parse_links(page)
      @status = "parse urls"
      page.links.each do |link|
        target = absolute_uri(link.href)
        enq_urls(target) if target && valid_link?(target)
      end
    end

    # Queues every image of +page+ (filtered by image_reg when set).
    def parse_images(page)
      @status = "parse images"
      page.images.each do |img|
        src = absolute_uri(img.src)
        next unless src
        next if image_reg? && src.to_s !~ image_reg
        enq_images(src)
      end
    end

    # Resolves +ref+ against the page currently being parsed. Returns nil for
    # blank references and for references that cannot be turned into a URI.
    def absolute_uri(ref)
      return nil unless ref.present?
      URI.join(@current_url, escape_url(ref))
    rescue StandardError
      nil
    end

    # Percent-escapes characters that are unsafe in a URI. URI.escape was
    # removed in Ruby 3.0, so the RFC 2396 parser is called directly.
    def escape_url(text)
      @_url_escaper ||= (defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI::DEFAULT_PARSER)
      @_url_escaper.escape(text)
    end

    # True while neither the image limit nor the URL limit has been reached.
    def under_limits?
      (!image_limit_count? || @image_count < image_limit_count) &&
        (!url_limit_count? || @url_count < url_limit_count)
    end

    # Whether links of the current page should be followed.
    def continue?
      recursive? && under_limits?
    end

    # Whether another page should be fetched.
    def next_parse?
      under_limits? && !@urls.empty?
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the img_dl gem.
module ImgDl
  # Version string of the gem; read by img_dl.gemspec and printed by
  # `img_dl -version`.
  VERSION = '0.0.1'
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: img_dl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - jjy
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Download images from web,support regexp and recursive. more args type
15
+ "img_dl -h"
16
+ email:
17
+ - jjyruby@gmail.com
18
+ executables:
19
+ - img_dl
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - .gitignore
24
+ - Gemfile
25
+ - LICENSE.txt
26
+ - README.md
27
+ - Rakefile
28
+ - bin/img_dl
29
+ - img_dl.gemspec
30
+ - lib/img_dl.rb
31
+ - lib/img_dl/cli.rb
32
+ - lib/img_dl/helper.rb
33
+ - lib/img_dl/parser.rb
34
+ - lib/img_dl/version.rb
35
+ homepage: http://github.com/jjyr/img_dl
36
+ licenses: []
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ required_rubygems_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ! '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubyforge_project:
55
+ rubygems_version: 1.8.24
56
+ signing_key:
57
+ specification_version: 3
58
+ summary: Download images from web,support regexp and recursive,use eventmachine
59
+ test_files: []