RubyGems - img_dl - Versions diffs - 0.0.1 - Mend

img_dl 0.0.1

Files changed (13) hide show

data/.gitignore ADDED

@@ -0,0 +1,18 @@
+*.gem
+*.rbc
+*.swp
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED

@@ -0,0 +1,7 @@
+source 'https://rubygems.org'
+gem 'eventmachine'
+gem 'mechanize'
+gem 'active_support'
+# Specify your gem's dependencies in img_dl.gemspec
+gemspec

data/LICENSE.txt ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2013 TODO: Write your name
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,41 @@
+# ImgDl
+A tool for downloading images.
+Supports recursive search, regex filters, asynchronous downloads, etc.
+## Installation
+    $ gem install img_dl
+## Usage
+Usage: img_dl [OPTION]... URL SAVEPATH
+Search for images at URL and save them to SAVEPATH. You can specify a regular expression as a filter, set a download limit, etc.
+Example: img_dl http://google.com /home/me/download/icons
+OPTION:
+  -r   Enable recursive search. When this option is given you should also pass -ul or -il to limit the search; otherwise the program may never stop
+  -ul  Limit the number of recursive URLs; only applies if you specified -r
+  -il  Limit the number of images to download
+  -ur  Regex filter for recursive URLs, Example  -ur www.foo.bai\/\?page=\d+
+  -ir  Regex filter for images, Example  -ir .gif$
+  -pf  Prefix for saved file names
+  -in  Interval between page requests, default value is 0
+  -h   print this help
+  -version  Print version
+Examples :
+    Download 30 pictures from http://sample.tv/image and save to ./images
+    -r means recursive search and -ur means only matching URLs are searched
+    `$ img_dl -r -il 30 -ur sample.tv/image/page=\d+ http://sample.tv/image ./images`
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED

@@ -0,0 +1 @@
+require "bundler/gem_tasks"

data/bin/img_dl ADDED

@@ -0,0 +1,3 @@
+#!/usr/bin/env ruby
+require 'img_dl'
+require 'img_dl/cli'

data/img_dl.gemspec ADDED

@@ -0,0 +1,19 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'img_dl/version'
+Gem::Specification.new do |gem|
+  gem.name          = "img_dl"
+  gem.version       = ImgDl::VERSION
+  gem.authors       = ["jjy"]
+  gem.email         = ["jjyruby@gmail.com"]
+  gem.description   = %q{Download images from web,support regexp and recursive. more args type "img_dl -h"}
+  gem.summary       = %q{Download images from web,support regexp and recursive,use eventmachine}
+  gem.homepage      = "http://github.com/jjyr/img_dl"
+  gem.files         = `git ls-files`.split($/)
+  gem.executables   = %Q{img_dl}
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+end

data/lib/img_dl.rb ADDED

@@ -0,0 +1,10 @@
+require "img_dl/version"
+require 'mechanize'
+require 'uri'
+require 'eventmachine'
+require 'em-http'
+require_relative 'img_dl/parser'
+# Top-level namespace of the img_dl gem; its members are defined by the
+# files required above.
+module ImgDl; end

data/lib/img_dl/cli.rb ADDED

@@ -0,0 +1,121 @@
+require 'img_dl'
+require 'fileutils'
+require 'uri'
+module ImgDl
+  module Cli
+    CLEAR = case RUBY_PLATFORM
+            when /win/i, /ming/i
+              "cls"
+            else
+              "clear"
+            end
+    HELP = <<-HELP
+Usage: img_dl [OPTION]... URL SAVEPATH
+Search Images from URL and save to SAVEPATH.You can specified regular expression as a filter or set a limit num,etc.
+Example: img_dl http://google.com /home/me/download/icons
+OPTION:
+  -r   Enable recursive search, when this option parsed you should option -ul or -il to limit search,otherwise the program maybe can't stop
+  -ul  Limit recursive urls count, Only if you specified -r
+  -il  Limit download images count
+  -ur  Regex filter for recursive url, Example  -ur www.foo.bai\/\?page=\d+
+  -ir  Regex filter for images,Example  -ir .gif$
+  -pf  Save the file prefix
+  -in  Interval, default value is 0
+  -h   print this help
+  -version  Print version
+    HELP
+    NOT_MATCH = "Arguments not match!"
+    class << self
+      def not_match
+        puts NOT_MATCH
+        puts HELP
+        exit
+      end
+      def valid_save_path path
+        FileUtils.mkdir_p path
+      end
+      def valid_url url
+        URI url
+      end
+      def parse_to_options args
+        case args.size
+        when 0,1
+          case args.first
+          when '-h'
+            puts HELP
+          when '-version'
+            puts VERSION
+          else
+            not_match
+          end
+        else
+          save_path = args.pop
+          url = args.pop
+          options = {}
+          options[:recursive] = args.delete '-r'
+          not_match if args.size.odd?
+          args.each_slice(2) do |opt,arg|
+            case opt
+            when '-ul'
+              options[:url_limit_count] = arg.to_i
+            when '-il'
+              options[:image_limit_count] = arg.to_i
+            when '-ur'
+              options[:url_reg] = Regexp.new arg
+            when '-ir'
+              options[:image_reg] = Regexp.new arg
+            when '-pf'
+              options[:prefix] = arg
+            when '-in'
+              options[:interval] = arg.to_i
+            else
+              puts "option '#{opt}' not support! please check out img_dl -h"
+            exit
+            end
+          end
+          parser = ImgDl::Parser.new(url,save_path,options)
+          Thread.start{parser.start;puts 'All done.';exit}
+          parser
+        end
+      end
+      def prompt parser
+        system CLEAR
+        puts "url parser status: #{parser.status}"
+        puts "downloader status: #{parser.dl_status}"
+        puts "recursive urls: #{parser.url_count}"
+        puts "images download queue: #{parser.image_count}"
+        puts "downloaded images: #{parser.downloaded_image_count}"
+        puts "successes: #{parser.success_download}"
+        puts "errors: #{parser.error_urls.size}"
+      end
+      def run
+        parse_to_options ARGV
+      end
+    end
+  end
+end
+parser = ImgDl::Cli.run
+if parser
+  at_exit do
+    until parser.error_urls.empty?
+      puts parser.error_urls.shift
+      puts
+    end
+    ImgDl::Cli.prompt parser
+  end
+  $stdout.sync = true
+  loop do
+    ImgDl::Cli.prompt parser
+    sleep 1
+  end
+end

data/lib/img_dl/helper.rb ADDED

@@ -0,0 +1,17 @@
+require 'active_support/core_ext'
+module ImgDl
+  module Helper
+    def define_options_helper options
+      options.each_key do |k|
+        self.class.send :define_method,k do
+          options[k]
+        end
+        self.class.send :define_method,"#{k}?" do
+          options[k].present?
+        end
+      end
+    end
+  end
+end

data/lib/img_dl/parser.rb ADDED

@@ -0,0 +1,201 @@
+require 'thread'
+require 'fileutils'
+require 'active_support/core_ext'
+require 'securerandom'
+require_relative 'helper'
+module ImgDl
+  class Parser
+    include Helper
+    Default_Options = {url_limit_count: nil,url_reg: nil,image_limit_count: nil,image_reg: nil,recursive: false,prefix: nil,interval: 0}
+    attr_reader :agent,:origin_url,:options,:image_count,:url_count,:running,:error_urls,:downloaded_image_count,:success_download,:status,:dl_status
+    alias running? running
+    def initialize url,save_path,options = {}
+      @agent = Mechanize.new
+      @agent.user_agent_alias = 'Linux Mozilla'
+      @origin_url = URI url
+      @current_url = URI url
+      @_urls = Hash.new 0
+      @_imgs = Hash.new 0
+      @save_path = save_path
+      FileUtils.mkdir_p save_path
+      @image_count = 0
+      @url_count = 0
+      @urls = Queue.new
+      @error_urls = Queue.new
+      enq_urls url
+      @images = Queue.new
+      @options = Default_Options.merge options
+      define_options_helper @options
+      @downloaded_image_count = 0
+      @running = true
+      @downloading = true
+      @success_download = 0
+      @status = "start"
+      @dl_status = "ready"
+    end
+    def start
+      Thread.start{parse}
+      download
+    end
+    def parse
+      loop do
+        break unless next_parse?
+        sleep interval
+        @status = "get url"
+        url = @urls.shift
+        url = URI.escape url if url.respond_to? :gsub
+        @current_url = URI url
+        begin
+          page = @agent.get url
+        rescue StandardError => e
+          @error_urls << [url,e]
+          puts e
+          next
+        end
+        parse_images page
+        if continue?
+          parse_links page
+        end
+      end
+      @running = false
+      @status = "parser complete"
+    end
+    def default_head
+      @_default_head ||= {"USER-AGENT"=>"Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17", "ACCEPT-ENCODING"=>"gzip,deflate,sdch","ACCEPT" => '*/*', "ACCEPT-CHARSET"=>"UTF-8,*;q=0.5", "ACCEPT-LANGUAGE"=>"zh-CN,zh;q=0.8","connection" => "close"}
+    end
+    def download
+      @dl_status = "start"
+      @_download_image = 0
+      EM.run do
+        loop do
+          if !running? && (@images.empty? || (image_limit_count && @_download_image >= image_limit_count))
+            @dl_status = "all done"
+            break
+          end
+          if @images.empty?
+            if running?
+              @dl_status = "wait parser"
+              sleep 3
+              redo
+            else
+              next
+            end
+          end
+          @_download_image += 1
+          @dl_status = "shift image url"
+          image_uri = @images.shift
+          @dl_status = "download image #{image_uri}"
+          http = EventMachine::HttpRequest.new(image_uri).get head: default_head
+          http.callback { |res|
+            res.response_header["CONTENT_TYPE"] =~ /^image\/(\w+)/
+              type = $1
+            if type
+              @success_download += 1
+              save_image type,res.response
+            else
+              @error_urls << [image_uri,"image download error"]
+            end
+            @downloaded_image_count += 1
+            @dl_status = "success: download image #{image_uri}"
+            download_complete? and EM.stop
+          }
+          http.errback  { |res|
+            @error_urls << [image_uri,"image download error"]
+            @downloaded_image_count += 1
+            @dl_status = "failed: download image #{image_uri}"
+            download_complete? and EM.stop
+          }
+        end
+      end
+      @dl_status = "download complete"
+      @downloading = false
+    end
+    protected
+    def download_complete?
+      !running? && (@images.empty? || (@downloaded_image_count >= @_download_image))
+    end
+    def random_file_name
+      SecureRandom.uuid
+    end
+    def save_image name = random_file_name,type,content
+      file_name = File.join @save_path,"#{prefix}#{name}.#{type}"
+      File.open(file_name,"w+") do |io|
+        io.binmode
+        io.write content
+      end
+    end
+    def valid_url? url
+      URI url
+    rescue StandardError => e
+      @error_urls << [url,e]
+      false
+    end
+    def enq_urls link
+      if !link_dup?(link) && valid_url?(link)
+        @_urls[link] += 1
+        @urls << link
+        @url_count += 1
+      end
+    end
+    def enq_images src
+      if !image_dup?(src) && valid_url?(src)
+        @_imgs[src] += 1
+        @images << src
+        @image_count += 1
+      end
+    end
+    def link_dup? link
+      @_urls.has_key? link
+    end
+    def image_dup? src
+      @_imgs.has_key? src
+    end
+    def valid_link? link
+      if url_reg?
+        link.to_s =~ url_reg && !link_dup?(link)
+      else
+        !link_dup?(link)
+      end
+    end
+    def parse_links page
+      @status = "parse urls"
+      links = page.links.map{|link| link.href.present? and URI.join @current_url,URI.escape(link.href) rescue nil}
+      links.select!{|link| link.present? and valid_link?(link)}
+      links.each{|link| enq_urls link}
+    end
+    def parse_images page
+      @status = "parse images"
+      images = page.images.map{|img| img.src.present? and URI.join @current_url,URI.escape(img.src)}
+      images.select!{|img| img.to_s =~ img_reg} if image_reg?
+      images.each{|img| enq_images img}
+    end
+    def continue?
+      recursive? && (image_limit_count? ? @image_count < image_limit_count : true) && (url_limit_count? ? @url_count < url_limit_count : true)
+    end
+    def next_parse?
+      (image_limit_count? ? @image_count < image_limit_count : true) && (url_limit_count? ? @url_count < url_limit_count : true) && !@urls.empty?
+    end
+  end
+end

data/lib/img_dl/version.rb ADDED

@@ -0,0 +1,3 @@
+module ImgDl
+  VERSION = "0.0.1"
+end

metadata ADDED

@@ -0,0 +1,59 @@
+--- !ruby/object:Gem::Specification
+name: img_dl
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- jjy
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-01-22 00:00:00.000000000 Z
+dependencies: []
+description: Download images from web,support regexp and recursive. more args type
+  "img_dl -h"
+email:
+- jjyruby@gmail.com
+executables:
+- img_dl
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/img_dl
+- img_dl.gemspec
+- lib/img_dl.rb
+- lib/img_dl/cli.rb
+- lib/img_dl/helper.rb
+- lib/img_dl/parser.rb
+- lib/img_dl/version.rb
+homepage: http://github.com/jjyr/img_dl
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Download images from web,support regexp and recursive,use eventmachine
+test_files: []