RubyGems - image_downloader - Versions diffs - 0.2.0 → 0.2.1 - Mend

image_downloader 0.2.0 → 0.2.1

Files changed (7) hide show

data/README.rdoc +48 -11
data/bin/download_by_regexp +9 -0
data/lib/image_downloader.rb +21 -5
data/lib/image_downloader/download.rb +4 -4
data/lib/image_downloader/images.rb +9 -7
data/lib/image_downloader/parser.rb +9 -4
metadata +6 -4

data/README.rdoc CHANGED Viewed

@@ -40,21 +40,46 @@ After installation, you can use the following code as an example:
   # download images from exact places in the page
   downloader.parse(:collect => {:link_icon => true})
+  #####
+  # download images by regexp
+  downloader.parse(:regexp => /[^'"]+\.jpg/i)
   downloader.download()
-For parse method available following options:
-:any_looks_like_image => true
-(find all url which contain image extansion)
+For the "parse" method, the following options are available
+  # find all URLs that contain an image extension
+  :any_looks_like_image => true
+  # find images in specified location
+  :collect => {
+    :all => true, # all image places
+    :(img_src|a_href|style_url|link_icon) => true # specified location
+  }
+  # find by regexp
+  :regexp => /['"]([^'"]+\.jpg)[^'"]*['"]/i
+  :regexp => /[^'"]+\.jpg/i # the same, but shorter
+  :regexp => /[^'"]+\.css/  # other files can also be downloaded
+  # ignore URLs with images according to given parameters
+  :ignore_without => {:(extension|image_extension) => true}
+  # set a preferred User-Agent (very important for avoiding 403, 404... responses from the server)
+  :user_agent => "ruby" # Mozilla/5.0 by default
-:collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
-(find images in):
-* img_src - <img src="url">
-* a_href - <a href="url">
-* style_url - <element style="(background|background-image): url('url')">
-* link_icon - <link rel="shortcut icon" href="url">
+Detailed location description
-:ignore_without => {:(extension|image_extension) => true}
-(ignore URLs with images according to given parameters)
+* img_src   - tag: img,  attribute: src="url"
+* a_href    - tag: a,    attribute: href="url"
+* style_url - tag: any,  attribute: style="(background|background-image): url('url')"
+* link_icon - tag: link, attribute: rel="shortcut icon" href="url"
+For the "download" method you can use the following directives
+  :parallel => true # for multi-threaded downloading (this is the default if no options are given)
+  :consequentially => true # for sequential downloading in a single thread
+  :user_agent => "ruby" # Mozilla/5.0 by default
 == Executables
 You can simply use the executed shell commands:
@@ -68,6 +93,18 @@ For download favicon only
 For download all, that is located in the places for pictures
   download_images url dir/
+For download by regexp
+  download_by_regexp url dir/ "[^'\"]+\\.js"
+== Debugging
+"-d", "--debug"
+To monitor the download process, pass the -d flag in the parameters.
+In some cases a URI::InvalidURIError may occur.
+  download_images url dir/ -d
 == Copyright
 Copyright (c) 2011 Malykh Oleg. See LICENSE.txt for

data/bin/download_by_regexp ADDED Viewed

@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+require 'image_downloader'
+downloader = ImageDownloader::Process.new(ARGV[0],ARGV[1])
+downloader.parse(:regexp => Regexp.new(ARGV[2], true))
+downloader.download()

data/lib/image_downloader.rb CHANGED Viewed

@@ -26,6 +26,8 @@ module ImageDownloader
   class Process
     attr_accessor :argument, :images
+    DEFAULT_USER_AGENT = 'Mozilla/5.0'
     def initialize(url, path)
       @argument = Arguments.new(url, path)
       @argument.check
@@ -34,16 +36,21 @@ module ImageDownloader
     end
     # :any_looks_like_image => true
+    # :regexp => /[^'"]+\.jpg/i
     # :ignore_without => {:(extension|image_extension) => true}
     # Nokogiri gem is required:
     # :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
+    # :user_agent => 'Mozilla/5.0'
     def parse(h={:collect => {}, :ignore_without => {}})
       self.rebuild_collect_hash(h)
-      parser = Parser.new(self.argument.url)
+      parser = Parser.new(self.argument.url, h[:user_agent] || DEFAULT_USER_AGENT)
       if h[:any_looks_like_image]
         parser.get_content_raw
         parser.get_images_raw(self.argument.path, h[:collect])
+      elsif h[:regexp]
+        parser.get_content_raw
+        parser.get_images_regexp(self.argument.path, h[:regexp])
       else
         parser.get_content
         parser.get_images(self.argument.path, h[:collect])
@@ -55,16 +62,25 @@ module ImageDownloader
     end
     # :(parallel|consequentially)
+    # :(parallel|consequentially) => true
+    # :user_agent => 'Mozilla/5.0'
     def download(*args)
-      if !args.first || args.first == :parallel
-        Download.parallel(self.images)
-      elsif args.first == :consequentially
-        Download.consequentially(self.images)
+      user_agent = args_hash_and_contain(args, :user_agent) || DEFAULT_USER_AGENT
+      if !args.first || args.first == :parallel || args_hash_and_contain(args, :parallel)
+        Download.parallel(self.images, user_agent)
+      elsif args.first == :consequentially || args_hash_and_contain(args, :consequentially)
+        Download.consequentially(self.images, user_agent)
+      else
+        p "Not correct argument for download method"
       end
     end
     protected
+    def args_hash_and_contain(args, sym)
+      ((args.first.class.to_s == "Hash") && !args.first.empty? && (args.first[sym]))
+    end
     def rebuild_collect_hash(h={})
       if !h[:collect] || h[:collect].empty? || h[:collect][:all]
         h[:collect] = Parser.all_image_places

data/lib/image_downloader/download.rb CHANGED Viewed

@@ -1,21 +1,21 @@
 module ImageDownloader
   class Download
-    def self.parallel(images)
+    def self.parallel(images, user_agent)
       threads = []
       for image in images
         threads << Thread.new(image) {|local_image|
           p "upload from url #{local_image.absolute_src} to file #{local_image.file_name}" if $debug_option
-          local_image.download
+          local_image.download(user_agent)
         }
       end
       threads.each { |aThread|  aThread.join }
     end
-    def self.consequentially(images)
+    def self.consequentially(images, user_agent)
       for image in images
         p "upload from url #{image.absolute_src} to file #{image.file_name}" if $debug_option
-        image.download
+        image.download(user_agent)
       end
     end

data/lib/image_downloader/images.rb CHANGED Viewed

@@ -22,7 +22,7 @@ module ImageDownloader
       @absolute_src = ((@src =~ /http/) ? @src : ('http://' + page_host + '/' +  @src))
     end
-    def download
+    def download(user_agent)
       url = URI.parse(self.absolute_src)
       request = Net::HTTP::Get.new(url.path)
       Net::HTTP.start(url.host) {|http|
@@ -32,15 +32,17 @@ module ImageDownloader
         # - mechanize (main web client), slow
         # - wget, quick, but cannot support some ability (403, 404 responses)
         # - sockets, independent request, quick, but low-level (many lines of code)
-        self.download_by_segment(http,request)
-        # self.download_simple(http,request)
+        self.download_by_segment(http,request,user_agent)
+        # self.download_simple(http,request,user_agent)
       }
+    rescue URI::InvalidURIError
+      p "Error: bad URI: #{self.absolute_src}"  if $debug_option
     end
-    def download_by_segment(http,request)
+    def download_by_segment(http,request,user_agent)
       file = open(self.file_path_name, "wb")
       begin
-        http.request_get(request.path, "User-Agent"=> "Mozilla/5.0") do |response|
+        http.request_get(request.path, "User-Agent"=> user_agent) do |response|
           response.read_body do |segment|
             file.write(segment)
           end
@@ -50,8 +52,8 @@ module ImageDownloader
       end
     end
-    def download_simple(http,request)
-      response = http.get(request.path, "User-Agent"=> "Mozilla/5.0")
+    def download_simple(http,request,user_agent)
+      response = http.get(request.path, "User-Agent"=> user_agent)
       open(self.file_path_name, "wb") { |file|
         file.write(response.body)
       }

data/lib/image_downloader/parser.rb CHANGED Viewed

@@ -14,22 +14,23 @@ end
 module ImageDownloader
   class Parser
-    attr_accessor :url, :argument_url, :content, :images, :images_hash
+    attr_accessor :url, :argument_url, :content, :images, :images_hash, :user_agent
     A_HREF_IMAGE_PREFIX = '_a_href_'
     STYLE_URL_IMAGE_PREFIX = '_style_url_'
     LINK_ICON_IMAGE_PREFIX = '_link_icon_'
     COLLECT_METHODS_PREFIX = 'collect_from_'
-    def initialize(url)
+    def initialize(url, user_agent)
       @argument_url = url
+      @user_agent = user_agent
       @url = URI.parse(url)
       @images = []
       @images_hash = {}
     end
     def get_content_raw
-      @content = open(self.argument_url).read
+      @content = open(self.argument_url, 'User-Agent' => self.user_agent).read
       @content.gsub!(/[\n\r\t]+/,' ')
     end
@@ -40,8 +41,12 @@ module ImageDownloader
       }
     end
+    def get_images_regexp(path,regexp)
+      self.content.scan(regexp) {|src| self.push_to_images(path,src.to_s)}
+    end
     def get_content
-      @content = Nokogiri::HTML(open(self.argument_url))
+      @content = Nokogiri::HTML(open(self.argument_url, 'User-Agent' => self.user_agent))
     end
     def get_images(path,h={})

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: image_downloader
 version: !ruby/object:Gem::Version
-  hash: 23
+  hash: 21
   prerelease: false
   segments:
   - 0
   - 2
-  - 0
-  version: 0.2.0
+  - 1
+  version: 0.2.1
 platform: ruby
 authors:
 - Malykh Oleg
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-07-15 00:00:00 +04:00
+date: 2011-07-19 00:00:00 +04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -100,6 +100,7 @@ executables:
 - download_any_images
 - download_images
 - download_icon
+- download_by_regexp
 extensions: []
 extra_rdoc_files:
@@ -117,6 +118,7 @@ files:
 - bin/download_any_images
 - bin/download_images
 - bin/download_icon
+- bin/download_by_regexp
 has_rdoc: true
 homepage: http://github.com/Fotom/image_downloader
 licenses: