RubyGems - image_downloader - Versions diffs - 0.2.0 → 0.2.1 - Mend

image_downloader 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/README.rdoc +48 -11
data/bin/download_by_regexp +9 -0
data/lib/image_downloader.rb +21 -5
data/lib/image_downloader/download.rb +4 -4
data/lib/image_downloader/images.rb +9 -7
data/lib/image_downloader/parser.rb +9 -4
metadata +6 -4

data/README.rdoc CHANGED Viewed

@@ -40,21 +40,46 @@ After installation, you can use the following code as an example:
   # download images from exact places in the page
   downloader.parse(:collect => {:link_icon => true})
+  #####
+  # download images by regexp
+  downloader.parse(:regexp => /[^'"]+\.jpg/i)
   downloader.download()
-For parse method available following options:
-:any_looks_like_image => true
-(find all url which contain image extansion)
+The "parse" method accepts the following options
+  # find all URLs which contain an image extension
+  :any_looks_like_image => true
+  # find images in specified location
+  :collect => {
+    :all => true, # all image places
+    :(img_src|a_href|style_url|link_icon) => true # specified location
+  }
+  # find by regexp
+  :regexp => /['"]([^'"]+\.jpg)[^'"]*['"]/i
+  :regexp => /[^'"]+\.jpg/i # the same, but shorter
+  :regexp => /[^'"]+\.css/  # other files can also be downloaded
+  # ignore URLs with images according to given parameters
+  :ignore_without => {:(extension|image_extension) => true}
+  # setting your preferred User-Agent (very important for avoiding 403, 404... responses from the server)
+  :user_agent => "ruby" # Mozilla/5.0 by default
-:collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
-(find images in):
-* img_src - <img src="url">
-* a_href - <a href="url">
-* style_url - <element style="(background|background-image): url('url')">
-* link_icon - <link rel="shortcut icon" href="url">
+Detailed location description
-:ignore_without => {:(extension|image_extension) => true}
-(ignore URLs with images according to given parameters)
+* img_src   - tag: img,  attribute: src="url"
+* a_href    - tag: a,    attribute: href="url"
+* style_url - tag: any,  attribute: style="(background|background-image): url('url')"
+* link_icon - tag: link, attribute: rel="shortcut icon" href="url"
+The "download" method accepts the following directives
+  :parallel => true # for multi thread downloading (this is default if no options)
+  :consequentially => true, # for sequential downloading into a single stream
+  :user_agent => "ruby" # Mozilla/5.0 by default
 == Executables
 You can simply use the executable shell commands:
@@ -68,6 +93,18 @@ For download favicon only
 For download all, that is located in the places for pictures
   download_images url dir/
+For download by regexp
+  download_by_regexp url dir/ "[^'\"]+\\.js"
+== Debugging
+"-d", "--debug"
+To monitor the process of downloading, use the -d flag in the parameters.
+In some cases a URI::InvalidURIError may occur; with -d it is reported instead of being silently skipped.
+  download_images url dir/ -d
 == Copyright
 Copyright (c) 2011 Malykh Oleg. See LICENSE.txt for

data/bin/download_by_regexp ADDED Viewed

@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+require 'image_downloader'
+downloader = ImageDownloader::Process.new(ARGV[0],ARGV[1])
+downloader.parse(:regexp => Regexp.new(ARGV[2], true))
+downloader.download()

data/lib/image_downloader.rb CHANGED Viewed

@@ -26,6 +26,8 @@ module ImageDownloader
   class Process
     attr_accessor :argument, :images
+    DEFAULT_USER_AGENT = 'Mozilla/5.0'
     def initialize(url, path)
       @argument = Arguments.new(url, path)
       @argument.check
@@ -34,16 +36,21 @@ module ImageDownloader
     end
     # :any_looks_like_image => true
+    # :regexp => /[^'"]+\.jpg/i
     # :ignore_without => {:(extension|image_extension) => true}
     # Nokogiri gem is required:
     # :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
+    # :user_agent => 'Mozilla/5.0'
     def parse(h={:collect => {}, :ignore_without => {}})
       self.rebuild_collect_hash(h)
-      parser = Parser.new(self.argument.url)
+      parser = Parser.new(self.argument.url, h[:user_agent] || DEFAULT_USER_AGENT)
       if h[:any_looks_like_image]
         parser.get_content_raw
         parser.get_images_raw(self.argument.path, h[:collect])
+      elsif h[:regexp]
+        parser.get_content_raw
+        parser.get_images_regexp(self.argument.path, h[:regexp])
       else
         parser.get_content
         parser.get_images(self.argument.path, h[:collect])
@@ -55,16 +62,25 @@ module ImageDownloader
     end
     # :(parallel|consequentially)
+    # :(parallel|consequentially) => true
+    # :user_agent => 'Mozilla/5.0'
     def download(*args)
-      if !args.first || args.first == :parallel
-        Download.parallel(self.images)
-      elsif args.first == :consequentially
-        Download.consequentially(self.images)
+      user_agent = args_hash_and_contain(args, :user_agent) || DEFAULT_USER_AGENT
+      if !args.first || args.first == :parallel || args_hash_and_contain(args, :parallel)
+        Download.parallel(self.images, user_agent)
+      elsif args.first == :consequentially || args_hash_and_contain(args, :consequentially)
+        Download.consequentially(self.images, user_agent)
+      else
+        p "Not correct argument for download method"
       end
     end
     protected
+    def args_hash_and_contain(args, sym)
+      ((args.first.class.to_s == "Hash") && !args.first.empty? && (args.first[sym]))
+    end
     def rebuild_collect_hash(h={})
       if !h[:collect] || h[:collect].empty? || h[:collect][:all]
         h[:collect] = Parser.all_image_places

data/lib/image_downloader/download.rb CHANGED Viewed

@@ -1,21 +1,21 @@
 module ImageDownloader
   class Download
-    def self.parallel(images)
+    def self.parallel(images, user_agent)
       threads = []
       for image in images
         threads << Thread.new(image) {|local_image|
           p "upload from url #{local_image.absolute_src} to file #{local_image.file_name}" if $debug_option
-          local_image.download
+          local_image.download(user_agent)
         }
       end
       threads.each { |aThread|  aThread.join }
     end
-    def self.consequentially(images)
+    def self.consequentially(images, user_agent)
       for image in images
         p "upload from url #{image.absolute_src} to file #{image.file_name}" if $debug_option
-        image.download
+        image.download(user_agent)
       end
     end

data/lib/image_downloader/images.rb CHANGED Viewed

@@ -22,7 +22,7 @@ module ImageDownloader
       @absolute_src = ((@src =~ /http/) ? @src : ('http://' + page_host + '/' +  @src))
     end
-    def download
+    def download(user_agent)
       url = URI.parse(self.absolute_src)
       request = Net::HTTP::Get.new(url.path)
       Net::HTTP.start(url.host) {|http|
@@ -32,15 +32,17 @@ module ImageDownloader
         # - mechanize (main web client), slow
         # - wget, quick, but cannot support some ability (403, 404 responses)
         # - sockets, independent request, quick, but low-level (many lines of code)
-        self.download_by_segment(http,request)
-        # self.download_simple(http,request)
+        self.download_by_segment(http,request,user_agent)
+        # self.download_simple(http,request,user_agent)
       }
+    rescue URI::InvalidURIError
+      p "Error: bad URI: #{self.absolute_src}"  if $debug_option
     end
-    def download_by_segment(http,request)
+    def download_by_segment(http,request,user_agent)
       file = open(self.file_path_name, "wb")
       begin
-        http.request_get(request.path, "User-Agent"=> "Mozilla/5.0") do |response|
+        http.request_get(request.path, "User-Agent"=> user_agent) do |response|
           response.read_body do |segment|
             file.write(segment)
           end
@@ -50,8 +52,8 @@ module ImageDownloader
       end
     end
-    def download_simple(http,request)
-      response = http.get(request.path, "User-Agent"=> "Mozilla/5.0")
+    def download_simple(http,request,user_agent)
+      response = http.get(request.path, "User-Agent"=> user_agent)
       open(self.file_path_name, "wb") { |file|
         file.write(response.body)
       }

data/lib/image_downloader/parser.rb CHANGED Viewed

@@ -14,22 +14,23 @@ end
 module ImageDownloader
   class Parser
-    attr_accessor :url, :argument_url, :content, :images, :images_hash
+    attr_accessor :url, :argument_url, :content, :images, :images_hash, :user_agent
     A_HREF_IMAGE_PREFIX = '_a_href_'
     STYLE_URL_IMAGE_PREFIX = '_style_url_'
     LINK_ICON_IMAGE_PREFIX = '_link_icon_'
     COLLECT_METHODS_PREFIX = 'collect_from_'
-    def initialize(url)
+    def initialize(url, user_agent)
       @argument_url = url
+      @user_agent = user_agent
       @url = URI.parse(url)
       @images = []
       @images_hash = {}
     end
     def get_content_raw
-      @content = open(self.argument_url).read
+      @content = open(self.argument_url, 'User-Agent' => self.user_agent).read
       @content.gsub!(/[\n\r\t]+/,' ')
     end
@@ -40,8 +41,12 @@ module ImageDownloader
       }
     end
+    def get_images_regexp(path,regexp)
+      self.content.scan(regexp) {|src| self.push_to_images(path,src.to_s)}
+    end
     def get_content
-      @content = Nokogiri::HTML(open(self.argument_url))
+      @content = Nokogiri::HTML(open(self.argument_url, 'User-Agent' => self.user_agent))
     end
     def get_images(path,h={})

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: image_downloader
 version: !ruby/object:Gem::Version
-  hash: 23
+  hash: 21
   prerelease: false
   segments:
   - 0
   - 2
-  - 0
-  version: 0.2.0
+  - 1
+  version: 0.2.1
 platform: ruby
 authors:
 - Malykh Oleg
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-07-15 00:00:00 +04:00
+date: 2011-07-19 00:00:00 +04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -100,6 +100,7 @@ executables:
 - download_any_images
 - download_images
 - download_icon
+- download_by_regexp
 extensions: []
 extra_rdoc_files:
@@ -117,6 +118,7 @@ files:
 - bin/download_any_images
 - bin/download_images
 - bin/download_icon
+- bin/download_by_regexp
 has_rdoc: true
 homepage: http://github.com/Fotom/image_downloader
 licenses: