image_downloader 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -40,21 +40,46 @@ After installation, you can use the following code as an example:
40
40
  # download images from exact places in the page
41
41
  downloader.parse(:collect => {:link_icon => true})
42
42
 
43
+ #####
44
+ # download images by regexp
45
+ downloader.parse(:regexp => /[^'"]+\.jpg/i)
46
+
43
47
  downloader.download()
44
48
 
45
- For parse method available following options:
46
- :any_looks_like_image => true
47
- (find all url which contain image extansion)
49
+ The following options are available for the "parse" method
50
+
51
+ # find all URLs which contain an image extension
52
+ :any_looks_like_image => true
53
+
54
+ # find images in specified location
55
+ :collect => {
56
+ :all => true, # all image places
57
+ :(img_src|a_href|style_url|link_icon) => true # specified location
58
+ }
59
+
60
+ # find by regexp
61
+ :regexp => /['"]([^'"]+\.jpg)[^'"]*['"]/i
62
+ :regexp => /[^'"]+\.jpg/i # the same, but shorter
63
+ :regexp => /[^'"]+\.css/ # other files can also be downloaded
64
+
65
+ # ignore URLs with images according to given parameters
66
+ :ignore_without => {:(extension|image_extension) => true}
67
+
68
+ # setting a preferred User-Agent (very important for avoiding 403, 404... responses from the server)
69
+ :user_agent => "ruby" # Mozilla/5.0 by default
48
70
 
49
- :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
50
- (find images in):
51
- * img_src - <img src="url">
52
- * a_href - <a href="url">
53
- * style_url - <element style="(background|background-image): url('url')">
54
- * link_icon - <link rel="shortcut icon" href="url">
71
+ Detailed location description
55
72
 
56
- :ignore_without => {:(extension|image_extension) => true}
57
- (ignore URLs with images according to given parameters)
73
+ * img_src - tag: img, attribute: src="url"
74
+ * a_href - tag: a, attribute: href="url"
75
+ * style_url - tag: any, attribute: style="(background|background-image): url('url')"
76
+ * link_icon - tag: link, attribute: rel="shortcut icon" href="url"
77
+
78
+ For "download" method you can use following directives
79
+
80
+ :parallel => true # for multi thread downloading (this is default if no options)
81
+ :consequentially => true, # for sequential downloading into a single stream
82
+ :user_agent => "ruby" # Mozilla/5.0 by default
58
83
 
59
84
  == Executables
60
85
  You can simply use the executed shell commands:
@@ -68,6 +93,18 @@ For download favicon only
68
93
  For download all, that is located in the places for pictures
69
94
  download_images url dir/
70
95
 
96
+ For download by regexp
97
+ download_by_regexp url dir/ "[^'\"]+\\.js"
98
+
99
+ == Debugging
100
+
101
+ "-d", "--debug"
102
+
103
+ To monitor the process of downloading, use the -d flag in the parameters.
104
+ In some cases a URI::InvalidURIError may be reported for a malformed image URL.
105
+
106
+ download_images url dir/ -d
107
+
71
108
  == Copyright
72
109
 
73
110
  Copyright (c) 2011 Malykh Oleg. See LICENSE.txt for
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'image_downloader'
4
+
5
+ downloader = ImageDownloader::Process.new(ARGV[0],ARGV[1])
6
+
7
+ downloader.parse(:regexp => Regexp.new(ARGV[2], true))
8
+
9
+ downloader.download()
@@ -26,6 +26,8 @@ module ImageDownloader
26
26
  class Process
27
27
  attr_accessor :argument, :images
28
28
 
29
+ DEFAULT_USER_AGENT = 'Mozilla/5.0'
30
+
29
31
  def initialize(url, path)
30
32
  @argument = Arguments.new(url, path)
31
33
  @argument.check
@@ -34,16 +36,21 @@ module ImageDownloader
34
36
  end
35
37
 
36
38
  # :any_looks_like_image => true
39
+ # :regexp => /[^'"]+\.jpg/i
37
40
  # :ignore_without => {:(extension|image_extension) => true}
38
41
  # Nokogiri gem is required:
39
42
  # :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
43
+ # :user_agent => 'Mozilla/5.0'
40
44
  def parse(h={:collect => {}, :ignore_without => {}})
41
45
  self.rebuild_collect_hash(h)
42
46
 
43
- parser = Parser.new(self.argument.url)
47
+ parser = Parser.new(self.argument.url, h[:user_agent] || DEFAULT_USER_AGENT)
44
48
  if h[:any_looks_like_image]
45
49
  parser.get_content_raw
46
50
  parser.get_images_raw(self.argument.path, h[:collect])
51
+ elsif h[:regexp]
52
+ parser.get_content_raw
53
+ parser.get_images_regexp(self.argument.path, h[:regexp])
47
54
  else
48
55
  parser.get_content
49
56
  parser.get_images(self.argument.path, h[:collect])
@@ -55,16 +62,25 @@ module ImageDownloader
55
62
  end
56
63
 
57
64
  # :(parallel|consequentially)
65
+ # :(parallel|consequentially) => true
66
+ # :user_agent => 'Mozilla/5.0'
58
67
  def download(*args)
59
- if !args.first || args.first == :parallel
60
- Download.parallel(self.images)
61
- elsif args.first == :consequentially
62
- Download.consequentially(self.images)
68
+ user_agent = args_hash_and_contain(args, :user_agent) || DEFAULT_USER_AGENT
69
+ if !args.first || args.first == :parallel || args_hash_and_contain(args, :parallel)
70
+ Download.parallel(self.images, user_agent)
71
+ elsif args.first == :consequentially || args_hash_and_contain(args, :consequentially)
72
+ Download.consequentially(self.images, user_agent)
73
+ else
74
+ p "Not correct argument for download method"
63
75
  end
64
76
  end
65
77
 
66
78
  protected
67
79
 
80
+ def args_hash_and_contain(args, sym)
81
+ ((args.first.class.to_s == "Hash") && !args.first.empty? && (args.first[sym]))
82
+ end
83
+
68
84
  def rebuild_collect_hash(h={})
69
85
  if !h[:collect] || h[:collect].empty? || h[:collect][:all]
70
86
  h[:collect] = Parser.all_image_places
@@ -1,21 +1,21 @@
1
1
  module ImageDownloader
2
2
  class Download
3
3
 
4
- def self.parallel(images)
4
+ def self.parallel(images, user_agent)
5
5
  threads = []
6
6
  for image in images
7
7
  threads << Thread.new(image) {|local_image|
8
8
  p "upload from url #{local_image.absolute_src} to file #{local_image.file_name}" if $debug_option
9
- local_image.download
9
+ local_image.download(user_agent)
10
10
  }
11
11
  end
12
12
  threads.each { |aThread| aThread.join }
13
13
  end
14
14
 
15
- def self.consequentially(images)
15
+ def self.consequentially(images, user_agent)
16
16
  for image in images
17
17
  p "upload from url #{image.absolute_src} to file #{image.file_name}" if $debug_option
18
- image.download
18
+ image.download(user_agent)
19
19
  end
20
20
  end
21
21
 
@@ -22,7 +22,7 @@ module ImageDownloader
22
22
  @absolute_src = ((@src =~ /http/) ? @src : ('http://' + page_host + '/' + @src))
23
23
  end
24
24
 
25
- def download
25
+ def download(user_agent)
26
26
  url = URI.parse(self.absolute_src)
27
27
  request = Net::HTTP::Get.new(url.path)
28
28
  Net::HTTP.start(url.host) {|http|
@@ -32,15 +32,17 @@ module ImageDownloader
32
32
  # - mechanize (main web client), slow
33
33
  # - wget, quick, but cannot support some ability (403, 404 responses)
34
34
  # - sockets, independent request, quick, but low-level (many lines of code)
35
- self.download_by_segment(http,request)
36
- # self.download_simple(http,request)
35
+ self.download_by_segment(http,request,user_agent)
36
+ # self.download_simple(http,request,user_agent)
37
37
  }
38
+ rescue URI::InvalidURIError
39
+ p "Error: bad URI: #{self.absolute_src}" if $debug_option
38
40
  end
39
41
 
40
- def download_by_segment(http,request)
42
+ def download_by_segment(http,request,user_agent)
41
43
  file = open(self.file_path_name, "wb")
42
44
  begin
43
- http.request_get(request.path, "User-Agent"=> "Mozilla/5.0") do |response|
45
+ http.request_get(request.path, "User-Agent"=> user_agent) do |response|
44
46
  response.read_body do |segment|
45
47
  file.write(segment)
46
48
  end
@@ -50,8 +52,8 @@ module ImageDownloader
50
52
  end
51
53
  end
52
54
 
53
- def download_simple(http,request)
54
- response = http.get(request.path, "User-Agent"=> "Mozilla/5.0")
55
+ def download_simple(http,request,user_agent)
56
+ response = http.get(request.path, "User-Agent"=> user_agent)
55
57
  open(self.file_path_name, "wb") { |file|
56
58
  file.write(response.body)
57
59
  }
@@ -14,22 +14,23 @@ end
14
14
 
15
15
  module ImageDownloader
16
16
  class Parser
17
- attr_accessor :url, :argument_url, :content, :images, :images_hash
17
+ attr_accessor :url, :argument_url, :content, :images, :images_hash, :user_agent
18
18
 
19
19
  A_HREF_IMAGE_PREFIX = '_a_href_'
20
20
  STYLE_URL_IMAGE_PREFIX = '_style_url_'
21
21
  LINK_ICON_IMAGE_PREFIX = '_link_icon_'
22
22
  COLLECT_METHODS_PREFIX = 'collect_from_'
23
23
 
24
- def initialize(url)
24
+ def initialize(url, user_agent)
25
25
  @argument_url = url
26
+ @user_agent = user_agent
26
27
  @url = URI.parse(url)
27
28
  @images = []
28
29
  @images_hash = {}
29
30
  end
30
31
 
31
32
  def get_content_raw
32
- @content = open(self.argument_url).read
33
+ @content = open(self.argument_url, 'User-Agent' => self.user_agent).read
33
34
  @content.gsub!(/[\n\r\t]+/,' ')
34
35
  end
35
36
 
@@ -40,8 +41,12 @@ module ImageDownloader
40
41
  }
41
42
  end
42
43
 
44
+ def get_images_regexp(path,regexp)
45
+ self.content.scan(regexp) {|src| self.push_to_images(path,src.to_s)}
46
+ end
47
+
43
48
  def get_content
44
- @content = Nokogiri::HTML(open(self.argument_url))
49
+ @content = Nokogiri::HTML(open(self.argument_url, 'User-Agent' => self.user_agent))
45
50
  end
46
51
 
47
52
  def get_images(path,h={})
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: image_downloader
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 21
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 0
10
- version: 0.2.0
9
+ - 1
10
+ version: 0.2.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Malykh Oleg
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-07-15 00:00:00 +04:00
18
+ date: 2011-07-19 00:00:00 +04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -100,6 +100,7 @@ executables:
100
100
  - download_any_images
101
101
  - download_images
102
102
  - download_icon
103
+ - download_by_regexp
103
104
  extensions: []
104
105
 
105
106
  extra_rdoc_files:
@@ -117,6 +118,7 @@ files:
117
118
  - bin/download_any_images
118
119
  - bin/download_images
119
120
  - bin/download_icon
121
+ - bin/download_by_regexp
120
122
  has_rdoc: true
121
123
  homepage: http://github.com/Fotom/image_downloader
122
124
  licenses: