image_downloader 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -40,21 +40,46 @@ After installation, you can use the following code as an example:
40
40
  # download images from exact places in the page
41
41
  downloader.parse(:collect => {:link_icon => true})
42
42
 
43
+ #####
44
+ # download images by regexp
45
+ downloader.parse(:regexp => /[^'"]+\.jpg/i)
46
+
43
47
  downloader.download()
44
48
 
45
- For parse method available following options:
46
- :any_looks_like_image => true
47
- (find all url which contain image extansion)
49
+ The following options are available for the "parse" method
50
+
51
+ # find all URLs which contain an image extension
52
+ :any_looks_like_image => true
53
+
54
+ # find images in specified location
55
+ :collect => {
56
+ :all => true, # all image places
57
+ :(img_src|a_href|style_url|link_icon) => true # specified location
58
+ }
59
+
60
+ # find by regexp
61
+ :regexp => /['"]([^'"]+\.jpg)[^'"]*['"]/i
62
+ :regexp => /[^'"]+\.jpg/i # the same, but shorter
63
+ :regexp => /[^'"]+\.css/ # other files can also be downloaded
64
+
65
+ # ignore URLs with images according to given parameters
66
+ :ignore_without => {:(extension|image_extension) => true}
67
+
68
+ # set a custom User-Agent (very important for avoiding 403, 404... responses from the server)
69
+ :user_agent => "ruby" # Mozilla/5.0 by default
48
70
 
49
- :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
50
- (find images in):
51
- * img_src - <img src="url">
52
- * a_href - <a href="url">
53
- * style_url - <element style="(background|background-image): url('url')">
54
- * link_icon - <link rel="shortcut icon" href="url">
71
+ Detailed location description
55
72
 
56
- :ignore_without => {:(extension|image_extension) => true}
57
- (ignore URLs with images according to given parameters)
73
+ * img_src - tag: img, attribute: src="url"
74
+ * a_href - tag: a, attribute: href="url"
75
+ * style_url - tag: any, attribute: style="(background|background-image): url('url')"
76
+ * link_icon - tag: link, attribute: rel="shortcut icon" href="url"
77
+
78
+ For the "download" method you can use the following directives
79
+
80
+ :parallel => true # for multi-threaded downloading (this is the default if no options are given)
81
+ :consequentially => true # for sequential downloading in a single thread
82
+ :user_agent => "ruby" # Mozilla/5.0 by default
58
83
 
59
84
  == Executables
60
85
  You can simply use the executable shell commands:
@@ -68,6 +93,18 @@ For download favicon only
68
93
  To download everything that is located in the places for pictures
69
94
  download_images url dir/
70
95
 
96
+ To download by regexp
97
+ download_by_regexp url dir/ "[^'\"]+\\.js"
98
+
99
+ == Debugging
100
+
101
+ "-d", "--debug"
102
+
103
+ To monitor the process of downloading, use the -d flag in the parameters.
104
+ In some cases a URI::InvalidURIError may be reported.
105
+
106
+ download_images url dir/ -d
107
+
71
108
  == Copyright
72
109
 
73
110
  Copyright (c) 2011 Malykh Oleg. See LICENSE.txt for
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'image_downloader'
4
+
5
+ downloader = ImageDownloader::Process.new(ARGV[0],ARGV[1])
6
+
7
+ downloader.parse(:regexp => Regexp.new(ARGV[2], true))
8
+
9
+ downloader.download()
@@ -26,6 +26,8 @@ module ImageDownloader
26
26
  class Process
27
27
  attr_accessor :argument, :images
28
28
 
29
+ DEFAULT_USER_AGENT = 'Mozilla/5.0'
30
+
29
31
  def initialize(url, path)
30
32
  @argument = Arguments.new(url, path)
31
33
  @argument.check
@@ -34,16 +36,21 @@ module ImageDownloader
34
36
  end
35
37
 
36
38
  # :any_looks_like_image => true
39
+ # :regexp => /[^'"]+\.jpg/i
37
40
  # :ignore_without => {:(extension|image_extension) => true}
38
41
  # Nokogiri gem is required:
39
42
  # :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
43
+ # :user_agent => 'Mozilla/5.0'
40
44
  def parse(h={:collect => {}, :ignore_without => {}})
41
45
  self.rebuild_collect_hash(h)
42
46
 
43
- parser = Parser.new(self.argument.url)
47
+ parser = Parser.new(self.argument.url, h[:user_agent] || DEFAULT_USER_AGENT)
44
48
  if h[:any_looks_like_image]
45
49
  parser.get_content_raw
46
50
  parser.get_images_raw(self.argument.path, h[:collect])
51
+ elsif h[:regexp]
52
+ parser.get_content_raw
53
+ parser.get_images_regexp(self.argument.path, h[:regexp])
47
54
  else
48
55
  parser.get_content
49
56
  parser.get_images(self.argument.path, h[:collect])
@@ -55,16 +62,25 @@ module ImageDownloader
55
62
  end
56
63
 
57
64
  # :(parallel|consequentially)
65
+ # :(parallel|consequentially) => true
66
+ # :user_agent => 'Mozilla/5.0'
58
67
  def download(*args)
59
- if !args.first || args.first == :parallel
60
- Download.parallel(self.images)
61
- elsif args.first == :consequentially
62
- Download.consequentially(self.images)
68
+ user_agent = args_hash_and_contain(args, :user_agent) || DEFAULT_USER_AGENT
69
+ if !args.first || args.first == :parallel || args_hash_and_contain(args, :parallel)
70
+ Download.parallel(self.images, user_agent)
71
+ elsif args.first == :consequentially || args_hash_and_contain(args, :consequentially)
72
+ Download.consequentially(self.images, user_agent)
73
+ else
74
+ p "Not correct argument for download method"
63
75
  end
64
76
  end
65
77
 
66
78
  protected
67
79
 
80
+ def args_hash_and_contain(args, sym)
81
+ ((args.first.class.to_s == "Hash") && !args.first.empty? && (args.first[sym]))
82
+ end
83
+
68
84
  def rebuild_collect_hash(h={})
69
85
  if !h[:collect] || h[:collect].empty? || h[:collect][:all]
70
86
  h[:collect] = Parser.all_image_places
@@ -1,21 +1,21 @@
1
1
  module ImageDownloader
2
2
  class Download
3
3
 
4
- def self.parallel(images)
4
+ def self.parallel(images, user_agent)
5
5
  threads = []
6
6
  for image in images
7
7
  threads << Thread.new(image) {|local_image|
8
8
  p "upload from url #{local_image.absolute_src} to file #{local_image.file_name}" if $debug_option
9
- local_image.download
9
+ local_image.download(user_agent)
10
10
  }
11
11
  end
12
12
  threads.each { |aThread| aThread.join }
13
13
  end
14
14
 
15
- def self.consequentially(images)
15
+ def self.consequentially(images, user_agent)
16
16
  for image in images
17
17
  p "upload from url #{image.absolute_src} to file #{image.file_name}" if $debug_option
18
- image.download
18
+ image.download(user_agent)
19
19
  end
20
20
  end
21
21
 
@@ -22,7 +22,7 @@ module ImageDownloader
22
22
  @absolute_src = ((@src =~ /http/) ? @src : ('http://' + page_host + '/' + @src))
23
23
  end
24
24
 
25
- def download
25
+ def download(user_agent)
26
26
  url = URI.parse(self.absolute_src)
27
27
  request = Net::HTTP::Get.new(url.path)
28
28
  Net::HTTP.start(url.host) {|http|
@@ -32,15 +32,17 @@ module ImageDownloader
32
32
  # - mechanize (main web client), slow
33
33
  # - wget, quick, but cannot support some ability (403, 404 responses)
34
34
  # - sockets, independent request, quick, but low-level (many lines of code)
35
- self.download_by_segment(http,request)
36
- # self.download_simple(http,request)
35
+ self.download_by_segment(http,request,user_agent)
36
+ # self.download_simple(http,request,user_agent)
37
37
  }
38
+ rescue URI::InvalidURIError
39
+ p "Error: bad URI: #{self.absolute_src}" if $debug_option
38
40
  end
39
41
 
40
- def download_by_segment(http,request)
42
+ def download_by_segment(http,request,user_agent)
41
43
  file = open(self.file_path_name, "wb")
42
44
  begin
43
- http.request_get(request.path, "User-Agent"=> "Mozilla/5.0") do |response|
45
+ http.request_get(request.path, "User-Agent"=> user_agent) do |response|
44
46
  response.read_body do |segment|
45
47
  file.write(segment)
46
48
  end
@@ -50,8 +52,8 @@ module ImageDownloader
50
52
  end
51
53
  end
52
54
 
53
- def download_simple(http,request)
54
- response = http.get(request.path, "User-Agent"=> "Mozilla/5.0")
55
+ def download_simple(http,request,user_agent)
56
+ response = http.get(request.path, "User-Agent"=> user_agent)
55
57
  open(self.file_path_name, "wb") { |file|
56
58
  file.write(response.body)
57
59
  }
@@ -14,22 +14,23 @@ end
14
14
 
15
15
  module ImageDownloader
16
16
  class Parser
17
- attr_accessor :url, :argument_url, :content, :images, :images_hash
17
+ attr_accessor :url, :argument_url, :content, :images, :images_hash, :user_agent
18
18
 
19
19
  A_HREF_IMAGE_PREFIX = '_a_href_'
20
20
  STYLE_URL_IMAGE_PREFIX = '_style_url_'
21
21
  LINK_ICON_IMAGE_PREFIX = '_link_icon_'
22
22
  COLLECT_METHODS_PREFIX = 'collect_from_'
23
23
 
24
- def initialize(url)
24
+ def initialize(url, user_agent)
25
25
  @argument_url = url
26
+ @user_agent = user_agent
26
27
  @url = URI.parse(url)
27
28
  @images = []
28
29
  @images_hash = {}
29
30
  end
30
31
 
31
32
  def get_content_raw
32
- @content = open(self.argument_url).read
33
+ @content = open(self.argument_url, 'User-Agent' => self.user_agent).read
33
34
  @content.gsub!(/[\n\r\t]+/,' ')
34
35
  end
35
36
 
@@ -40,8 +41,12 @@ module ImageDownloader
40
41
  }
41
42
  end
42
43
 
44
+ def get_images_regexp(path,regexp)
45
+ self.content.scan(regexp) {|src| self.push_to_images(path,src.to_s)}
46
+ end
47
+
43
48
  def get_content
44
- @content = Nokogiri::HTML(open(self.argument_url))
49
+ @content = Nokogiri::HTML(open(self.argument_url, 'User-Agent' => self.user_agent))
45
50
  end
46
51
 
47
52
  def get_images(path,h={})
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: image_downloader
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 21
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 0
10
- version: 0.2.0
9
+ - 1
10
+ version: 0.2.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Malykh Oleg
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-07-15 00:00:00 +04:00
18
+ date: 2011-07-19 00:00:00 +04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -100,6 +100,7 @@ executables:
100
100
  - download_any_images
101
101
  - download_images
102
102
  - download_icon
103
+ - download_by_regexp
103
104
  extensions: []
104
105
 
105
106
  extra_rdoc_files:
@@ -117,6 +118,7 @@ files:
117
118
  - bin/download_any_images
118
119
  - bin/download_images
119
120
  - bin/download_icon
121
+ - bin/download_by_regexp
120
122
  has_rdoc: true
121
123
  homepage: http://github.com/Fotom/image_downloader
122
124
  licenses: