image_downloader 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +48 -11
- data/bin/download_by_regexp +9 -0
- data/lib/image_downloader.rb +21 -5
- data/lib/image_downloader/download.rb +4 -4
- data/lib/image_downloader/images.rb +9 -7
- data/lib/image_downloader/parser.rb +9 -4
- metadata +6 -4
data/README.rdoc
CHANGED
@@ -40,21 +40,46 @@ After installation, you can use the following code as an example:
|
|
40
40
|
# download images from exact places in the page
|
41
41
|
downloader.parse(:collect => {:link_icon => true})
|
42
42
|
|
43
|
+
#####
|
44
|
+
# download images by regexp
|
45
|
+
downloader.parse(:regexp => /[^'"]+\.jpg/i)
|
46
|
+
|
43
47
|
downloader.download()
|
44
48
|
|
45
|
-
For parse method available following options
|
46
|
-
|
47
|
-
|
49
|
+
The following options are available for the "parse" method
|
50
|
+
|
51
|
+
# find all URLs that contain an image extension
|
52
|
+
:any_looks_like_image => true
|
53
|
+
|
54
|
+
# find images in specified location
|
55
|
+
:collect => {
|
56
|
+
:all => true, # all image places
|
57
|
+
:(img_src|a_href|style_url|link_icon) => true # specified location
|
58
|
+
}
|
59
|
+
|
60
|
+
# find by regexp
|
61
|
+
:regexp => /['"]([^'"]+\.jpg)[^'"]*['"]/i
|
62
|
+
:regexp => /[^'"]+\.jpg/i # the same, but shorter
|
63
|
+
:regexp => /[^'"]+\.css/ # other files can also be downloaded
|
64
|
+
|
65
|
+
# ignore URLs with images according to given parameters
|
66
|
+
:ignore_without => {:(extension|image_extension) => true}
|
67
|
+
|
68
|
+
# set a preferred User-Agent (very important for avoiding 403, 404... responses from the server)
|
69
|
+
:user_agent => "ruby" # Mozilla/5.0 by default
|
48
70
|
|
49
|
-
|
50
|
-
(find images in):
|
51
|
-
* img_src - <img src="url">
|
52
|
-
* a_href - <a href="url">
|
53
|
-
* style_url - <element style="(background|background-image): url('url')">
|
54
|
-
* link_icon - <link rel="shortcut icon" href="url">
|
71
|
+
Detailed location description
|
55
72
|
|
56
|
-
|
57
|
-
|
73
|
+
* img_src - tag: img, attribute: src="url"
|
74
|
+
* a_href - tag: a, attribute: href="url"
|
75
|
+
* style_url - tag: any, attribute: style="(background|background-image): url('url')"
|
76
|
+
* link_icon - tag: link, attribute: rel="shortcut icon" href="url"
|
77
|
+
|
78
|
+
For the "download" method you can use the following directives
|
79
|
+
|
80
|
+
:parallel => true # for multi-threaded downloading (this is the default if no options are given)
|
81
|
+
:consequentially => true, # for sequential downloading into a single stream
|
82
|
+
:user_agent => "ruby" # Mozilla/5.0 by default
|
58
83
|
|
59
84
|
== Executables
|
60
85
|
You can simply use the executable shell commands:
|
@@ -68,6 +93,18 @@ For download favicon only
|
|
68
93
|
To download everything that is located in the places for pictures
|
69
94
|
download_images url dir/
|
70
95
|
|
96
|
+
To download by regexp
|
97
|
+
download_by_regexp url dir/ "[^'\"]+\\.js"
|
98
|
+
|
99
|
+
== Debugging
|
100
|
+
|
101
|
+
"-d", "--debug"
|
102
|
+
|
103
|
+
To monitor the process of downloading, use the -d flag in the parameters.
|
104
|
+
In some cases a URI::InvalidURIError may occur.
|
105
|
+
|
106
|
+
download_images url dir/ -d
|
107
|
+
|
71
108
|
== Copyright
|
72
109
|
|
73
110
|
Copyright (c) 2011 Malykh Oleg. See LICENSE.txt for
|
data/lib/image_downloader.rb
CHANGED
@@ -26,6 +26,8 @@ module ImageDownloader
|
|
26
26
|
class Process
|
27
27
|
attr_accessor :argument, :images
|
28
28
|
|
29
|
+
DEFAULT_USER_AGENT = 'Mozilla/5.0'
|
30
|
+
|
29
31
|
def initialize(url, path)
|
30
32
|
@argument = Arguments.new(url, path)
|
31
33
|
@argument.check
|
@@ -34,16 +36,21 @@ module ImageDownloader
|
|
34
36
|
end
|
35
37
|
|
36
38
|
# :any_looks_like_image => true
|
39
|
+
# :regexp => /[^'"]+\.jpg/i
|
37
40
|
# :ignore_without => {:(extension|image_extension) => true}
|
38
41
|
# Nokogiri gem is required:
|
39
42
|
# :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
|
43
|
+
# :user_agent => 'Mozilla/5.0'
|
40
44
|
def parse(h={:collect => {}, :ignore_without => {}})
|
41
45
|
self.rebuild_collect_hash(h)
|
42
46
|
|
43
|
-
parser = Parser.new(self.argument.url)
|
47
|
+
parser = Parser.new(self.argument.url, h[:user_agent] || DEFAULT_USER_AGENT)
|
44
48
|
if h[:any_looks_like_image]
|
45
49
|
parser.get_content_raw
|
46
50
|
parser.get_images_raw(self.argument.path, h[:collect])
|
51
|
+
elsif h[:regexp]
|
52
|
+
parser.get_content_raw
|
53
|
+
parser.get_images_regexp(self.argument.path, h[:regexp])
|
47
54
|
else
|
48
55
|
parser.get_content
|
49
56
|
parser.get_images(self.argument.path, h[:collect])
|
@@ -55,16 +62,25 @@ module ImageDownloader
|
|
55
62
|
end
|
56
63
|
|
57
64
|
# :(parallel|consequentially)
|
65
|
+
# :(parallel|consequentially) => true
|
66
|
+
# :user_agent => 'Mozilla/5.0'
|
58
67
|
def download(*args)
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
68
|
+
user_agent = args_hash_and_contain(args, :user_agent) || DEFAULT_USER_AGENT
|
69
|
+
if !args.first || args.first == :parallel || args_hash_and_contain(args, :parallel)
|
70
|
+
Download.parallel(self.images, user_agent)
|
71
|
+
elsif args.first == :consequentially || args_hash_and_contain(args, :consequentially)
|
72
|
+
Download.consequentially(self.images, user_agent)
|
73
|
+
else
|
74
|
+
p "Not correct argument for download method"
|
63
75
|
end
|
64
76
|
end
|
65
77
|
|
66
78
|
protected
|
67
79
|
|
80
|
+
def args_hash_and_contain(args, sym)
|
81
|
+
((args.first.class.to_s == "Hash") && !args.first.empty? && (args.first[sym]))
|
82
|
+
end
|
83
|
+
|
68
84
|
def rebuild_collect_hash(h={})
|
69
85
|
if !h[:collect] || h[:collect].empty? || h[:collect][:all]
|
70
86
|
h[:collect] = Parser.all_image_places
|
@@ -1,21 +1,21 @@
|
|
1
1
|
module ImageDownloader
|
2
2
|
class Download
|
3
3
|
|
4
|
-
def self.parallel(images)
|
4
|
+
def self.parallel(images, user_agent)
|
5
5
|
threads = []
|
6
6
|
for image in images
|
7
7
|
threads << Thread.new(image) {|local_image|
|
8
8
|
p "upload from url #{local_image.absolute_src} to file #{local_image.file_name}" if $debug_option
|
9
|
-
local_image.download
|
9
|
+
local_image.download(user_agent)
|
10
10
|
}
|
11
11
|
end
|
12
12
|
threads.each { |aThread| aThread.join }
|
13
13
|
end
|
14
14
|
|
15
|
-
def self.consequentially(images)
|
15
|
+
def self.consequentially(images, user_agent)
|
16
16
|
for image in images
|
17
17
|
p "upload from url #{image.absolute_src} to file #{image.file_name}" if $debug_option
|
18
|
-
image.download
|
18
|
+
image.download(user_agent)
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -22,7 +22,7 @@ module ImageDownloader
|
|
22
22
|
@absolute_src = ((@src =~ /http/) ? @src : ('http://' + page_host + '/' + @src))
|
23
23
|
end
|
24
24
|
|
25
|
-
def download
|
25
|
+
def download(user_agent)
|
26
26
|
url = URI.parse(self.absolute_src)
|
27
27
|
request = Net::HTTP::Get.new(url.path)
|
28
28
|
Net::HTTP.start(url.host) {|http|
|
@@ -32,15 +32,17 @@ module ImageDownloader
|
|
32
32
|
# - mechanize (main web client), slow
|
33
33
|
# - wget, quick, but cannot support some ability (403, 404 responses)
|
34
34
|
# - sockets, independent request, quick, but low-level (many lines of code)
|
35
|
-
self.download_by_segment(http,request)
|
36
|
-
# self.download_simple(http,request)
|
35
|
+
self.download_by_segment(http,request,user_agent)
|
36
|
+
# self.download_simple(http,request,user_agent)
|
37
37
|
}
|
38
|
+
rescue URI::InvalidURIError
|
39
|
+
p "Error: bad URI: #{self.absolute_src}" if $debug_option
|
38
40
|
end
|
39
41
|
|
40
|
-
def download_by_segment(http,request)
|
42
|
+
def download_by_segment(http,request,user_agent)
|
41
43
|
file = open(self.file_path_name, "wb")
|
42
44
|
begin
|
43
|
-
http.request_get(request.path, "User-Agent"=>
|
45
|
+
http.request_get(request.path, "User-Agent"=> user_agent) do |response|
|
44
46
|
response.read_body do |segment|
|
45
47
|
file.write(segment)
|
46
48
|
end
|
@@ -50,8 +52,8 @@ module ImageDownloader
|
|
50
52
|
end
|
51
53
|
end
|
52
54
|
|
53
|
-
def download_simple(http,request)
|
54
|
-
response = http.get(request.path, "User-Agent"=>
|
55
|
+
def download_simple(http,request,user_agent)
|
56
|
+
response = http.get(request.path, "User-Agent"=> user_agent)
|
55
57
|
open(self.file_path_name, "wb") { |file|
|
56
58
|
file.write(response.body)
|
57
59
|
}
|
@@ -14,22 +14,23 @@ end
|
|
14
14
|
|
15
15
|
module ImageDownloader
|
16
16
|
class Parser
|
17
|
-
attr_accessor :url, :argument_url, :content, :images, :images_hash
|
17
|
+
attr_accessor :url, :argument_url, :content, :images, :images_hash, :user_agent
|
18
18
|
|
19
19
|
A_HREF_IMAGE_PREFIX = '_a_href_'
|
20
20
|
STYLE_URL_IMAGE_PREFIX = '_style_url_'
|
21
21
|
LINK_ICON_IMAGE_PREFIX = '_link_icon_'
|
22
22
|
COLLECT_METHODS_PREFIX = 'collect_from_'
|
23
23
|
|
24
|
-
def initialize(url)
|
24
|
+
def initialize(url, user_agent)
|
25
25
|
@argument_url = url
|
26
|
+
@user_agent = user_agent
|
26
27
|
@url = URI.parse(url)
|
27
28
|
@images = []
|
28
29
|
@images_hash = {}
|
29
30
|
end
|
30
31
|
|
31
32
|
def get_content_raw
|
32
|
-
@content = open(self.argument_url).read
|
33
|
+
@content = open(self.argument_url, 'User-Agent' => self.user_agent).read
|
33
34
|
@content.gsub!(/[\n\r\t]+/,' ')
|
34
35
|
end
|
35
36
|
|
@@ -40,8 +41,12 @@ module ImageDownloader
|
|
40
41
|
}
|
41
42
|
end
|
42
43
|
|
44
|
+
def get_images_regexp(path,regexp)
|
45
|
+
self.content.scan(regexp) {|src| self.push_to_images(path,src.to_s)}
|
46
|
+
end
|
47
|
+
|
43
48
|
def get_content
|
44
|
-
@content = Nokogiri::HTML(open(self.argument_url))
|
49
|
+
@content = Nokogiri::HTML(open(self.argument_url, 'User-Agent' => self.user_agent))
|
45
50
|
end
|
46
51
|
|
47
52
|
def get_images(path,h={})
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: image_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 1
|
10
|
+
version: 0.2.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Malykh Oleg
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-07-
|
18
|
+
date: 2011-07-19 00:00:00 +04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -100,6 +100,7 @@ executables:
|
|
100
100
|
- download_any_images
|
101
101
|
- download_images
|
102
102
|
- download_icon
|
103
|
+
- download_by_regexp
|
103
104
|
extensions: []
|
104
105
|
|
105
106
|
extra_rdoc_files:
|
@@ -117,6 +118,7 @@ files:
|
|
117
118
|
- bin/download_any_images
|
118
119
|
- bin/download_images
|
119
120
|
- bin/download_icon
|
121
|
+
- bin/download_by_regexp
|
120
122
|
has_rdoc: true
|
121
123
|
homepage: http://github.com/Fotom/image_downloader
|
122
124
|
licenses:
|