image_downloader 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +48 -11
- data/bin/download_by_regexp +9 -0
- data/lib/image_downloader.rb +21 -5
- data/lib/image_downloader/download.rb +4 -4
- data/lib/image_downloader/images.rb +9 -7
- data/lib/image_downloader/parser.rb +9 -4
- metadata +6 -4
data/README.rdoc
CHANGED
@@ -40,21 +40,46 @@ After installation, you can use the following code as an example:
|
|
40
40
|
# download images from exact places in the page
|
41
41
|
downloader.parse(:collect => {:link_icon => true})
|
42
42
|
|
43
|
+
#####
|
44
|
+
# download images by regexp
|
45
|
+
downloader.parse(:regexp => /[^'"]+\.jpg/i)
|
46
|
+
|
43
47
|
downloader.download()
|
44
48
|
|
45
|
-
For parse method available following options
|
46
|
-
|
47
|
-
|
49
|
+
The following options are available for the "parse" method
|
50
|
+
|
51
|
+
# find all URLs which contain an image extension
|
52
|
+
:any_looks_like_image => true
|
53
|
+
|
54
|
+
# find images in specified location
|
55
|
+
:collect => {
|
56
|
+
:all => true, # all image places
|
57
|
+
:(img_src|a_href|style_url|link_icon) => true # specified location
|
58
|
+
}
|
59
|
+
|
60
|
+
# find by regexp
|
61
|
+
:regexp => /['"]([^'"]+\.jpg)[^'"]*['"]/i
|
62
|
+
:regexp => /[^'"]+\.jpg/i # the same, but shorter
|
63
|
+
:regexp => /[^'"]+\.css/ # other files can also be downloaded
|
64
|
+
|
65
|
+
# ignore URLs with images according to given parameters
|
66
|
+
:ignore_without => {:(extension|image_extension) => true}
|
67
|
+
|
68
|
+
# set your preferred User-Agent (very important for avoiding 403, 404... responses from the server)
|
69
|
+
:user_agent => "ruby" # Mozilla/5.0 by default
|
48
70
|
|
49
|
-
|
50
|
-
(find images in):
|
51
|
-
* img_src - <img src="url">
|
52
|
-
* a_href - <a href="url">
|
53
|
-
* style_url - <element style="(background|background-image): url('url')">
|
54
|
-
* link_icon - <link rel="shortcut icon" href="url">
|
71
|
+
Detailed location description
|
55
72
|
|
56
|
-
|
57
|
-
|
73
|
+
* img_src - tag: img, attribute: src="url"
|
74
|
+
* a_href - tag: a, attribute: href="url"
|
75
|
+
* style_url - tag: any, attribute: style="(background|background-image): url('url')"
|
76
|
+
* link_icon - tag: link, attribute: rel="shortcut icon" href="url"
|
77
|
+
|
78
|
+
For the "download" method you can use the following directives
|
79
|
+
|
80
|
+
:parallel => true # for multi thread downloading (this is default if no options)
|
81
|
+
:consequentially => true, # for sequential downloading into a single stream
|
82
|
+
:user_agent => "ruby" # Mozilla/5.0 by default
|
58
83
|
|
59
84
|
== Executables
|
60
85
|
You can simply use the executable shell commands:
|
@@ -68,6 +93,18 @@ For download favicon only
|
|
68
93
|
To download everything that is located in the places for pictures
|
69
94
|
download_images url dir/
|
70
95
|
|
96
|
+
To download by regexp
|
97
|
+
download_by_regexp url dir/ "[^'\"]+\\.js"
|
98
|
+
|
99
|
+
== Debugging
|
100
|
+
|
101
|
+
"-d", "--debug"
|
102
|
+
|
103
|
+
To monitor the process of downloading, use the -d flag in the parameters.
|
104
|
+
In some cases a URI::InvalidURIError may occur.
|
105
|
+
|
106
|
+
download_images url dir/ -d
|
107
|
+
|
71
108
|
== Copyright
|
72
109
|
|
73
110
|
Copyright (c) 2011 Malykh Oleg. See LICENSE.txt for
|
data/lib/image_downloader.rb
CHANGED
@@ -26,6 +26,8 @@ module ImageDownloader
|
|
26
26
|
class Process
|
27
27
|
attr_accessor :argument, :images
|
28
28
|
|
29
|
+
DEFAULT_USER_AGENT = 'Mozilla/5.0'
|
30
|
+
|
29
31
|
def initialize(url, path)
|
30
32
|
@argument = Arguments.new(url, path)
|
31
33
|
@argument.check
|
@@ -34,16 +36,21 @@ module ImageDownloader
|
|
34
36
|
end
|
35
37
|
|
36
38
|
# :any_looks_like_image => true
|
39
|
+
# :regexp => /[^'"]+\.jpg/i
|
37
40
|
# :ignore_without => {:(extension|image_extension) => true}
|
38
41
|
# Nokogiri gem is required:
|
39
42
|
# :collect => {:all => true, :(img_src|a_href|style_url|link_icon) => true}
|
43
|
+
# :user_agent => 'Mozilla/5.0'
|
40
44
|
def parse(h={:collect => {}, :ignore_without => {}})
|
41
45
|
self.rebuild_collect_hash(h)
|
42
46
|
|
43
|
-
parser = Parser.new(self.argument.url)
|
47
|
+
parser = Parser.new(self.argument.url, h[:user_agent] || DEFAULT_USER_AGENT)
|
44
48
|
if h[:any_looks_like_image]
|
45
49
|
parser.get_content_raw
|
46
50
|
parser.get_images_raw(self.argument.path, h[:collect])
|
51
|
+
elsif h[:regexp]
|
52
|
+
parser.get_content_raw
|
53
|
+
parser.get_images_regexp(self.argument.path, h[:regexp])
|
47
54
|
else
|
48
55
|
parser.get_content
|
49
56
|
parser.get_images(self.argument.path, h[:collect])
|
@@ -55,16 +62,25 @@ module ImageDownloader
|
|
55
62
|
end
|
56
63
|
|
57
64
|
# :(parallel|consequentially)
|
65
|
+
# :(parallel|consequentially) => true
|
66
|
+
# :user_agent => 'Mozilla/5.0'
|
58
67
|
def download(*args)
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
68
|
+
user_agent = args_hash_and_contain(args, :user_agent) || DEFAULT_USER_AGENT
|
69
|
+
if !args.first || args.first == :parallel || args_hash_and_contain(args, :parallel)
|
70
|
+
Download.parallel(self.images, user_agent)
|
71
|
+
elsif args.first == :consequentially || args_hash_and_contain(args, :consequentially)
|
72
|
+
Download.consequentially(self.images, user_agent)
|
73
|
+
else
|
74
|
+
p "Not correct argument for download method"
|
63
75
|
end
|
64
76
|
end
|
65
77
|
|
66
78
|
protected
|
67
79
|
|
80
|
+
def args_hash_and_contain(args, sym)
|
81
|
+
((args.first.class.to_s == "Hash") && !args.first.empty? && (args.first[sym]))
|
82
|
+
end
|
83
|
+
|
68
84
|
def rebuild_collect_hash(h={})
|
69
85
|
if !h[:collect] || h[:collect].empty? || h[:collect][:all]
|
70
86
|
h[:collect] = Parser.all_image_places
|
@@ -1,21 +1,21 @@
|
|
1
1
|
module ImageDownloader
|
2
2
|
class Download
|
3
3
|
|
4
|
-
def self.parallel(images)
|
4
|
+
def self.parallel(images, user_agent)
|
5
5
|
threads = []
|
6
6
|
for image in images
|
7
7
|
threads << Thread.new(image) {|local_image|
|
8
8
|
p "upload from url #{local_image.absolute_src} to file #{local_image.file_name}" if $debug_option
|
9
|
-
local_image.download
|
9
|
+
local_image.download(user_agent)
|
10
10
|
}
|
11
11
|
end
|
12
12
|
threads.each { |aThread| aThread.join }
|
13
13
|
end
|
14
14
|
|
15
|
-
def self.consequentially(images)
|
15
|
+
def self.consequentially(images, user_agent)
|
16
16
|
for image in images
|
17
17
|
p "upload from url #{image.absolute_src} to file #{image.file_name}" if $debug_option
|
18
|
-
image.download
|
18
|
+
image.download(user_agent)
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -22,7 +22,7 @@ module ImageDownloader
|
|
22
22
|
@absolute_src = ((@src =~ /http/) ? @src : ('http://' + page_host + '/' + @src))
|
23
23
|
end
|
24
24
|
|
25
|
-
def download
|
25
|
+
def download(user_agent)
|
26
26
|
url = URI.parse(self.absolute_src)
|
27
27
|
request = Net::HTTP::Get.new(url.path)
|
28
28
|
Net::HTTP.start(url.host) {|http|
|
@@ -32,15 +32,17 @@ module ImageDownloader
|
|
32
32
|
# - mechanize (main web client), slow
|
33
33
|
# - wget, quick, but cannot support some ability (403, 404 responses)
|
34
34
|
# - sockets, independent request, quick, but low-level (many lines of code)
|
35
|
-
self.download_by_segment(http,request)
|
36
|
-
# self.download_simple(http,request)
|
35
|
+
self.download_by_segment(http,request,user_agent)
|
36
|
+
# self.download_simple(http,request,user_agent)
|
37
37
|
}
|
38
|
+
rescue URI::InvalidURIError
|
39
|
+
p "Error: bad URI: #{self.absolute_src}" if $debug_option
|
38
40
|
end
|
39
41
|
|
40
|
-
def download_by_segment(http,request)
|
42
|
+
def download_by_segment(http,request,user_agent)
|
41
43
|
file = open(self.file_path_name, "wb")
|
42
44
|
begin
|
43
|
-
http.request_get(request.path, "User-Agent"=>
|
45
|
+
http.request_get(request.path, "User-Agent"=> user_agent) do |response|
|
44
46
|
response.read_body do |segment|
|
45
47
|
file.write(segment)
|
46
48
|
end
|
@@ -50,8 +52,8 @@ module ImageDownloader
|
|
50
52
|
end
|
51
53
|
end
|
52
54
|
|
53
|
-
def download_simple(http,request)
|
54
|
-
response = http.get(request.path, "User-Agent"=>
|
55
|
+
def download_simple(http,request,user_agent)
|
56
|
+
response = http.get(request.path, "User-Agent"=> user_agent)
|
55
57
|
open(self.file_path_name, "wb") { |file|
|
56
58
|
file.write(response.body)
|
57
59
|
}
|
@@ -14,22 +14,23 @@ end
|
|
14
14
|
|
15
15
|
module ImageDownloader
|
16
16
|
class Parser
|
17
|
-
attr_accessor :url, :argument_url, :content, :images, :images_hash
|
17
|
+
attr_accessor :url, :argument_url, :content, :images, :images_hash, :user_agent
|
18
18
|
|
19
19
|
A_HREF_IMAGE_PREFIX = '_a_href_'
|
20
20
|
STYLE_URL_IMAGE_PREFIX = '_style_url_'
|
21
21
|
LINK_ICON_IMAGE_PREFIX = '_link_icon_'
|
22
22
|
COLLECT_METHODS_PREFIX = 'collect_from_'
|
23
23
|
|
24
|
-
def initialize(url)
|
24
|
+
def initialize(url, user_agent)
|
25
25
|
@argument_url = url
|
26
|
+
@user_agent = user_agent
|
26
27
|
@url = URI.parse(url)
|
27
28
|
@images = []
|
28
29
|
@images_hash = {}
|
29
30
|
end
|
30
31
|
|
31
32
|
def get_content_raw
|
32
|
-
@content = open(self.argument_url).read
|
33
|
+
@content = open(self.argument_url, 'User-Agent' => self.user_agent).read
|
33
34
|
@content.gsub!(/[\n\r\t]+/,' ')
|
34
35
|
end
|
35
36
|
|
@@ -40,8 +41,12 @@ module ImageDownloader
|
|
40
41
|
}
|
41
42
|
end
|
42
43
|
|
44
|
+
def get_images_regexp(path,regexp)
|
45
|
+
self.content.scan(regexp) {|src| self.push_to_images(path,src.to_s)}
|
46
|
+
end
|
47
|
+
|
43
48
|
def get_content
|
44
|
-
@content = Nokogiri::HTML(open(self.argument_url))
|
49
|
+
@content = Nokogiri::HTML(open(self.argument_url, 'User-Agent' => self.user_agent))
|
45
50
|
end
|
46
51
|
|
47
52
|
def get_images(path,h={})
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: image_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 1
|
10
|
+
version: 0.2.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Malykh Oleg
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-07-
|
18
|
+
date: 2011-07-19 00:00:00 +04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -100,6 +100,7 @@ executables:
|
|
100
100
|
- download_any_images
|
101
101
|
- download_images
|
102
102
|
- download_icon
|
103
|
+
- download_by_regexp
|
103
104
|
extensions: []
|
104
105
|
|
105
106
|
extra_rdoc_files:
|
@@ -117,6 +118,7 @@ files:
|
|
117
118
|
- bin/download_any_images
|
118
119
|
- bin/download_images
|
119
120
|
- bin/download_icon
|
121
|
+
- bin/download_by_regexp
|
120
122
|
has_rdoc: true
|
121
123
|
homepage: http://github.com/Fotom/image_downloader
|
122
124
|
licenses:
|