static_image_download 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/bin/grab.rb +53 -0
- data/lib/static_image_download.rb +80 -0
- data/lib/static_image_download/images.rb +184 -0
- data/lib/static_image_download/parser.rb +127 -0
- metadata +150 -0
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Wiseland
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
= static_image_download
|
2
|
+
|
3
|
+
Downloads all pictures referenced by a given URL into a local directory.
|
4
|
+
|
5
|
+
== Contributing to static_image_download
|
6
|
+
|
7
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
8
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
|
9
|
+
* Fork the project.
|
10
|
+
* Start a feature/bugfix branch.
|
11
|
+
* Commit and push until you are happy with your contribution.
|
12
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
13
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if changing them is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
|
14
|
+
|
15
|
+
== Copyright
|
16
|
+
|
17
|
+
Copyright (c) 2012 Wiseland. See LICENSE.txt for
|
18
|
+
further details.
|
19
|
+
|
data/bin/grab.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/local/bin/ruby

# Created by Alex Lapin
#
# Command-line entry point: parses a page for image links and downloads them.
#
# Usage: grab.rb [URL] [TARGET_DIR]
#   URL        - page to scan (defaults to "http://edition.cnn.com/")
#   TARGET_DIR - directory name, relative to the current working directory
#                (defaults to "<host of URL>_img")

require 'rubygems'
require 'open-uri'
require 'benchmark'
#require 'ruby-debug'

d_url = ARGV[0].nil? ? "http://edition.cnn.com/" : ARGV[0].to_s
p "download url= #{d_url}"
# \A anchors at the start of the whole string; "^" would also accept a scheme
# that only appears after an embedded newline.
unless d_url =~ /\A(http|https|ftp)\:\/\//i
  p "URL is invalid. It must start with 'http://', 'https://' or 'ftp://'"
  exit 1 # non-zero status so calling scripts can detect the failure
end
d_dir = ARGV[1].nil? ? Dir::pwd + "/" + "#{URI.parse(d_url).host}_img" : Dir::pwd + "/" + ARGV[1].to_s
p "download dir= #{d_dir}"


# To see more information set $debug_option to true
$debug_option = false

image_downloader = nil
Benchmark.bm do |x|
  begin
    # The library tries to load the optional curb, hpricot and nokogiri gems,
    # which can take some time. Please wait...
    x.report("Loading libraries:") do
      #require File.dirname(__FILE__) + '/static_image_download.rb'
      require 'static_image_download'
      include StaticImageDownloader
    end

    x.report("Initialize:") { image_downloader = Downloader.new(d_url, d_dir) }

    # You can use 'URI_EXTRACT' (default) or 'NOKOGIRI' or 'HPRICOT' options to parse the picture links
    # An example: image_downloader.parse_images(parse_option='URI_EXTRACT', parse_timeout=10, user_agent='Mozilla/5.0')
    # These params are used by default. So you can call image_downloader.parse_images without params
    x.report("Parse links:") { image_downloader.parse_images('URI_EXTRACT', 10, 'Mozilla/5.0') }

    # You can use 'CURB_EASY' (default - fastest) or 'HTTP_GET' options to get the pictures
    # An example: image_downloader.parallel_download(download_option='CURB_EASY', download_timeout=120, allow_dup_files=true)
    # Set allow_dup_files=true if you want file duplicates: file1, file2, ... etc, otherwise set allow_dup_files=false
    # These params are used by default. So you can call image_downloader.parallel_download without params
    x.report("Parallel download pictures:") { image_downloader.parallel_download('CURB_EASY', 120, true) }

    # Consequential download is slower than Parallel download
    # Uncomment code below if you want to use consequential download pictures
    #x.report("Consequential download pictures:") { image_downloader.consequential_download }
  rescue => error
    # Report what actually went wrong instead of hiding it behind a fixed message.
    p "Error downloading images! (#{error.class}: #{error.message})"
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#
|
2
|
+
#Created by Alex Lapin
|
3
|
+
# core libs
|
4
|
+
require 'timeout'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'net/http'
|
7
|
+
|
8
|
+
# extern libs
|
9
|
+
require File.dirname(__FILE__) + '/static_image_download/images.rb'
|
10
|
+
require File.dirname(__FILE__) + '/static_image_download/parser.rb'
|
11
|
+
begin
|
12
|
+
require 'curb'
|
13
|
+
rescue LoadError => e
|
14
|
+
p "No curb installed"
|
15
|
+
end
|
16
|
+
|
17
|
+
begin
|
18
|
+
require 'hpricot'
|
19
|
+
rescue LoadError => e
|
20
|
+
p "No hpricot installed"
|
21
|
+
end
|
22
|
+
|
23
|
+
begin
|
24
|
+
require 'nokogiri'
|
25
|
+
rescue LoadError => e
|
26
|
+
p "No nokogiri installed"
|
27
|
+
end
|
28
|
+
|
29
|
+
# Comment out the requires of optional libraries you don't need (see the begin/rescue blocks above) to reduce the total library loading time
|
30
|
+
# If you have them installed in your system, just leave those requires uncommented
|
31
|
+
|
32
|
+
module StaticImageDownloader

  # Facade that ties page parsing and image downloading together.
  #
  # Typical flow: build a Downloader for a page URL and a target path, call
  # #parse_images to fill #images, then call #parallel_download or
  # #consequential_download to fetch them.
  class Downloader

    attr_accessor :url, :path, :images

    @@DEFAULTPATH = 'images' # Default path for images

    class << self
      # Returns the path used when none (or nil) is given to the constructor.
      def default_path
        @@DEFAULTPATH
      end
    end

    # url  - address of the page to scan.
    # path - directory for the downloaded files; nil falls back to the default path.
    def initialize(url, path=@@DEFAULTPATH)
      @url = url
      @path = if path.nil?
                @@DEFAULTPATH
              else
                path
              end
      @images = []
    end

    # Fetches the page and collects its image links into #images.
    #
    # parse_option  - 'URI_EXTRACT' (default), 'NOKOGIRI' or 'HPRICOT'.
    # parse_timeout - value handed to the parser as its timeout (default 10).
    # user_agent    - User-Agent handed to the parser (default 'Mozilla/5.0').
    #
    # Returns the images collected by the parser.
    def parse_images(parse_option='URI_EXTRACT', parse_timeout=10, user_agent='Mozilla/5.0')
      page_parser = Parser.new(url, path, parse_option, parse_timeout, user_agent)
      page_parser.get_content_raw
      page_parser.parse_images
      self.images = page_parser.images
    end

    # Downloads every collected image, one thread per image, and waits for all
    # of them before printing the total.
    #
    # download_option  - 'CURB_EASY' (default) or 'HTTP_GET'.
    # download_timeout - timeout handed to each image download (default 120).
    # allow_dup_files  - passed to each download as :dup_file_names (default true).
    def parallel_download(download_option='CURB_EASY', download_timeout=120, allow_dup_files=true)
      workers = images.map do |img|
        Thread.new(img) do |image|
          image.download(download_option, download_timeout, :dup_file_names => allow_dup_files)
        end
      end
      workers.each(&:join)
      p "Total #{Images::get_successfull_pictures_number} pictures were got"
    end

    # Downloads every collected image one after another, then prints the total.
    # Takes the same arguments as #parallel_download.
    def consequential_download(download_option='CURB_EASY', download_timeout=120, allow_dup_files=true)
      images.each do |image|
        image.download(download_option, download_timeout, :dup_file_names => allow_dup_files)
      end
      p "Total #{Images::get_successfull_pictures_number} pictures were got"
    end
  end
end
|
@@ -0,0 +1,184 @@
|
|
1
|
+
#
|
2
|
+
#Created by Alex Lapin
|
3
|
+
|
4
|
+
require 'fileutils'
require 'thread'

module StaticImageDownloader
  # One remote image: derives a local file name from the source URL and saves
  # the image into the target directory with one of the DOWNLOAD_OPTIONS.
  class Images
    attr_accessor :src, :file_base_name, :file_path_name, :full_path_name, :page_host, :absolute_src

    MAX_FILE_NAME_LENGTH = 100
    IMAGE_EXT = ["jpg", "jpeg", "png", "gif", "ico", "svg", "bmp"]
    EMPTY_FILE_NAME = 'EMPTY_'

    # Maps the public option names to the private methods that do the work.
    # Unknown options fall back to :curb_simple.
    DOWNLOAD_OPTIONS = {
      'CURB_EASY' => :curb_simple,
      'HTTP_GET'  => :http_get
    }
    DOWNLOAD_OPTIONS.default = :curb_simple

    @@DEFAULTDONWLOADOPTION = 'CURB_EASY'
    @@HTTPONSUCCESS = /\A20\d\z/ # any 20x status code
    @@DEFAULTPATH = "./"
    @@DEFAULTTIMEOUT = 120
    @@SUCCESSFULLPICTURES = 0
    # Guards @@SUCCESSFULLPICTURES: #download may run in several threads at once.
    @@COUNTER_LOCK = Mutex.new

    # src             - URL of the image.
    # file_path_name  - target directory (trailing slashes are stripped);
    #                   nil falls back to the default path.
    # download_option - key of DOWNLOAD_OPTIONS; nil falls back to the default.
    # page_host       - stored only; reserved for future use.
    #
    # Creates the target directory, including missing parent directories.
    def initialize(src, file_path_name=@@DEFAULTPATH, download_option=@@DEFAULTDONWLOADOPTION, page_host="")
      @src = src
      @page_host = page_host # Reserved for future
      @download_option = download_option.nil? ? @@DEFAULTDONWLOADOPTION : download_option
      @file_path_name = file_path_name.nil? ? @@DEFAULTPATH : file_path_name.gsub(/\/+$/,'')

      # File name = everything after the last "/" of the source URL.
      file_base_name = @src.sub(/.*\//,'')
      file_base_name = EMPTY_FILE_NAME + rand(1000).to_s if !file_base_name || file_base_name.empty?
      if file_base_name.size > MAX_FILE_NAME_LENGTH
        # Keep the tail so the file extension survives the truncation.
        file_base_name = file_base_name[-MAX_FILE_NAME_LENGTH..-1]
      end

      @file_base_name = file_base_name
      @file_full_name = File.expand_path(File.join(@file_path_name, @file_base_name))

      @full_path_name = File.expand_path(File.join(@file_path_name))
      # mkdir_p also creates missing parents and is a no-op for an existing directory.
      FileUtils.mkdir_p(@full_path_name)
    end

    class << self
      def default_download_option
        @@DEFAULTDONWLOADOPTION
      end

      def default_path
        @@DEFAULTPATH
      end

      def default_timeout
        @@DEFAULTTIMEOUT
      end

      # Number of successfully saved pictures so far, as a String.
      def get_successfull_pictures_number
        @@SUCCESSFULLPICTURES.to_s
      end

      private

      # Increments the success counter under a lock so concurrent downloads
      # cannot lose updates.
      def inc_successfull_pictures_number
        @@COUNTER_LOCK.synchronize { @@SUCCESSFULLPICTURES += 1 }
      end
    end

    # Downloads the image, giving up after +timeout+ seconds.
    #
    # download_option - key of DOWNLOAD_OPTIONS.
    # timeout         - seconds passed to Timeout.timeout.
    # h               - options; :dup_file_names => true saves under a numbered
    #                   name when the file already exists, false refuses instead.
    #                   The hash is not modified.
    #
    # Returns the response hash of the download method, or nil when an error
    # (including the timeout) was raised; the error is printed.
    def download(download_option=@@DEFAULTDONWLOADOPTION, timeout=@@DEFAULTTIMEOUT, h={:dup_file_names => true})
      #p "download_option=#{download_option}"
      opts = h.merge(:start_time => Time.now) # copy: leave the caller's hash untouched
      Timeout::timeout(timeout) { method_to_value(download_option, opts) }
    rescue => error
      p "#{error}"
      nil
    end

    # Translates an option name into the method name registered for it.
    def option_to_method(option)
      DOWNLOAD_OPTIONS[option]
    end

    # Runs the download method selected by +option+ and counts a success when
    # the response carries a :path.
    #
    # Returns the response hash (an empty hash if the method returned nil),
    # or nil when the method raised; the error is printed.
    def method_to_value(option, h={})
      #p "option= #{option}"
      method = option_to_method(option)
      p "method= #{method}" if $debug_option
      begin
        response = send(method, h) || {}
        self.class.send(:inc_successfull_pictures_number) if response[:path]
        return response
      rescue => error
        p "method_to_value.error = #{error}"
        nil
      end
    end

    private

    # Prints an error line for non-20x codes; prints a timing line for
    # successful ones only when $debug_option is set.
    def print_download_log(rcode, file_full_name, h={})
      started = h[:start_time]
      # :start_time is only set by #download; don't fail without it.
      elapsed = started ? (Time.now - started).to_s : "?"
      if @@HTTPONSUCCESS !~ rcode
        p "Error: html_res_code=" + rcode + " " + elapsed + " sec. for #{File.basename(file_full_name)} File could not be saved!"
      else
        p "html_res_code=" + rcode + " " + elapsed + " sec. for #{File.basename(file_full_name)}" if $debug_option
      end
    end

    # Chooses the local file name for this image.
    #
    # Returns {:file_full_name => path} with a name that does not exist yet
    # ("name_2.ext", "name_3.ext", ... for duplicates), or
    # {:response => {:error => message}} when the file exists and
    # h[:dup_file_names] is falsy.
    #
    # NOTE(review): the name is not reserved, so two concurrent downloads of
    # equally named images could still pick the same file name.
    def check_file_name(src, h={})
      result = {}
      file_full_name = @file_full_name
      if File.exist?(file_full_name) && !h[:dup_file_names]
        response = { :error => "Error downloading. File #{file_full_name} already exists" }
        p response[:error]
        p " src= #{src}" if $debug_option
        result[:response] = response
      else
        fname_counter = 1
        while File.exist?(file_full_name)
          fname_counter += 1
          numbered = "#{File.basename(@file_full_name, '.*')}_#{fname_counter}#{File.extname(@file_full_name)}"
          file_full_name = File.join(File.dirname(@file_full_name), numbered)
        end
        result[:file_full_name] = file_full_name
        #File.new(file_full_name, "wb").close
      end
      result
    end

    # Shared bookkeeping of the download methods. Yields the source URL and the
    # chosen file name; the block performs the transfer and returns the
    # response code as a String.
    #
    # Returns a hash with :response_code, :path (nil unless the file exists
    # afterwards) and, on failure, :error. A partially written file is removed
    # when the block raises.
    def save_with(h)
      response = {}
      result = check_file_name(@src, h)
      response = result[:response] if result[:response]
      return response if response[:error]

      file_full_name = result[:file_full_name]
      rcode = nil
      rpath = nil
      begin
        rcode = yield(@src, file_full_name)
        print_download_log(rcode, file_full_name, h)
        rpath = file_full_name if File.exist?(file_full_name)
      rescue => error
        response[:error] = error.message
        File.delete(file_full_name) if File.exist?(file_full_name)
      end

      response[:response_code] = rcode
      response[:path] = rpath
      response
    end

    # Downloads with Curl::Easy.download and removes the file again when the
    # response code is not 20x.
    def curb_simple(h={})
      save_with(h) do |src, file_full_name|
        curl = Curl::Easy.download(src, file_full_name)
        rcode = curl.response_code.to_s
        #p "response_code=" + rcode if $debug_option
        File.delete(file_full_name) if @@HTTPONSUCCESS !~ rcode && File.exist?(file_full_name)
        rcode
      end
    end

    # Downloads with Net::HTTP and writes the body only for 20x responses.
    def http_get(h={})
      save_with(h) do |src, file_full_name|
        answer = Net::HTTP.get_response(URI.parse(src))
        rcode = answer.code
        if @@HTTPONSUCCESS =~ rcode
          # File.open: always a local file, never the open-uri flavoured Kernel#open.
          File.open(file_full_name, "wb") { |file| file.write(answer.body) }
        end
        #p "response_code=" + answer.code if $debug_option
        rcode
      end
    end
  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#
|
2
|
+
#Created by Alex Lapin
|
3
|
+
|
4
|
+
module StaticImageDownloader
  # Fetches a page and extracts the absolute image URLs it contains, using one
  # of the PARSER_OPTIONS, then wraps each URL in an Images object.
  class Parser
    attr_accessor :url, :content, :parse_option, :user_agent, :images, :extracted_links

    # Maps the public option names to the parsing methods.
    # Unknown options fall back to :img_parse_uri_extract.
    PARSER_OPTIONS = {
      'URI_EXTRACT' => :img_parse_uri_extract,
      'NOKOGIRI'    => :img_parse_nokogiri,
      'HPRICOT'     => :img_parse_hpricot
    }
    PARSER_OPTIONS.default = :img_parse_uri_extract

    @@DEFAULTPARSEOPTION = 'URI_EXTRACT' #also you can use one 'NOKOGIRI' or 'HPRICOT'
    @@DEFAULTUSERAGENT = 'Mozilla/5.0'
    @@DEFAULTPATH = "./"
    @@DEFAULTSITE = 'http://feed.informer.com'
    @@DEFAULTTIMEOUT = 15

    # Every argument may be nil, in which case the class default is used.
    #
    # url          - page to parse.
    # path         - directory handed to each created Images object.
    # parse_option - key of PARSER_OPTIONS.
    # timeout      - seconds allowed for #parse_images.
    # user_agent   - User-Agent header sent when fetching the page.
    # h            - unused.
    def initialize(url=@@DEFAULTSITE, path=@@DEFAULTPATH, parse_option=@@DEFAULTPARSEOPTION, timeout=@@DEFAULTTIMEOUT, user_agent=@@DEFAULTUSERAGENT, h={})
      @url = url.nil? ? @@DEFAULTSITE : url
      @user_agent = user_agent.nil? ? @@DEFAULTUSERAGENT : user_agent
      @path = path.nil? ? @@DEFAULTPATH : path
      @timeout = timeout.nil? ? @@DEFAULTTIMEOUT : timeout
      @parse_option = parse_option.nil? ? @@DEFAULTPARSEOPTION : parse_option
      @images = []
      @extracted_links = []
      # Absolute http/https/ftp URL (IPv4 or host name, optional credentials
      # and port) ending in one of the known image extensions.
      @rgxp_img_uri = /^(http|https|ftp)\:\/\/([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)?((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.[a-zA-Z]{2,4})(\:[0-9]+)?(\/[^\/][a-zA-Z0-9\.\,\?\'\\\/\+&%\$#\=~_\-@]*)\.(#{Images::IMAGE_EXT.join('|')})/i
      #@rgxp_img_uri = Regexp.new(/^(((http|https|ftp)\:\/\/)|www|(\/\/))([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)?((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.[a-zA-Z]{2,4})(\:[0-9]+)?(\/[^\/][a-zA-Z0-9\.\,\?\'\\\/\+&%\$#\=~_\-@]*)\.(#{Images::IMAGE_EXT.join('|')})/i)
      # Use the defaulted @url: the raw argument may be nil.
      @domain = URI.parse(@url).host
      @content = nil
    end

    class << self
      def default_parse_option
        @@DEFAULTPARSEOPTION
      end

      def default_user_agent
        @@DEFAULTUSERAGENT
      end

      def default_path
        @@DEFAULTPATH
      end

      def default_timeout
        @@DEFAULTTIMEOUT
      end
    end

    # Translates an option name into the method name registered for it.
    def option_to_method(option)
      PARSER_OPTIONS[option]
    end

    # Runs the parsing method selected by +option+.
    #
    # Returns its result ("" if it returned nil/false), or nil when it raised;
    # the error is printed.
    def method_to_value(option, h={})
      method = option_to_method(option)
      p "method= #{method}" if $debug_option
      begin
        response = send(method, h) || ""
        return response
      rescue => error
        p "method_to_value.error = #{error}"
        nil
      end
    end

    # Fetches the page into @content with every run of newlines, carriage
    # returns and tabs collapsed into one space. Returns the content.
    def get_content_raw
      io = get_url
      begin
        # Non-bang gsub: always yields the string, even when nothing is replaced.
        @content = io.read.gsub(/[\n\r\t]+/,' ')
      ensure
        io.close if io.respond_to?(:close) # don't leak the handle
      end
      #p @content if $debug_option
      @content
    end

    # Opens the page URL with the configured User-Agent and returns the handle.
    def get_url
      if URI.respond_to?(:open)
        # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the
        # supported open-uri entry point where it exists.
        URI.open(self.url, 'User-Agent' => self.user_agent)
      else
        open(self.url, 'User-Agent' => self.user_agent)
      end
    end

    # Collects links from the <img> elements found by Nokogiri.
    def img_parse_nokogiri(h={})
      doc = Nokogiri::HTML(@content)
      get_extracted_links(doc.search("//img"))
    end

    # Collects links from the <img> elements found by Hpricot.
    def img_parse_hpricot(h={})
      doc = Hpricot(@content)
      get_extracted_links(doc.search("//img"))
    end

    # Collects links from every URI that URI.extract finds in the raw content.
    def img_parse_uri_extract(h={})
      get_extracted_links(URI.extract(@content).select { |l| l[@rgxp_img_uri] })
    end

    # Appends the image URL found in each link to @extracted_links, skipping
    # non-matching links and URLs that are already present.
    #
    # links - Strings, or objects answering [:src] (e.g. parsed <img> nodes).
    #
    # Returns false when links is nil/false, otherwise links itself.
    def get_extracted_links(links)
      return false unless links
      links.each do |link|
        p "link= #{link}" if $debug_option
        link = link[:src].to_s unless link.is_a?(String)
        image_url = link[@rgxp_img_uri] # matched part, or nil
        @extracted_links << image_url if image_url && !@extracted_links.include?(image_url)
      end
      #p "extracted_links= #{@extracted_links}" if $debug_option
    end

    # Extracts the links with the configured parse option and builds the
    # Images objects, all within @timeout seconds.
    #
    # Returns the result of #collect_images, or nil when an error (including
    # the timeout) was raised; the error is printed.
    def parse_images(h={})
      Timeout::timeout(@timeout) do
        method_to_value(self.parse_option, h)
        collect_images
      end
    rescue => error
      p "#{error}"
      nil
    end

    # Creates an Images object for every extracted link.
    def collect_images
      @extracted_links.each do |link|
        self.push_image(link)
      end
    end

    # Appends an Images object for +src+ to #images.
    def push_image(src)
      self.images.push Images.new(src, @path, Images.default_download_option)
    end
  end
end
|
127
|
+
|
metadata
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: static_image_download
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alex Lapin
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2012-07-05 00:00:00 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.8.0
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *id001
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: rdoc
|
28
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "3.12"
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *id002
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: bundler
|
39
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 1.0.0
|
45
|
+
type: :development
|
46
|
+
prerelease: false
|
47
|
+
version_requirements: *id003
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: jeweler
|
50
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ~>
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 1.8.4
|
56
|
+
type: :development
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: *id004
|
59
|
+
- !ruby/object:Gem::Dependency
|
60
|
+
name: simplecov
|
61
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ~>
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: "0.5"
|
67
|
+
type: :development
|
68
|
+
prerelease: false
|
69
|
+
version_requirements: *id005
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: nokogiri
|
72
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.5.3
|
78
|
+
type: :development
|
79
|
+
prerelease: false
|
80
|
+
version_requirements: *id006
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: curb
|
83
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ~>
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: 0.8.0
|
89
|
+
type: :development
|
90
|
+
prerelease: false
|
91
|
+
version_requirements: *id007
|
92
|
+
- !ruby/object:Gem::Dependency
|
93
|
+
name: hpricot
|
94
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
95
|
+
none: false
|
96
|
+
requirements:
|
97
|
+
- - ~>
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.8.6
|
100
|
+
type: :development
|
101
|
+
prerelease: false
|
102
|
+
version_requirements: *id008
|
103
|
+
description: Get all pictures from URL gem
|
104
|
+
email: wiseland@tut.by
|
105
|
+
executables:
|
106
|
+
- grab.rb
|
107
|
+
extensions: []
|
108
|
+
|
109
|
+
extra_rdoc_files:
|
110
|
+
- LICENSE.txt
|
111
|
+
- README.rdoc
|
112
|
+
files:
|
113
|
+
- lib/static_image_download.rb
|
114
|
+
- lib/static_image_download/images.rb
|
115
|
+
- lib/static_image_download/parser.rb
|
116
|
+
- LICENSE.txt
|
117
|
+
- README.rdoc
|
118
|
+
- bin/grab.rb
|
119
|
+
homepage: http://github.com/wiseland/static_image_download
|
120
|
+
licenses:
|
121
|
+
- MIT
|
122
|
+
post_install_message:
|
123
|
+
rdoc_options: []
|
124
|
+
|
125
|
+
require_paths:
|
126
|
+
- lib
|
127
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
128
|
+
none: false
|
129
|
+
requirements:
|
130
|
+
- - ">="
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
hash: 1791629116597004747
|
133
|
+
segments:
|
134
|
+
- 0
|
135
|
+
version: "0"
|
136
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ">="
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: "0"
|
142
|
+
requirements: []
|
143
|
+
|
144
|
+
rubyforge_project:
|
145
|
+
rubygems_version: 1.8.24
|
146
|
+
signing_key:
|
147
|
+
specification_version: 3
|
148
|
+
summary: Get all pictures from URL
|
149
|
+
test_files: []
|
150
|
+
|