get_them_all 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +19 -0
- data/README.md +77 -0
- data/bin/gta +54 -0
- data/get_them_all.gemspec +28 -0
- data/lib/get_them_all.rb +47 -0
- data/lib/get_them_all/action.rb +55 -0
- data/lib/get_them_all/actions/download_action.rb +80 -0
- data/lib/get_them_all/actions/examine_action.rb +47 -0
- data/lib/get_them_all/extension.rb +17 -0
- data/lib/get_them_all/extensions/action_logger.rb +58 -0
- data/lib/get_them_all/extensions/gauge_display.rb +155 -0
- data/lib/get_them_all/extensions/graph_builder.rb +138 -0
- data/lib/get_them_all/history.rb +23 -0
- data/lib/get_them_all/javascript_loader.rb +74 -0
- data/lib/get_them_all/logger.rb +35 -0
- data/lib/get_them_all/notifier.rb +7 -0
- data/lib/get_them_all/site_downloader.rb +332 -0
- data/lib/get_them_all/storage.rb +27 -0
- data/lib/get_them_all/storage/dropbox_storage.rb +114 -0
- data/lib/get_them_all/storage/file_storage.rb +40 -0
- data/lib/get_them_all/version.rb +3 -0
- data/lib/get_them_all/worker.rb +48 -0
- metadata +172 -0
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2009-2011 Julien Ammous
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# What is it ?
|
2
|
+
|
3
|
+
Get Them All is my personal try at building a versatile and powerful web downloader, its goal is pretty simple:
|
4
|
+
download all the targets and keep up to date with new content by remembering what was downloaded.
|
5
|
+
|
6
|
+
It should be able to download any file type and tries as much as possible not to make any assumptions about how the
|
7
|
+
targeted website is built.
|
8
|
+
|
9
|
+
EventMachine is used to power the core, hpricot is used to parse the html.
|
10
|
+
|
11
|
+
# Why ?
|
12
|
+
|
13
|
+
I simply never found any tool fulfilling my needs so I made mine ;)
|
14
|
+
|
15
|
+
|
16
|
+
# What can it do for you
|
17
|
+
|
18
|
+
First let's start by what is currently supported:
|
19
|
+
|
20
|
+
- authentication (partially by hand)
|
21
|
+
- the referer is passed from one page to another so any leecher detection
|
22
|
+
by referer will fail
|
23
|
+
- cookies are passed too
|
24
|
+
- parallel download, you decide how many parallel tasks are executed
|
25
|
+
you can go as high as you want but don't be stupid !
|
26
|
+
- multiple storage backend, currently the files can be saved in:
|
27
|
+
- local disk
|
28
|
+
- dropbox
|
29
|
+
- javascript parsing with therubyracer, yes you read that well,
|
30
|
+
if you are crawling a javascript powered site and need to read javascript
|
31
|
+
you can use this to extract the information you need.
|
32
|
+
|
33
|
+
Any website is considered as a reversed pyramid, let's take a gallery website as an example:
|
34
|
+
|
35
|
+
- the first level would be the page containing all the thumbnails
|
36
|
+
- the second level would be a page showing the picture (each link collected in level 0
|
37
|
+
will lead to a different page on level 1)
|
38
|
+
- the third level would be the link to the picture itself
|
39
|
+
|
40
|
+
I decided on this model after some testing and until now I never found a
|
41
|
+
website where this cannot be applied (a website with files to download)
|
42
|
+
|
43
|
+
|
44
|
+
# Current state
|
45
|
+
|
46
|
+
The application is already ready for my needs and may be for someone else.
|
47
|
+
Currently not all connection errors may be handled correctly, especially if
|
48
|
+
the web server really has trouble keeping connections alive to serve the clients
|
49
|
+
(like for the example above).
|
50
|
+
|
51
|
+
|
52
|
+
# Usage
|
53
|
+
|
54
|
+
Look at the examples folder, there are two ways of using this gem:
|
55
|
+
|
56
|
+
As an application, try running:
|
57
|
+
|
58
|
+
```bash
|
59
|
+
./bin/gta exec examples/wallpaper -s data
|
60
|
+
```
|
61
|
+
|
62
|
+
Or as a library, try this:
|
63
|
+
|
64
|
+
```bash
|
65
|
+
ruby examples/standalone.rb
|
66
|
+
```
|
67
|
+
|
68
|
+
|
69
|
+
|
70
|
+
# Disclaimer
|
71
|
+
|
72
|
+
As with most open source projects you are responsible for your actions, if you start
|
73
|
+
a crawler with a lot of parallel tasks and manage to get banned from your favorite
|
74
|
+
wallpaper site I have nothing to do with this ok ?
|
75
|
+
Don't be stupid and everything will be fine, for my needs I rarely need more than
|
76
|
+
2 examiners and 1/2 downloaders.
|
77
|
+
|
data/bin/gta
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "rubygems"
|
3
|
+
|
4
|
+
$LOAD_PATH.unshift( File.expand_path('../../lib', __FILE__) )
|
5
|
+
require "get_them_all"
|
6
|
+
require "thor"
|
7
|
+
|
8
|
+
|
9
|
+
##
# Command line front-end of the gem, built on Thor.
#
# It exposes a single task, +exec+, which loads a user script and starts
# the downloader class that script defines.
#
class GtaRunner < Thor

  desc "exec [-s <path>] <script_path>", "run a user script"
  method_option :storage_path, :aliases => '-s', :desc => "path where the data will be saved"
  ##
  # Load a user script and start its downloader with a local file storage.
  #
  # The expected class name is derived from the script file name:
  # the camelized basename followed by "Downloader"
  # (wallpaper.rb => WallpaperDownloader).
  #
  # @param [String] script_path path of the script, absolute or relative
  #   to the current directory
  # @raise [RuntimeError] if no storage path was given or if the script
  #   does not define the expected class
  #
  def exec(script_path)
    storage_path = options[:storage_path]
    raise("storage_path required") unless storage_path

    # File.expand_path resolves relative paths against the current
    # directory and leaves absolute ones alone, without relying on a
    # leading '/' to tell them apart
    storage_path = File.expand_path(storage_path)
    script_path  = File.expand_path(script_path)

    # load the user script
    require script_path

    # check that the script defined the class we are about to instantiate
    class_name = File.basename(script_path, ".rb").camelize + "Downloader"
    fail("file #{script_path} should define class #{class_name} !") unless Object.const_defined?( class_name.to_sym )

    info("Started with config file #{File.basename(script_path)}")

    # CTRL+C stops the event loop
    trap("INT") do
      EM::stop_event_loop()
    end

    # create the instance (and start download)
    class_name.constantize.new(
        :storage => {
          :type => 'file',
          :params => {
            :root => storage_path
          }
        },
        :extensions => [GetThemAll::ActionLogger.new]
      ).start()
  end
end
|
53
|
+
|
54
|
+
# script entry point: hand the command line over to the Thor runner
GtaRunner.start
|
@@ -0,0 +1,28 @@
|
|
1
|
+
$:.push File.expand_path("../lib", __FILE__)
|
2
|
+
require "get_them_all/version"
|
3
|
+
|
4
|
+
# Packaging metadata of the gem: identity, packaged files and
# runtime dependencies.
Gem::Specification.new do |spec|
  spec.name        = "get_them_all"
  spec.version     = GetThemAll::VERSION
  spec.authors     = ["Julien Ammous"]
  spec.email       = []
  spec.homepage    = ""
  spec.summary     = "Mass downloader"
  spec.description = "Mass downloader useable as standalone or as a library"

  spec.rubyforge_project = "get_them_all"

  # the packaged files are whatever git tracks under these patterns
  tracked_files = `git ls-files lib/* *.gemspec README.* LICENSE`.split("\n")
  tracked_bins  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.executables   = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # runtime dependencies: name first, then the optional version constraint
  [
    ['thor'],
    ['em-http-request', '~> 1.0.0'],
    ['em-priority-queue', '~> 0.0.2'],
    ['hpricot', '~> 0.8.1'],
    ['i18n'],
    ['activesupport', '~> 3.1.0'],
    ['therubyracer', '~> 0.9.8'],
    ['dropbox'],
    ['girl_friday']
  ].each do |dependency|
    spec.add_runtime_dependency(*dependency)
  end
end
|
data/lib/get_them_all.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
require File.expand_path('../get_them_all/version', __FILE__)
|
2
|
+
|
3
|
+
# system libraries
|
4
|
+
require 'logger'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'zlib'
|
7
|
+
|
8
|
+
# gems
|
9
|
+
require 'eventmachine'
|
10
|
+
require 'active_support/core_ext/object/duplicable'
|
11
|
+
require 'active_support/core_ext/class'
|
12
|
+
require 'active_support/core_ext/string'
|
13
|
+
require 'active_support/core_ext/array'
|
14
|
+
|
15
|
+
# local files
|
16
|
+
# Every file is required through its absolute path, in the order listed
# (the order matters: later files use what earlier ones define).
# Resolving the paths from __FILE__ avoids changing the working directory
# of the whole process just to load the library.
%w[
  logger

  notifier
  javascript_loader
  history

  storage
  storage/file_storage
  storage/dropbox_storage

  extension
  extensions/graph_builder
  extensions/action_logger
  extensions/gauge_display

  site_downloader
  worker
  action
  actions/examine_action
  actions/download_action
].each do |name|
  require File.expand_path("../get_them_all/#{name}", __FILE__)
end
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
# Namespace holding the classes of the gem; the files required by this
# file reopen it to add their definitions.
module GetThemAll

end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module GetThemAll
  ##
  # Base class of the actions queued by a downloader.
  #
  # An action targets one url and is an EM::Deferrable; subclasses
  # provide #priority and #do_action.
  #
  class Action
    include Notifier

    attr_accessor :url, :level, :destination_folder, :params, :referer
    attr_accessor :parent_url

    include EM::Deferrable

    ##
    # @param downloader the downloader owning this action, it must
    #   respond to #storage and #history
    # @param [Hash] h initial properties, every key but :params must
    #   match one of the writers declared above
    # @param [Hash] params unused, kept so existing callers keep working
    # @raise [RuntimeError] if h contains a key without a matching writer
    #
    def initialize(downloader, h, params = {})
      @downloader = downloader

      @storage = @downloader.storage

      @level = 0
      @destination_folder = nil

      # work on a copy: the hash given by the caller must not lose
      # its :params entry as a side effect
      properties = h.dup
      @params = properties.delete(:params)

      properties.each do |key, val|
        raise("unknown properties #{key} !") unless respond_to?("#{key}=")
        send("#{key}=", val) unless val.nil?
      end
    end

    # Short description: class, level and url.
    def inspect
      "{#{self.class}[#{level}] #{url} }"
    end

    # @return [URI] the url of the action, parsed
    def uri
      URI.parse(@url)
    end

    # @param [String] url
    # @return true if the history of the downloader includes this url
    def already_visited?(url)
      @downloader.history.include?(url)
    end

    ##
    # Queue a follow-up action in the downloader.
    #
    # The new action gets this action's url as parent url and inherits
    # the destination folder unless it already has one. Examine actions
    # and all the others go to two distinct queues.
    #
    # @param [Action] action the action to queue
    #
    def queue_action(action)
      action.parent_url = @url
      action.destination_folder ||= @destination_folder

      # the queues are not exposed by the downloader, they are read
      # straight from its instance variables
      queue = action.is_a?(ExamineAction) ? "@examine_queue" : "@download_queue"
      @downloader.instance_variable_get(queue).push(action, action.priority)
    end

    ##
    # Random delay to wait before retrying.
    #
    # @return [Float] a number between 0.1 (included) and 1 (excluded)
    #
    def retry_time
      0.1 + rand * 0.9
    end
    protected :retry_time

  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module GetThemAll
  ##
  # Action fetching one url and handing the response body to the
  # storage of the downloader.
  #
  class DownloadAction < Action
    # Every download is queued with the same priority.
    def priority
      10
    end

    ##
    # Fetch the url and store the response.
    #
    # The action succeeds right away when the url is already in the
    # history. Otherwise it succeeds once the storage confirmed the write
    # and fails if either the request or the write fails.
    #
    # @param worker the worker running the action, only forwarded to the
    #   notifications
    #
    def do_action(worker = nil)
      notify('action.download.started', worker, self)

      if already_visited?(@url)
        notify('action.download.skipped', worker, self)
        set_deferred_status(:succeeded)
      else

        req = @downloader.open_url(@url, "GET", nil, @referer)
        req.callback do |response|

          destpath = compute_filename(worker)
          download = @storage.write(destpath, response.response)

          download.callback do
            add_to_history()
            set_deferred_status(:succeeded)

            notify('action.download.success', worker, self, destpath)
          end

          download.errback do
            notify('action.download.failure', worker, self)

            # the action itself has to be resolved too, otherwise
            # whoever waits on it is never told about the failure
            set_deferred_status(:failed)
          end
        end

        req.timeout(5)

        req.errback do
          # remove file if created
          # NOTE(review): compute_filename returns a name the storage
          # reports as unused, so this cleanup looks unable to find a
          # file -- confirm what it was meant to remove
          path = compute_filename(worker)
          File.delete(path) if File.exist?(path)

          notify('action.download.failure', worker, self)

          set_deferred_status(:failed)
        end
      end

    end

  private
    ##
    # @param [Integer] len number of characters wanted
    # @return [String] len random lowercase letters
    #
    def random_string(len=5)
      chars = ("a".."z").to_a
      # rand(n) never returns n itself, the whole size is needed
      # for "z" to be reachable
      (1..len).map { chars[rand(chars.size)] }.join
    end

    ##
    # Record the download in the history: the parent url when the
    # downloader class uses the :default history tracking, the url of
    # the action itself otherwise.
    #
    def add_to_history()
      if @downloader.class.history_tracking == :default
        @downloader.history.add(@parent_url)
      else
        @downloader.history.add(@url)
      end
    end

    ##
    # Ask the downloader where the file should go and, while the storage
    # already has something at that path, insert a random suffix before
    # the extension (a notification is sent for each attempt).
    #
    # @param worker only forwarded to the notifications
    # @return [String] a path the storage reports as unused
    #
    def compute_filename(worker)
      destpath = @downloader.get_file_destpath_from_action(self)

      # find an unused filename
      while @storage.exist?(destpath)
        # only the last extension is split off: "a.tar.gz" keeps every
        # part of its name and "README" gets no trailing dot
        extension = File.extname(destpath)
        stem = File.basename(destpath, extension)

        destpath = File.join(File.dirname(destpath), "#{stem}_#{random_string(2)}#{extension}")
        notify('action.download.renamed', worker, self, destpath)
      end

      destpath
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
require 'hpricot'
|
3
|
+
|
4
|
+
module GetThemAll
  ##
  # Action fetching a page, parsing it with Hpricot and asking the
  # downloader which actions should follow.
  #
  class ExamineAction < Action

    # An examination is queued with its level as priority.
    def priority
      @level
    end

    ##
    # Fetch and examine the page.
    #
    # The action succeeds right away when the url is already in the
    # history, otherwise it succeeds once the follow-up actions are queued
    # and fails when the request fails.
    #
    # @param worker the worker running the action, only forwarded to the
    #   notifications
    #
    def do_action(worker = nil)
      notify('action.examine.started', worker, self)

      if already_visited?(@url)
        notify('action.examine.skipped', worker, self)
        return set_deferred_status(:succeeded)
      end

      request = @downloader.open_url(@url, "GET", nil, @referer)
      request.callback { |http| examine_response(http, worker) }
      request.timeout(5)
      request.errback { |*args| examine_failed(args, worker) }
    end

  private
    # Parse the response and queue, one level deeper, every action the
    # downloader returns for this page. The params of this action are
    # not passed on to the new actions.
    def examine_response(http, worker)
      page = Hpricot( http.response )
      next_actions = @downloader.examine_page(page, @level, self)

      next_actions.each do |next_action|
        next_action.level = @level + 1
        queue_action(next_action)
      end

      notify('action.examine.success', worker, self, next_actions)
      set_deferred_status(:succeeded)
    end

    # Report the failure; the status is the single errback argument when
    # there is exactly one, 0 otherwise.
    def examine_failed(args, worker)
      status = args.size == 1 ? args.first : 0

      notify('action.examine.failure', worker, self, status)
      set_deferred_status(:failed)
    end

  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'active_support/notifications'
|
2
|
+
|
3
|
+
module GetThemAll
  ##
  # Base class of the extensions, it gives them a way to subscribe
  # to notifications.
  #
  class Extension

    ##
    # Subscribe a block to a notification through
    # ActiveSupport::Notifications.
    #
    # @param [String] name notification identifier
    # @return what ActiveSupport::Notifications.subscribe returned
    #
    def register_handler(name, &handler)
      notifications = ActiveSupport::Notifications
      notifications.subscribe(name, &handler)
    end

  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
|
2
|
+
module GetThemAll
  ##
  # This extension can be considered as a verbose mode, it
  # logs nearly everything that happens.
  #
  # It also counts the downloaded and skipped items and prints both
  # totals when the downloader completes.
  #
  class ActionLogger < Extension
    def initialize
      # the counters exist from the start so that a handler running
      # before 'downloader.started' does not hit nil
      reset_counters

      register_handler('downloader.started') do |name, downloader|
        reset_counters
      end

      register_handler('action.examine.started') do |name, worker, action|
        log("Examining[#{action.level}] #{action.url}")
      end

      register_handler('action.examine.skipped') do |name, worker, action|
        @skipped_files += 1
        log("Skipping #{action.url}")
      end

      register_handler('action.examine.success') do |name, worker, action|
        # do nothing
      end


      register_handler('action.download.started') do |name, worker, action|
        log("Downloading #{action.url}")
      end

      register_handler('action.download.renamed') do |name, worker, action, new_path|
        log("Renamed as #{File.basename(new_path)}")
      end

      register_handler('action.download.skipped') do |name, worker, action|
        # a skipped download counts in the "Skipped" total like a
        # skipped examination does
        @skipped_files += 1
        log("url Skipped: #{action.url}")
      end

      register_handler('action.download.success') do |name, worker, action, destpath|
        @download_files += 1
        log("File downloaded: #{destpath}")
      end

      register_handler('downloader.completed') do |name, worker, downloader|
        log ""
        log "Downloaded #{@download_files} files"
        log "Skipped: #{@skipped_files}"
      end

    end

    ##
    # Print one line on the standard output, prefixed with "[log] ".
    #
    # @param [String] str the message
    #
    def log(str)
      puts "[log] #{str}"
    end

  private
    # Set both counters back to zero.
    def reset_counters
      @skipped_files = 0
      @download_files = 0
    end

  end
end
|