get_them_all 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +19 -0
- data/README.md +77 -0
- data/bin/gta +54 -0
- data/get_them_all.gemspec +28 -0
- data/lib/get_them_all.rb +47 -0
- data/lib/get_them_all/action.rb +55 -0
- data/lib/get_them_all/actions/download_action.rb +80 -0
- data/lib/get_them_all/actions/examine_action.rb +47 -0
- data/lib/get_them_all/extension.rb +17 -0
- data/lib/get_them_all/extensions/action_logger.rb +58 -0
- data/lib/get_them_all/extensions/gauge_display.rb +155 -0
- data/lib/get_them_all/extensions/graph_builder.rb +138 -0
- data/lib/get_them_all/history.rb +23 -0
- data/lib/get_them_all/javascript_loader.rb +74 -0
- data/lib/get_them_all/logger.rb +35 -0
- data/lib/get_them_all/notifier.rb +7 -0
- data/lib/get_them_all/site_downloader.rb +332 -0
- data/lib/get_them_all/storage.rb +27 -0
- data/lib/get_them_all/storage/dropbox_storage.rb +114 -0
- data/lib/get_them_all/storage/file_storage.rb +40 -0
- data/lib/get_them_all/version.rb +3 -0
- data/lib/get_them_all/worker.rb +48 -0
- metadata +172 -0
@@ -0,0 +1,155 @@
|
|
1
|
+
|
2
|
+
module GetThemAll
|
3
|
+
##
|
4
|
+
# This extension will show you a progress bar for each worker
|
5
|
+
# showing its current state:
|
6
|
+
# ~ = working
|
7
|
+
# D = url downloaded
|
8
|
+
# E = url examined
|
9
|
+
# . = url skipped
|
10
|
+
# x = url download failed
|
11
|
+
#
|
12
|
+
# The extension knows how to handle the terminal width nicely, if
|
13
|
+
# it reaches the right end of the terminal the line will scroll
|
14
|
+
# (characters will disappear on the left while new appear on the right)
|
15
|
+
#
|
16
|
+
class GaugeDisplay < Extension
|
17
|
+
##
# Thin wrapper around the ANSI escape sequences used to drive the
# terminal cursor. Every method writes its sequence with Kernel#print.
#
module Cursor
  class << self
    # Relative moves: nothing is printed unless n is strictly positive.
    def up(n)
      step(n, "A")
    end

    def down(n)
      step(n, "B")
    end

    def right(n)
      step(n, "C")
    end

    def left(n)
      step(n, "D")
    end

    # Jump to column n of the current row.
    def col(n)
      print "\e[#{n}G"
    end

    def clear_line
      print "\e[0K"
    end

    # save / restore the cursor position
    def save
      print "\e[s"
    end

    def restore
      print "\e[u"
    end

    # hide / show the cursor itself
    def hide_cursor
      print "\e[?25l"
    end

    def show_cursor
      print "\e[?25h"
    end

    # Width of the terminal as reported by the `tput cols` command.
    def screen_width
      reported = `tput cols`
      reported.strip.to_i
    end

    private

    # Print a relative move of n cells in the given direction letter.
    def step(n, direction)
      print "\e[#{n}#{direction}" if n > 0
    end
  end
end
|
40
|
+
|
41
|
+
def initialize
  register_handler('downloader.started', &method(:downloader_started))

  # event suffix => handler, in registration order
  outcome_handlers = [
    ['started', :work_started],
    ['success', :work_completed],
    ['failure', :work_failed],
    ['skipped', :work_skipped]
  ]

  # examine actions first, then download actions
  %w(examine download).each do |kind|
    outcome_handlers.each do |outcome, handler|
      register_handler("action.#{kind}.#{outcome}", &method(handler))
    end
  end
end
|
55
|
+
|
56
|
+
|
57
|
+
##
# Handler for 'downloader.started': reserve one terminal row per worker
# and remember what was printed on each of them.
#
def downloader_started(event_name, downloader)
  @examiners = downloader.examiners_count
  @downloaders = downloader.downloaders_count

  # the saved position is the origin the other handlers restore to
  Cursor.save

  # one entry per row, mirroring what is on screen
  @state = []
  worker_total = @examiners + @downloaders
  worker_total.times do |n|
    label = "#{n} "
    puts label
    @state << label
  end
end
|
72
|
+
|
73
|
+
##
# Handler for the 'started' events: print a "~" right after the text
# already recorded for the worker's row.
#
def work_started(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)

  # go to the worker's row, then just past its recorded text
  Cursor.down(row)
  Cursor.right(@state[row].size)

  print "~"
end
|
85
|
+
|
86
|
+
##
# Append text to a worker's row and redraw that row. The caller is
# expected to have moved the cursor onto the row already.
#
# When the row gets wider than the terminal, the oldest characters are
# dropped (the leading "<index> " header is always kept) and the
# shortened text is stored back in @state, so the recorded text never
# outgrows the screen.
#
# @param [Integer] line index of the row in @state
# @param [String] added_str text appended to the row
#
def update_line(line, added_str)
  # update internal state
  state = @state[line]
  state << added_str

  # screen_width spawns `tput`, so ask only once per redraw
  width = Cursor.screen_width

  if state.size > width
    # the header is the worker index and its trailing space, whatever
    # the number of digits
    header = state[/\A\d+ /] || state[0, 2]
    room = width - header.size

    # a width of 0 (or one too small for the header) leaves the row as is
    if room > 0
      state = header + state[-room..-1]
      @state[line] = state
    end
  end

  # back to the first column, erase, redraw
  Cursor.col(0)
  Cursor.clear_line

  print state
end
|
105
|
+
|
106
|
+
|
107
|
+
##
# Handler for the 'success' events: record "E" on an examiner's row,
# "D" on any other worker's row.
#
def work_completed(event_name, worker, action, *args)
  mark_worker(worker, (worker.type == :examiner) ? "E" : "D")
end


##
# Handler for the 'failure' events: record "x" on the worker's row.
#
def work_failed(event_name, worker, action, *args)
  mark_worker(worker, "x")
end


##
# Handler for the 'skipped' events: record "." on the worker's row.
#
def work_skipped(event_name, worker, action, *args)
  mark_worker(worker, ".")
end

# Common part of the three handlers above: return to the saved origin,
# go down to the worker's row and append the status character to it.
def mark_worker(worker, status_char)
  Cursor.restore
  line = worker_line(worker)

  # move to the correct line, update_line handles the column
  Cursor.down(line)

  update_line(line, status_char)
end
private :mark_worker
|
148
|
+
|
149
|
+
private
|
150
|
+
# Row of a worker on screen: examiners occupy the first rows, every
# other worker type comes after the @examiners examiner rows.
def worker_line(worker)
  return 0 + worker.index if worker.type == :examiner

  @examiners + worker.index
end
|
154
|
+
end
|
155
|
+
end
|
@@ -0,0 +1,138 @@
|
|
1
|
+
|
2
|
+
module GetThemAll
|
3
|
+
##
|
4
|
+
# This extension will generate a textfile showing
|
5
|
+
# the hierarchy of the site which may be considered
|
6
|
+
# as a map.
|
7
|
+
#
|
8
|
+
# Its main purpose is as a debugging tool to have a
|
9
|
+
# better view of what was going on inside.
|
10
|
+
#
|
11
|
+
# It was also capable of generating a dot file but this code
|
12
|
+
# is not up to date.
|
13
|
+
#
|
14
|
+
class GraphBuilder < Extension
|
15
|
+
##
# Node of the site tree: a text (the url), a prefix printed in front of
# it in the dumps, and an ordered list of child nodes.
#
class TreeNode
  attr_reader :text, :children

  def initialize(prefix, text)
    @prefix = prefix
    @text = text
    @children = []
  end

  ##
  # Create a node and append it to this node's children.
  #
  # @return [TreeNode] the new child
  #
  def add_child(prefix, text)
    ret = TreeNode.new(prefix, text)
    @children << ret
    ret
  end

  ##
  # Depth-first search for the first node whose text equals the
  # argument, this node included.
  #
  # @return [TreeNode, nil] the node found or nil
  #
  def find_node(text)
    return self if @text == text

    # stop at the first hit instead of walking the whole tree
    @children.each do |node|
      found = node.find_node(text)
      return found if found
    end

    nil
  end

  def inspect
    "{#{@text}}"
  end

  ##
  # Write the text dump of this node and its descendants to a file.
  #
  # @param [String] path file to (over)write
  #
  def dump_to_file(path)
    File.open(path, 'w') do |f|
      f.write( dump_node(0) )
    end
  end

  # One line per node: "<level - 1> <one space per level><prefix> <text>",
  # children following their parent.
  def dump_node(level)
    ret = "#{level-1} #{' ' * level}#{@prefix} #{@text}\n"
    @children.each do |node|
      ret << node.dump_node(level + 1)
    end
    ret
  end
  protected :dump_node

  ##
  # Write this node and its descendants as a graphviz digraph.
  #
  # @param [String] path file to (over)write
  #
  def dump_to_dot_file(path)
    File.open(path, 'w') do |f|
      f.write(<<-EOF)
        digraph toto {
          node [shape=box];
          #{dump_node_dot()}
        }
      EOF
    end
  end

  # The node on which the dump starts (no prefix given) is emitted as a
  # bare quoted name, every other node as a `"parent" -> "node"` edge.
  def dump_node_dot(prefix = nil)
    if prefix.nil?
      prefix = "\"#{@text}\""
    else
      prefix = " #{prefix} -> \"#{@text}\""
    end

    ret = [prefix]

    @children.each do |node|
      # recurse with the dot dumper; dump_node expects an Integer level
      # and raises when handed the quoted parent name
      ret << node.dump_node_dot("\"#{@text}\"")
    end

    ret.join("\n")
  end
  protected :dump_node_dot

end
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
##
|
97
|
+
# @param [String] path where the resulting file will be written.
|
98
|
+
#
|
99
|
+
def initialize(path)
  @path = path

  # a new tree is started, rooted at the downloader's base url
  register_handler('downloader.started') do |_name, downloader|
    @graph = TreeNode.new("", downloader.base_url)
  end

  # the tree is written out once the downloader is done
  register_handler('downloader.completed') do |_name, _worker, _downloader|
    @graph.dump_to_file(@path)
    # @graph.dump_to_dot_file(@path)
  end

  register_handler('action.download.success') do |_name, _worker, action|
    add_to_graph("D", action.url, action.parent_url)
  end

  register_handler('action.examine.success') do |_name, _worker, action, returned_actions|
    prefix = "E[ret:#{returned_actions.size}]"
    add_to_graph(prefix, action.url, action.parent_url)
  end

  # NOTE(review): this handler reads action.referer while the two above
  # read action.parent_url -- confirm which one is intended
  register_handler('action.examine.failure') do |_name, _worker, action, error_status|
    add_to_graph(error_status, action.url, action.referer)
  end
end
|
124
|
+
|
125
|
+
private
|
126
|
+
|
127
|
+
# Attach a node for url under the node whose text is referer, or
# directly under the root when referer is nil. Fails when no node
# matches the referer.
def add_to_graph(prefix, url, referer)
  return @graph.add_child(prefix, url) if referer.nil?

  # find referer
  parent = @graph.find_node(referer)
  fail("node not found: #{referer}") unless parent

  parent.add_child(prefix, url)
end
|
136
|
+
|
137
|
+
end
|
138
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module GetThemAll
|
2
|
+
##
# Ordered list of history entries (strings) which can be read from and
# written to a newline separated text.
#
class History
  # @param [Array] data initial entries
  def initialize(data = [])
    @data = data
  end

  # Replace the entries with the lines of the given text.
  def load(data)
    @data = data.split("\n")
  end

  # @return [String] the entries, one per line
  def dump
    @data.join("\n")
  end

  # @return [Boolean] true if the exact line is one of the entries
  def include?(line)
    @data.member?(line)
  end

  # Append an entry.
  def add(url)
    @data.push(url)
  end
end
|
23
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
|
2
|
+
require 'v8'
|
3
|
+
|
4
|
+
module GetThemAll
|
5
|
+
##
# Runs a javascript source inside a V8 context in which `document`,
# `window`, `jQuery`, `$`, `setInterval` and `setTimeout` are replaced
# by inert stand-ins.
#
class JavascriptLoader

  ##
  # Looks enough like a dom (and a jQuery selection) from the outside
  # for scripts to call into it.
  #
  class DOM
    # accessors without argument, all answering the receiver
    def search; self; end
    def location; self; end
    def window; self; end

    def protocol; 'http'; end
    def substr(*); ""; end

    def getElementsByTagName(*); [self]; end

    # the callback is run right away
    def ready(f)
      f.call()
    end

    # chainable calls: any arguments, answer the receiver
    [:appendChild, :createElement, :attr, :text, :animate,
     :empty, :hide, :show, :focus].each do |name|
      define_method(name) { |*| self }
    end

    # event binding and timers: any arguments, nothing done
    [:click, :change, :keydown, :keyup, :setInterval, :setTimeout].each do |name|
      define_method(name) { |*| nil }
    end
  end

  # Empty placeholder exposed to scripts as `jQuery`.
  class JQuery

  end

  ##
  # Build the context, install the stand-ins, then evaluate the source.
  #
  # @param [String] source javascript code
  #
  def initialize(source)
    @context = V8::Context.new do |ctx|
      ctx[:document] = DOM.new()
      ctx[:window] = DOM.new()
      ctx[:jQuery] = JQuery.new
      ['setInterval', 'setTimeout'].each do |timer|
        ctx[timer] = ctx[:window].method(timer.to_sym)
      end
    end

    bootstrap = %{
      $ = function(){
        return document;
      };

      $.cookie = function(){};
      $.each = function(){};

      // hash method cannot be defined on ruby side...
      window.location.hash = window;
    }
    @context.eval(bootstrap)

    @context.eval(source)
  end

  # Evaluate more javascript in the same context.
  def eval(str)
    @context.eval(str)
  end

end
|
74
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module GetThemAll
|
4
|
+
##
# Logger writing lines such as
# "[ERROR -- 2012/10/14 12:22:22] message".
#
class SiteDownloaderLogger < Logger
  # progname is accepted but not part of the output
  def format_message(level, time, progname, msg)
    stamp = time.strftime("%Y/%m/%d %H:%M:%S")
    "[%s -- %s] %s\n" % [level, stamp, msg]
  end
end
|
9
|
+
|
10
|
+
class SiteDownloader
  cattr_accessor :logger

  # default logger: standard output, errors only
  stdout_logger = SiteDownloaderLogger.new(STDOUT)
  stdout_logger.level = Logger::ERROR
  self.logger = stdout_logger
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# helpers
|
19
|
+
# Log at debug level through the SiteDownloader logger
# (short_msg is accepted but not used).
def debug(msg, short_msg = nil)
  GetThemAll::SiteDownloader.logger.debug(msg)
end

# Log at info level through the SiteDownloader logger.
def info(msg)
  GetThemAll::SiteDownloader.logger.info(msg)
end

# Log at warn level through the SiteDownloader logger.
def warn(msg)
  GetThemAll::SiteDownloader.logger.warn(msg)
end

# Log at error level through the SiteDownloader logger; when fatal is
# true the process then exits with status 1.
def error(msg, fatal = false)
  GetThemAll::SiteDownloader.logger.error(msg)
  return unless fatal

  exit(1)
end
|
35
|
+
|
@@ -0,0 +1,332 @@
|
|
1
|
+
|
2
|
+
require 'addressable/uri'
|
3
|
+
require 'active_support/hash_with_indifferent_access'
|
4
|
+
|
5
|
+
require 'em-http-request'
|
6
|
+
require 'em-priority-queue'
|
7
|
+
|
8
|
+
module GetThemAll
|
9
|
+
|
10
|
+
##
|
11
|
+
# The main class, all your crawlers will derive from this class
|
12
|
+
# see examples/standalone.rb file for an example.
|
13
|
+
#
|
14
|
+
class SiteDownloader
|
15
|
+
include Notifier
|
16
|
+
|
17
|
+
class_attribute :examiners_count, :downloaders_count
|
18
|
+
class_attribute :config
|
19
|
+
|
20
|
+
##
|
21
|
+
# Determine what will be stored in the history file,
|
22
|
+
# the default is to store the last url before the download
|
23
|
+
# so we can ignore it sooner next time.
|
24
|
+
#
|
25
|
+
# The other mode is :download, in this mode the download
|
26
|
+
# url itself will be stored, it is meant for special cases as
|
27
|
+
# the default should work better most of the time.
|
28
|
+
#
|
29
|
+
class_attribute :history_tracking
|
30
|
+
|
31
|
+
self.examiners_count = 1
|
32
|
+
self.downloaders_count = 1
|
33
|
+
|
34
|
+
self.history_tracking = :default
|
35
|
+
|
36
|
+
attr_reader :base_url, :storage, :history
|
37
|
+
|
38
|
+
##
|
39
|
+
# Create and start the crawler.
|
40
|
+
#
|
41
|
+
# @param [Hash] args arguments
|
42
|
+
# @option args [String] :base_url The root url, every other
|
43
|
+
# url will be relative to this.
|
44
|
+
# @option args [String] :start_url What is the very first url
|
45
|
+
# to examine (level 0) relative to base_url, default is "/"
|
46
|
+
# @option args [String] :folder_name The root path where
|
47
|
+
# downloaded files will be saved (appended to the storage root).
|
48
|
+
# @option args [Array] :extensions Array of Extension object.
|
49
|
+
#
|
50
|
+
# @option args [Hash] :storage Configure storage backend
|
51
|
+
# :type is the backend name
|
52
|
+
# :params is a hash with backend specific options
|
53
|
+
#
|
54
|
+
def initialize(args)
  # work on a copy: the known options are deleted one by one below and
  # the caller's hash should not be emptied as a side effect
  args = args.dup

  @cookies = []
  @connections = {}
  @examine_queue = EM::PriorityQueue.new
  @download_queue = EM::PriorityQueue.new
  @history = History.new

  @base_url = args.delete(:base_url)
  @start_url = args.delete(:start_url) || '/'
  @folder_name = args.delete(:folder_name)

  # keep a pointer to each extension
  @extensions = args.delete(:extensions) || [ActionLogger]

  storage_options = args.delete(:storage)
  raise "storage required" unless storage_options
  raise "storage type required" unless storage_options[:type]
  raise "storage params required" unless storage_options[:params]

  # "file" => GetThemAll::FileStorage. The constant is looked up for
  # real: `defined?` on a local variable is always truthy, so the
  # previous check could never report an unknown backend.
  storage_name = "#{storage_options[:type].to_s.camelize}Storage"
  unless GetThemAll.const_defined?(storage_name)
    raise "unknown storage: #{storage_name}"
  end

  storage_class = GetThemAll.const_get(storage_name)
  storage_params = ActiveSupport::HashWithIndifferentAccess.new( storage_options[:params] )

  @storage = storage_class.new(storage_params.merge(:folder_name => @folder_name))

  @parsed_base_url = Addressable::URI.parse(@base_url)

  # start_url is relative to base_url
  @start_url = File.join(@base_url, @start_url)

  # if any unknown option was passed, do not silently walk away
  # tell the user !
  unless args.empty?
    raise "unknown parameters: #{args.inspect}"
  end
end
|
96
|
+
|
97
|
+
##
|
98
|
+
# Start the crawler, if you pass a block it
|
99
|
+
# will be called after the engine is initialized, you can
|
100
|
+
# queue the level 0 urls here and handle authenticating if needed.
|
101
|
+
#
|
102
|
+
def start
  load_history()
  notify('downloader.started', self)

  EM::run do
    # every 5 seconds: leave the event loop once no connection is open
    # and the storage is not working
    EM::add_periodic_timer(5) do
      idle = (EM::connection_count() == 0) && !@storage.working?
      if idle
        debug("no connections, exiting")
        EM::stop_event_loop()
      end
    end

    # errors raised inside the loop are logged; failed assertions
    # without their backtrace
    EM::error_handler do |err|
      case err
      when AssertionFailed
        error("Assertion failed: #{err.message}")
      else
        error("#{err.class}: #{err.message}")
        err.backtrace.each { |line| error(line) }
      end
    end

    # queue the first action to start crawling
    first_action = ExamineAction.new(self,
        :url => @start_url,
        :destination_folder => '/',
        :level => 0
      )
    @examine_queue.push(first_action, 0)

    # now that actions are queued, start handling them
    # start each "worker"
    # dequeuing is priority based, the download actions
    # first and then the higher the level the higher the
    # priority for examine actions, this is done this way
    # to give work to the download workers asap.
    #
    self.class.examiners_count.times do |index|
      Worker.new(:examiner, index, self, @examine_queue)
    end

    self.class.downloaders_count.times do |index|
      Worker.new(:downloader, index, self, @download_queue)
    end
  end

  # EM::run has returned: the loop was stopped
  save_history()
  notify('downloader.completed', self)
end
|
157
|
+
|
158
|
+
class AssertionFailed < RuntimeError; end
|
159
|
+
|
160
|
+
# Raise AssertionFailed with msg unless cond is truthy.
def assert(cond, msg = "")
  return if cond

  raise AssertionFailed, msg
end
|
165
|
+
|
166
|
+
# toto.com
|
167
|
+
# sub.toto.com
|
168
|
+
|
169
|
+
##
# Check if two host names are from the same domain: the shorter name
# must be equal to the end of the longer one, label by label
# ("toto.com" and "sub.toto.com" match). The comparison ignores case.
#
# @param [String, nil] host1 first host name
# @param [String, nil] host2 second host name
# @return [Boolean] true if same domain, false if either host is nil
#
def same_domain?(host1, host2)
  # a url without host cannot be matched to anything
  return false if host1.nil? || host2.nil?

  labels1 = host1.to_s.downcase.split(".")
  labels2 = host2.to_s.downcase.split(".")

  size = [labels1.size, labels2.size].min

  # an empty name only matches another empty name
  return labels1 == labels2 if size.zero?

  # compare the last `size` labels of both names
  labels1.last(size) == labels2.last(size)
end
|
188
|
+
|
189
|
+
HISTORY_FILE_PATH = 'history.txt'.freeze
|
190
|
+
|
191
|
+
# load already downloaded pictures from disk
|
192
|
+
# Feed the history with the content of the history file found in the
# storage; when there is no such file only a debug message is logged.
def load_history
  unless @storage.exist?(HISTORY_FILE_PATH)
    return debug("History file not found: #{HISTORY_FILE_PATH}")
  end

  saved = @storage.read(HISTORY_FILE_PATH)
  @history.load(saved)
end
|
200
|
+
|
201
|
+
# Write the dumped history to the history file of the storage.
def save_history
  serialized = @history.dump
  @storage.write(HISTORY_FILE_PATH, serialized)
end
|
204
|
+
|
205
|
+
|
206
|
+
|
207
|
+
|
208
|
+
# Plugin API
|
209
|
+
##
# Fetch a url asynchronously.
#
# The returned deferrable succeeds with the request object on a 200
# response and fails with the http reason on any other final status,
# or with -1 when the request itself errors. 301 and 302 responses are
# followed, the same deferrable and block being reused.
#
# @param [String] url absolute url, or path relative to base_url
# @param [String] method http verb
# @param [Object] params only handed over when following a redirect,
#   it is not sent with the request
# @param [String] referer referer header, base_url when nil
# @param [EM::Deferrable] deferrable reused when given, created otherwise
# @yield [req, doc] called on a 200 response; doc (the response parsed
#   with Hpricot) is only built for blocks taking two arguments
# @return [EM::Deferrable]
#
def open_url(url, method = "GET", params = nil, referer = nil, deferrable = nil, &block)
  deferrable ||= EM::DefaultDeferrable.new
  referer ||= @base_url

  # urls starting with "http" are used as is, the others are resolved
  # against base_url
  url = Addressable::URI.parse( (url[0...4] == "http") ? url : URI.join(@base_url, url) )

  external = !same_domain?(@parsed_base_url.host, url.host)

  if external
    debug("Opening external page: #{url}")
  else
    debug("#{method.upcase} #{url}")
  end

  # A new connection is opened for every request (reuse is disabled),
  # the last one opened for a host:port is recorded in @connections.
  # NOTE(review): the connection url always uses the http scheme,
  # whatever the scheme of the requested url -- confirm this is wanted.
  host_key = "#{url.host}:#{url.port}"
  http = EM::HttpRequest.new("http://#{url.host}:#{url.port}")
  @connections[host_key] = http

  req = http.setup_request(method.downcase.to_sym,
      :path => url.path,
      :query => url.query,
      # :redirects => 2,
      :head => {
        :cookie => @cookies,
        :referer => referer,
        # "accept-encoding" => "gzip, compressed",
        'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1'
      }
    )

  req.callback do
    case req.response_header.status
    when 200
      # handle cookies, only those of the crawled site are kept
      unless external
        # req.cookies[1] is either one string or an array of strings
        # such as "PHPSESSID=p12aet71oemrfb3olffqaptss3; path=/",
        # only the "name=value" part is stored
        added_cookies = Array(req.cookies[1])

        added_cookies.each do |str|
          @cookies << str.split(';').first
        end

        # remove duplicates
        @cookies.uniq!
      end

      deferrable.set_deferred_status(:succeeded, req)
      if block
        if block.arity == 2
          doc = Hpricot(req.response)
          block.call(req, doc)
        else
          block.call(req)
        end
      end

    # em-http-request does not handle redirection between hosts
    # so handle them ourselves
    when 301, 302
      location = req.response_header.location
      if location
        debug("Following redirection: #{location}")
        # reuse the same deferrable object
        open_url(location, method, params, referer, deferrable, &block)
      else
        # nowhere to go: without this the deferrable would never be
        # completed and whoever waits on it would wait forever
        deferrable.set_deferred_status(:failed, req.response_header.http_reason)
      end

    else
      puts "#{method} #{url} => Status: #{req.response_header.status}"
      deferrable.set_deferred_status(:failed, req.response_header.http_reason)
    end
  end

  req.errback do
    deferrable.set_deferred_status(:failed, -1)
  end

  deferrable
end
|
315
|
+
|
316
|
+
# Build a JavascriptLoader from the given javascript source and
# return it.
def eval_javascript(data)
  loader = JavascriptLoader.new(data)
  loader
end
|
319
|
+
|
320
|
+
|
321
|
+
|
322
|
+
# to be redefined in subclasses
|
323
|
+
# Default implementation: always raises, naming the class that lacks
# its own examine_page.
def examine_page(doc, level)
  message = "Need to implement examine_page in #{self.class}"
  raise message
end
|
326
|
+
|
327
|
+
# Destination path for an action: the path of the action's uri
# appended to the action's destination folder.
def get_file_destpath_from_action(action)
  File.join(action.destination_folder, action.uri.path)
end
|
331
|
+
end
|
332
|
+
end
|