get_them_all 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +19 -0
- data/README.md +77 -0
- data/bin/gta +54 -0
- data/get_them_all.gemspec +28 -0
- data/lib/get_them_all.rb +47 -0
- data/lib/get_them_all/action.rb +55 -0
- data/lib/get_them_all/actions/download_action.rb +80 -0
- data/lib/get_them_all/actions/examine_action.rb +47 -0
- data/lib/get_them_all/extension.rb +17 -0
- data/lib/get_them_all/extensions/action_logger.rb +58 -0
- data/lib/get_them_all/extensions/gauge_display.rb +155 -0
- data/lib/get_them_all/extensions/graph_builder.rb +138 -0
- data/lib/get_them_all/history.rb +23 -0
- data/lib/get_them_all/javascript_loader.rb +74 -0
- data/lib/get_them_all/logger.rb +35 -0
- data/lib/get_them_all/notifier.rb +7 -0
- data/lib/get_them_all/site_downloader.rb +332 -0
- data/lib/get_them_all/storage.rb +27 -0
- data/lib/get_them_all/storage/dropbox_storage.rb +114 -0
- data/lib/get_them_all/storage/file_storage.rb +40 -0
- data/lib/get_them_all/version.rb +3 -0
- data/lib/get_them_all/worker.rb +48 -0
- metadata +172 -0
@@ -0,0 +1,155 @@
|
|
1
|
+
|
2
|
+
module GetThemAll
|
3
|
+
##
|
4
|
+
# This extension will show you a progress bar for each worker
|
5
|
+
# showing its current state:
|
6
|
+
# ~ = working
|
7
|
+
# D = url downloaded
|
8
|
+
# E = url examined
|
9
|
+
# . = url skipped
|
10
|
+
# x = url download failed
|
11
|
+
#
|
12
|
+
# The extension knows how to handle the terminal width nicely: if
|
13
|
+
# it reaches the right edge of the terminal the line will scroll
|
14
|
+
# (characters will disappear on the left while new appear on the right)
|
15
|
+
#
|
16
|
+
class GaugeDisplay < Extension
|
17
|
+
##
# Small helpers around the ANSI escape sequences used to drive the
# terminal cursor. Every helper writes its sequence with +print+.
module Cursor
  # Width reported when +tput+ is unavailable or returns nothing usable.
  DEFAULT_WIDTH = 80

  class << self
    # Relative moves by +n+ cells; nothing is printed unless n > 0.
    def up(n);    print "\e[#{n}A" if n > 0; end
    def down(n);  print "\e[#{n}B" if n > 0; end
    def right(n); print "\e[#{n}C" if n > 0; end
    def left(n);  print "\e[#{n}D" if n > 0; end

    # Absolute move to column +n+ of the current line.
    def col(n); print "\e[#{n}G"; end

    # Erase from the cursor position to the end of the line.
    def clear_line; print "\e[0K"; end

    # save / restore the cursor position
    def save;    print "\e[s"; end
    def restore; print "\e[u"; end

    # hide / show the cursor
    def hide_cursor; print "\e[?25l"; end
    def show_cursor; print "\e[?25h"; end

    ##
    # Number of columns of the terminal, as reported by `tput cols`.
    #
    # @return [Integer] the width, or DEFAULT_WIDTH when tput is missing
    #   or does not print a positive number.
    def screen_width
      columns = `tput cols`.strip.to_i
      columns > 0 ? columns : DEFAULT_WIDTH
    rescue SystemCallError
      # tput is not installed / cannot be executed
      DEFAULT_WIDTH
    end
  end
end
|
40
|
+
|
41
|
+
##
# Subscribes the display to the downloader start event and to the
# started / success / failure / skipped events of both action kinds.
def initialize
  register_handler('downloader.started', &method(:downloader_started))

  # examine actions first, then download actions; both kinds are drawn
  # by the same four callbacks
  %w[examine download].each do |kind|
    register_handler("action.#{kind}.started", &method(:work_started))
    register_handler("action.#{kind}.success", &method(:work_completed))
    register_handler("action.#{kind}.failure", &method(:work_failed))
    register_handler("action.#{kind}.skipped", &method(:work_skipped))
  end
end
|
55
|
+
|
56
|
+
|
57
|
+
##
# Handler for 'downloader.started': reserves one terminal line per worker
# and remembers where the gauge area begins.
#
# @param event_name [String] name of the notification (unused)
# @param downloader [#examiners_count, #downloaders_count] the crawler
def downloader_started(event_name, downloader)
  @examiners = downloader.examiners_count
  @downloaders = downloader.downloaders_count
  total = @examiners + @downloaders

  # one label per worker; @state mirrors what is shown on each line
  @state = Array.new(total) { |n| "#{n} " }
  @state.each { |label| puts label }

  # Save the origin only once the lines exist: if printing them scrolled
  # the terminal, a position saved beforehand would point at the wrong
  # row. Moving back up lands on the first gauge line, at column 0.
  Cursor.up(total)
  Cursor.save
end
|
72
|
+
|
73
|
+
##
# Handler for 'action.*.started': draws the "working" marker (~) right
# after the characters already shown on the worker's line.
#
# @param event_name [String] name of the notification (unused)
# @param worker [#type, #index] worker that picked up the action
# @param action the action being processed (unused)
def work_started(event_name, worker, action, *args)
  Cursor.restore
  line = worker_line(worker)

  # go to the worker's line, then just past its current content
  Cursor.down(line)
  Cursor.right(@state[line].size)

  print "~"
end
|
85
|
+
|
86
|
+
##
# Appends +added_str+ to a worker's line and redraws that line. When the
# line no longer fits the terminal, the oldest characters (just after
# the "<index> " header) are dropped so the line appears to scroll.
#
# The cursor is expected to already be on the right terminal row.
#
# @param line [Integer] index into @state
# @param added_str [String] character(s) to append
def update_line(line, added_str)
  state = @state[line] + added_str
  width = Cursor.screen_width

  # the header is "<index> ", so it is wider than 2 chars from worker 10 on
  header = state[/\A\d+ /] || state[0, 2]
  room = width - header.size

  # only trim when there is room for at least one character after the header
  if state.size > width && room > 0
    state = header + state[-room..-1]
  end

  # keep the trimmed version so the stored line never outgrows the screen
  @state[line] = state

  # move to the first column and erase what was there
  Cursor.col(0)
  Cursor.clear_line

  print state
end
|
105
|
+
|
106
|
+
|
107
|
+
##
# Handler for 'action.*.success': appends "E" for an examiner worker,
# "D" for any other worker, to the worker's line.
#
# @param event_name [String] name of the notification (unused)
# @param worker [#type, #index] worker that finished the action
# @param action the completed action (unused)
def work_completed(event_name, worker, action, *args)
  Cursor.restore
  line = worker_line(worker)

  # move to the worker's line, update_line handles the column
  Cursor.down(line)

  update_line(line, worker.type == :examiner ? "E" : "D")
end
|
120
|
+
|
121
|
+
|
122
|
+
##
# Handler for 'action.*.failure': appends "x" to the worker's line.
#
# @param event_name [String] name of the notification (unused)
# @param worker [#type, #index] worker whose action failed
# @param action the failed action (unused)
def work_failed(event_name, worker, action, *args)
  Cursor.restore
  line = worker_line(worker)

  # move to the worker's line, update_line handles the column
  Cursor.down(line)

  update_line(line, "x")
end
|
134
|
+
|
135
|
+
|
136
|
+
##
# Handler for 'action.*.skipped': appends "." to the worker's line.
#
# @param event_name [String] name of the notification (unused)
# @param worker [#type, #index] worker that skipped the action
# @param action the skipped action (unused)
def work_skipped(event_name, worker, action, *args)
  Cursor.restore
  line = worker_line(worker)

  # move to the worker's line, update_line handles the column
  Cursor.down(line)

  update_line(line, ".")
end
|
148
|
+
|
149
|
+
private
|
150
|
+
##
# Maps a worker to its gauge line: examiners occupy the first
# @examiners lines, every other worker type comes after them.
#
# @param worker [#type, #index]
# @return [Integer] index into @state
def worker_line(worker)
  offset = worker.type == :examiner ? 0 : @examiners
  offset + worker.index
end
|
154
|
+
end
|
155
|
+
end
|
@@ -0,0 +1,138 @@
|
|
1
|
+
|
2
|
+
module GetThemAll
|
3
|
+
##
|
4
|
+
# This extension will generate a textfile showing
|
5
|
+
# the hierarchy of the site, which may be considered
|
6
|
+
# as a map.
|
7
|
+
#
|
8
|
+
# Its main purpose is as a debugging tool, to give a
|
9
|
+
# better view of what was going on inside.
|
10
|
+
#
|
11
|
+
# It was also capable of generating a dot file but this code
|
12
|
+
# is not up to date.
|
13
|
+
#
|
14
|
+
class GraphBuilder < Extension
|
15
|
+
##
# One node of the site map: a url (+text+), a short tag describing what
# happened to it (+prefix+) and the urls discovered from it.
class TreeNode
  attr_reader :text, :children

  # @param prefix [String] tag printed before the url in dumps
  # @param text [String] the url, also used as the lookup key
  def initialize(prefix, text)
    @prefix = prefix
    @text = text
    @children = []
  end

  ##
  # Creates a node and attaches it under this one.
  #
  # @return [TreeNode] the new child
  def add_child(prefix, text)
    child = TreeNode.new(prefix, text)
    @children << child
    child
  end

  ##
  # Depth-first search for the first node whose text equals +text+.
  # Stops at the first match instead of visiting the whole subtree.
  #
  # @return [TreeNode, nil]
  def find_node(text)
    return self if @text == text

    @children.each do |child|
      found = child.find_node(text)
      return found if found
    end
    nil
  end

  def inspect
    "{#{@text}}"
  end

  # Writes the indented text dump of the tree to +path+.
  def dump_to_file(path)
    File.open(path, 'w') { |f| f.write(dump_node(0)) }
  end

  # Writes the tree as a Graphviz digraph to +path+.
  def dump_to_dot_file(path)
    File.open(path, 'w') do |f|
      f.write(<<-EOF)
digraph toto {
  node [shape=box];
  #{dump_node_dot()}
}
      EOF
    end
  end

  protected

  ##
  # Text dump of this node and its subtree, one line per node:
  # "<level - 1> <indent><prefix> <text>".
  #
  # @param level [Integer] depth of this node, 0 for the root
  # @return [String]
  def dump_node(level)
    ret = "#{level-1} #{' ' * level}#{@prefix} #{@text}\n"
    @children.each { |child| ret << child.dump_node(level + 1) }
    ret
  end

  ##
  # Dot statements for this node and its subtree: the root is emitted as
  # a bare quoted node, every other node as an edge from its parent.
  #
  # @param prefix [String, nil] quoted text of the parent, nil for the root
  # @return [String] statements joined by newlines
  def dump_node_dot(prefix = nil)
    quoted = "\"#{@text}\""
    line = prefix.nil? ? quoted : " #{prefix} -> #{quoted}"

    # recurse with the dot dumper (the text dumper expects an Integer level)
    lines = [line] + @children.map { |child| child.dump_node_dot(quoted) }
    lines.join("\n")
  end
end
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
##
|
97
|
+
# @param [String] path where the resulting file will be written.
|
98
|
+
#
|
99
|
+
def initialize(path)
  @path = path

  # a fresh tree rooted at the base url for every run
  register_handler('downloader.started') do |_name, downloader|
    @graph = TreeNode.new("", downloader.base_url)
  end

  # write the map once crawling is over
  register_handler('downloader.completed') do |_name, _worker, _downloader|
    @graph.dump_to_file(@path)
    # @graph.dump_to_dot_file(@path)
  end

  register_handler('action.download.success') do |_name, _worker, action|
    add_to_graph("D", action.url, action.parent_url)
  end

  register_handler('action.examine.success') do |_name, _worker, action, returned_actions|
    add_to_graph("E[ret:#{returned_actions.size}]", action.url, action.parent_url)
  end

  # NOTE(review): this handler reads action.referer while the success
  # handlers read action.parent_url -- confirm both name the parent url.
  register_handler('action.examine.failure') do |_name, _worker, action, error_status|
    add_to_graph(error_status, action.url, action.referer)
  end
end
|
124
|
+
|
125
|
+
private
|
126
|
+
|
127
|
+
##
# Attaches +url+ to the tree, under the node of +referer+ or directly
# under the root when there is no referer.
#
# @param prefix [String] tag stored with the new node
# @param url [String] url of the new node
# @param referer [String, nil] url of the parent node
# @raise [RuntimeError] when no node exists for +referer+
def add_to_graph(prefix, url, referer)
  return @graph.add_child(prefix, url) if referer.nil?

  parent = @graph.find_node(referer) || fail("node not found: #{referer}")
  parent.add_child(prefix, url)
end
|
136
|
+
|
137
|
+
end
|
138
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module GetThemAll
|
2
|
+
##
# List of already handled urls, persisted as one url per line.
# A hash index is kept next to the ordered list so membership tests do
# not scan the whole list.
class History
  # @param data [Array<String>] initial entries
  def initialize(data = [])
    @data = data
    reindex
  end

  ##
  # Replaces the entries with the lines of +data+. Line endings (LF or
  # CRLF) are stripped and blank lines are ignored.
  #
  # @param data [String] file content, one entry per line
  # @return [Array<String>] the loaded entries
  def load(data)
    @data = data.each_line.map(&:chomp).reject(&:empty?)
    reindex
    @data
  end

  # @return [String] the entries joined by newlines
  def dump
    @data.join("\n")
  end

  # @return [Boolean] true when +line+ is a known entry
  def include?(line)
    @seen.key?(line)
  end

  ##
  # Records +url+; an entry that is already known is not added twice.
  #
  # @return [Array<String>] the entries
  def add(url)
    unless @seen.key?(url)
      @seen[url] = true
      @data << url
    end
    @data
  end

  private

  # Rebuilds the lookup index from @data.
  def reindex
    @seen = {}
    @data.each { |entry| @seen[entry] = true }
  end
end
|
23
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
|
2
|
+
require 'v8'
|
3
|
+
|
4
|
+
module GetThemAll
|
5
|
+
##
# Evaluates a piece of javascript inside a V8 context in which just
# enough browser globals (document, window, jQuery, $, timers) are
# stubbed for the script to load.
class JavascriptLoader

  ##
  # The goal is just to have something acting like a dom
  # from the outside
  class DOM
    # navigation-like properties
    def search; self; end
    def substr(*); ""; end
    def protocol; 'http'; end
    def location; self; end
    def window; self; end

    # tree manipulation: everything yields the stub itself
    def appendChild(*); self; end
    def createElement(*); self; end
    def getElementsByTagName(*); [self]; end

    # runs the "document ready" callback immediately
    def ready(f)
      f.call()
    end

    # event binders and timers: accepted and ignored
    def click(*); end
    def change(*); end
    def keydown(*); end
    def keyup(*); end
    def setInterval(*); end
    def setTimeout(*); end

    # chainable jQuery-style calls
    def attr(*); self; end
    def text(*); self; end
    def animate(*); self; end
    def empty(*); self; end
    def hide(*); self; end
    def show(*); self; end
    def focus(*); self; end
  end

  # Placeholder object exposed to scripts as +jQuery+.
  class JQuery
  end

  ##
  # Builds the context, installs the stubs, then evaluates +source+.
  #
  # @param source [String] javascript code to load
  def initialize(source)
    @context = V8::Context.new do |ctx|
      window = DOM.new

      ctx[:document] = DOM.new()
      ctx[:window] = window
      ctx[:jQuery] = JQuery.new
      # bind the global timers to the window stub we just created
      ctx['setInterval'] = window.method(:setInterval)
      ctx['setTimeout'] = window.method(:setTimeout)
    end

    @context.eval(%{
      $ = function(){
        return document;
      };

      $.cookie = function(){};
      $.each = function(){};

      // hash method cannot be defined on ruby side...
      window.location.hash = window;
    })

    @context.eval(source)
  end

  ##
  # Evaluates +str+ in the context prepared by #initialize.
  #
  # @param str [String] javascript expression
  # @return the value returned by the V8 context
  def eval(str)
    @context.eval(str)
  end

end
|
74
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module GetThemAll
|
4
|
+
##
# Logger printing "[LEVEL -- YYYY/MM/DD HH:MM:SS] message" lines.
class SiteDownloaderLogger < Logger
  TIME_FORMAT = "%Y/%m/%d %H:%M:%S".freeze

  # @param level [String] severity label
  # @param time [Time] time of the log call
  # @param progname [String, nil] ignored
  # @param msg [String] the message
  # @return [String] the formatted line, newline included
  def format_message(level, time, progname, msg)
    "[#{level} -- #{time.strftime(TIME_FORMAT)}] #{msg}\n"
  end
end
|
9
|
+
|
10
|
+
class SiteDownloader
  # logger shared by the class and its instances
  cattr_accessor :logger

  # default: errors only, on standard output
  self.logger = SiteDownloaderLogger.new(STDOUT)
  self.logger.level = Logger::ERROR
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# helpers
|
19
|
+
##
# Logs +msg+ at debug level on the SiteDownloader logger.
#
# @param msg [String] message to log
# @param short_msg [String, nil] accepted but not used
def debug(msg, short_msg = nil)
  GetThemAll::SiteDownloader.logger.debug(msg)
end
|
22
|
+
|
23
|
+
##
# Logs +msg+ at info level on the SiteDownloader logger.
#
# @param msg [String] message to log
def info(msg)
  GetThemAll::SiteDownloader.logger.info(msg)
end
|
26
|
+
|
27
|
+
##
# Logs each message at warn level on the SiteDownloader logger.
#
# Being defined at top level, this method replaces Kernel#warn for every
# object, so it accepts what Kernel#warn accepts: any number of
# messages (arrays are flattened) plus keyword options such as
# +uplevel:+ or +category:+, which are ignored here.
#
# @param msgs [Array<String>] messages to log
# @return [nil]
def warn(*msgs, **_options)
  msgs.flatten.each { |msg| GetThemAll::SiteDownloader.logger.warn(msg) }
  nil
end
|
30
|
+
|
31
|
+
##
# Logs +msg+ at error level on the SiteDownloader logger.
#
# @param msg [String] message to log
# @param fatal [Boolean] when true the process exits with status 1
#   right after logging
def error(msg, fatal = false)
  GetThemAll::SiteDownloader.logger.error(msg)
  exit(1) if fatal
end
|
35
|
+
|
@@ -0,0 +1,332 @@
|
|
1
|
+
|
2
|
+
require 'addressable/uri'
|
3
|
+
require 'active_support/hash_with_indifferent_access'
|
4
|
+
|
5
|
+
require 'em-http-request'
|
6
|
+
require 'em-priority-queue'
|
7
|
+
|
8
|
+
module GetThemAll
|
9
|
+
|
10
|
+
##
|
11
|
+
# The main class, all your crawlers will derive from this class
|
12
|
+
# see examples/standalone.rb file for an example.
|
13
|
+
#
|
14
|
+
class SiteDownloader
|
15
|
+
include Notifier

# number of workers of each kind, overridable per crawler class
class_attribute :examiners_count, :downloaders_count
class_attribute :config

##
# Determine what will be stored in the history file,
# the default is to store the last url before the download
# so we can ignore it sooner next time.
#
# The other mode is :download, in this mode the download
# url itself will be stored, it is meant for special cases as
# the default should work better most of the time.
#
class_attribute :history_tracking

self.examiners_count = 1
self.downloaders_count = 1

self.history_tracking = :default

attr_reader :base_url, :storage, :history
|
37
|
+
|
38
|
+
##
|
39
|
+
# Create and start the crawler.
|
40
|
+
#
|
41
|
+
# @param [Hash] args arguments
|
42
|
+
# @option args [String] :base_url The root url, every other
|
43
|
+
# url will be relative to this.
|
44
|
+
# @option args [String] :start_url What is the very first url
|
45
|
+
# to examine (level 0) relative to base_url, default is "/"
|
46
|
+
# @option args [String] :folder_name The root path where
|
47
|
+
# downloaded files will be saved (appended to the storage root).
|
48
|
+
# @option args [Array] :extensions Array of Extension object.
|
49
|
+
#
|
50
|
+
# @option args [Hash] :storage Configure storage backend
|
51
|
+
# :type is the backend name
|
52
|
+
# :params is a hash with backend specific options
|
53
|
+
#
|
54
|
+
def initialize(args)
  # work on a copy so the caller's hash is left untouched
  args = args.dup

  @cookies = []
  @connections = {}
  @examine_queue = EM::PriorityQueue.new
  @download_queue = EM::PriorityQueue.new
  @history = History.new

  @base_url = args.delete(:base_url)
  @start_url = args.delete(:start_url) || '/'
  @folder_name = args.delete(:folder_name)

  # keep a pointer to each extension
  @extensions = args.delete(:extensions) || [ActionLogger]

  storage_options = args.delete(:storage)
  raise "storage required" unless storage_options
  raise "storage type required" unless storage_options[:type]
  raise "storage params required" unless storage_options[:params]

  # :type may be a String or a Symbol, e.g. "file" => FileStorage
  storage_class_name = "#{storage_options[:type].to_s.camelize}Storage"
  storage_class = begin
    GetThemAll.const_get(storage_class_name)
  rescue NameError
    raise "unknown storage: #{storage_class_name}"
  end

  storage_params = ActiveSupport::HashWithIndifferentAccess.new(storage_options[:params])
  @storage = storage_class.new(storage_params.merge(:folder_name => @folder_name))

  @parsed_base_url = Addressable::URI.parse(@base_url)

  # start_url is relative to base_url
  @start_url = File.join(@base_url, @start_url)

  # if any unknown option was passed, do not silently walk away
  # tell the user !
  unless args.empty?
    raise "unknown parameters: #{args.inspect}"
  end
end
|
96
|
+
|
97
|
+
##
|
98
|
+
# Start the crawler, if you pass a block it
|
99
|
+
# will be called after the engine is initialized, you can
|
100
|
+
# queue the level 0 urls here and handle authenticating if needed.
|
101
|
+
#
|
102
|
+
def start
  load_history

  notify('downloader.started', self)

  EM.run do
    # every 5 seconds: stop the loop once no connection is open and the
    # storage backend reports it is idle
    EM.add_periodic_timer(5) do
      if EM.connection_count == 0 && !@storage.working?
        debug("no connections, exiting")
        EM.stop_event_loop
      end
    end

    EM.error_handler do |err|
      if err.is_a?(AssertionFailed)
        error("Assertion failed: #{err.message}")
      else
        error("#{err.class}: #{err.message}")
        # backtrace is nil for an exception that was never raised
        Array(err.backtrace).each { |line| error(line) }
      end
    end

    # queue the first action to start crawling
    #
    first_action = ExamineAction.new(self,
      :url => @start_url,
      :destination_folder => '/',
      :level => 0
    )
    @examine_queue.push(first_action, 0)

    # now that actions are queued, start handling them
    # start each "worker"
    # dequeuing is priority based, the download actions
    # first and then the higher the level the higher the
    # priority for examine actions, this is done this way
    # to give work to the download workers asap.
    #
    self.class.examiners_count.times do |index|
      Worker.new(:examiner, index, self, @examine_queue)
    end

    self.class.downloaders_count.times do |index|
      Worker.new(:downloader, index, self, @download_queue)
    end
  end

  # reached once the event loop has been stopped
  save_history

  notify('downloader.completed', self)
end
|
157
|
+
|
158
|
+
# Raised by #assert when its condition does not hold.
class AssertionFailed < RuntimeError; end

##
# Raises unless +cond+ is truthy.
#
# @param cond condition expected to hold
# @param msg [String] message of the raised error
# @raise [AssertionFailed] when +cond+ is false or nil
def assert(cond, msg = "")
  raise AssertionFailed, msg unless cond
end
|
165
|
+
|
166
|
+
# toto.com
|
167
|
+
# sub.toto.com
|
168
|
+
|
169
|
+
##
|
170
|
+
# Check if two urls are from the same domain.
|
171
|
+
#
|
172
|
+
# @return [Boolean] true if same domain
|
173
|
+
#
|
174
|
+
def same_domain?(host1, host2)
  # a missing host (relative or malformed url) never matches
  return false if host1.nil? || host2.nil?

  # host names are case-insensitive
  parts1 = host1.downcase.split(".")
  parts2 = host2.downcase.split(".")

  # compare as many trailing labels as the shorter host has, so that
  # "sub.toto.com" matches "toto.com"
  shared = [parts1.size, parts2.size].min
  return false if shared == 0

  parts1.last(shared) == parts2.last(shared)
end
|
188
|
+
|
189
|
+
# Path of the history file, relative to the storage root.
HISTORY_FILE_PATH = 'history.txt'.freeze

##
# Fills @history from the history file kept in the storage backend;
# only logs a debug message when the file does not exist.
def load_history
  unless @storage.exist?(HISTORY_FILE_PATH)
    return debug("History file not found: #{HISTORY_FILE_PATH}")
  end

  @history.load(@storage.read(HISTORY_FILE_PATH))
end
|
200
|
+
|
201
|
+
##
# Writes the dumped history to the history file of the storage backend.
def save_history
  @storage.write(HISTORY_FILE_PATH, @history.dump)
end
|
204
|
+
|
205
|
+
|
206
|
+
|
207
|
+
|
208
|
+
# Plugin API
|
209
|
+
##
# Requests +url+ asynchronously.
#
# @param url [String] absolute url, or a path resolved against base_url
# @param method [String] http verb
# @param params accepted but currently not sent with the request
#   (NOTE(review): confirm whether it was meant to become the body)
# @param referer [String, nil] Referer header, base_url when nil
# @param deferrable [EM::Deferrable, nil] object to resolve; a new one
#   is created when nil (redirections reuse the caller's one)
# @yield [req] or [req, doc] on a 200 response; +doc+ is the response
#   parsed by Hpricot and is only built for two-argument blocks
# @return [EM::Deferrable] succeeds with the request on 200, fails with
#   the http reason on other statuses and with -1 on connection errors
def open_url(url, method = "GET", params = nil, referer = nil, deferrable = nil, &block)
  deferrable ||= EM::DefaultDeferrable.new
  referer ||= @base_url

  # absolute urls are used as is, anything else is relative to base_url
  url = url.to_s
  url = if url.start_with?("http")
    Addressable::URI.parse(url)
  else
    Addressable::URI.join(@base_url, url)
  end

  external = !same_domain?(@parsed_base_url.host, url.host)

  if external
    debug("Opening external page: #{url}")
  else
    debug("#{method.upcase} #{url}")
  end

  # Connections are not reused: a new one is opened for every request
  # and the last one per host is remembered. The scheme and the port
  # come from the url (default port when none is given) so https urls
  # are not downgraded to http.
  port = url.inferred_port
  host_key = "#{url.host}:#{port}"
  http = EM::HttpRequest.new("#{url.scheme}://#{url.host}:#{port}")
  @connections[host_key] = http

  req = http.setup_request(method.downcase.to_sym,
    :path => url.path,
    :query => url.query,
    :head => {
      :cookie => @cookies,
      :referer => referer,
      'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1'
    }
  )

  req.callback do
    case req.response_header.status
    when 200
      # handle cookies: only those set by the crawled site are kept
      unless external
        # req.cookies[1] holds the received cookies, e.g.
        #   "PHPSESSID=p12aet71oemrfb3olffqaptss3; path=/"
        # only the "name=value" part is stored
        Array(req.cookies[1]).each do |str|
          @cookies << str.split(';').first
        end

        # remove duplicates
        @cookies.uniq!
      end

      deferrable.set_deferred_status(:succeeded, req)
      if block
        if block.arity == 2
          block.call(req, Hpricot(req.response))
        else
          block.call(req)
        end
      end

    # em-http-request does not handle redirection between hosts
    # so handle them ourselves
    when 301, 302
      location = req.response_header.location
      if location
        debug("Following redirection: #{location}")
        # reuse the same deferrable object
        open_url(location, method, params, referer, deferrable, &block)
      else
        # nowhere to go: fail instead of leaving the deferrable pending
        deferrable.set_deferred_status(:failed, req.response_header.http_reason)
      end

    else
      puts "#{method} #{url} => Status: #{req.response_header.status}"
      deferrable.set_deferred_status(:failed, req.response_header.http_reason)
    end
  end

  req.errback do
    deferrable.set_deferred_status(:failed, -1)
  end

  deferrable
end
|
315
|
+
|
316
|
+
##
# Loads +data+ into a new JavascriptLoader.
#
# @param data [String] javascript source
# @return [JavascriptLoader] loader on which further code can be evaluated
def eval_javascript(data)
  JavascriptLoader.new(data)
end
|
319
|
+
|
320
|
+
|
321
|
+
|
322
|
+
# to be redefined in subclasses
|
323
|
+
##
# Hook every crawler has to override; this default always raises.
#
# @param doc the page to examine
# @param level [Integer] depth of the page
# @raise [RuntimeError] always
def examine_page(doc, level)
  raise "Need to implement examine_page in #{self.class}"
end
|
326
|
+
|
327
|
+
##
# Destination path of a downloaded file: the path of the action's url
# appended to the action's destination folder.
#
# @param action [#uri, #destination_folder]
# @return [String]
def get_file_destpath_from_action(action)
  File.join(action.destination_folder, action.uri.path)
end
|
331
|
+
end
|
332
|
+
end
|