get_them_all 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,155 @@
1
+
2
+ module GetThemAll
3
+ ##
4
+ # This extension will show you a progress bar for each worker
5
+ # showing its current state:
6
+ # ~ = working
7
+ # D = url downloaded
8
+ # E = url examined
9
+ # . = url skipped
10
+ # x = url download failed
11
+ #
12
+ # The extension knows how to handle the terminal width nicely, if
13
+ # it reaches the right end of the terminal the line will scroll
14
+ # (characters will disappear on the left while new appear on the right)
15
+ #
16
+ class GaugeDisplay < Extension
17
# Thin wrappers around ANSI escape sequences, printed to standard output,
# used to move the terminal cursor around.
module Cursor
  class << self
    # Relative moves: nothing is printed unless n is greater than zero.
    def up(n)
      relative_move(n, 'A')
    end

    def down(n)
      relative_move(n, 'B')
    end

    def right(n)
      relative_move(n, 'C')
    end

    def left(n)
      relative_move(n, 'D')
    end

    # Jump to column n of the current line.
    def col(n)
      print "\e[#{n}G"
    end

    # Erase from the cursor to the end of the line.
    def clear_line
      print "\e[0K"
    end

    # save / restore the cursor position
    def save
      print "\e[s"
    end

    def restore
      print "\e[u"
    end

    # hide / show the cursor
    def hide_cursor
      print "\e[?25l"
    end

    def show_cursor
      print "\e[?25h"
    end

    # Terminal width in columns, as reported by the `tput cols` command.
    def screen_width
      reported = `tput cols`
      reported.strip.to_i
    end

    private

    # Prints the escape sequence moving the cursor +count+ cells in the
    # direction selected by +code+ (A, B, C or D).
    def relative_move(count, code)
      return unless count > 0
      print "\e[#{count}#{code}"
    end
  end
end
40
+
41
# Registers the display for the downloader start event and for the
# started/success/failure/skipped events of both action kinds.
def initialize
  register_handler('downloader.started', &method(:downloader_started))

  # examine actions first, then download actions: both kinds are drawn
  # by the same four callbacks
  %w[examine download].each do |kind|
    register_handler("action.#{kind}.started", &method(:work_started))
    register_handler("action.#{kind}.success", &method(:work_completed))
    register_handler("action.#{kind}.failure", &method(:work_failed))
    register_handler("action.#{kind}.skipped", &method(:work_skipped))
  end
end
55
+
56
+
57
# Handler for 'downloader.started': records how many workers of each
# type exist and prints one line per worker, prefixed with its number.
def downloader_started(event_name, downloader)
  @examiners = downloader.examiners_count
  @downloaders = downloader.downloaders_count

  # remember where the gauge begins, every later update restores this position
  Cursor.save

  # @state mirrors what is displayed on each worker line
  @state = []
  worker_total = @examiners + @downloaders
  worker_total.times do |index|
    label = "#{index} "
    puts label
    @state[index] = label
  end
end
72
+
73
# Handler for the "started" events: prints a "~" right after the
# current content of the worker's line.
def work_started(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)

  # go down to the worker's line, then right past what is already drawn
  Cursor.down(row)
  Cursor.right(@state[row].size)

  print "~"
end
85
+
86
##
# Appends +added_str+ to the stored content of a worker line and redraws
# that line from its first column.
#
# When the content gets wider than the terminal, the line header (the
# worker number followed by a space) is kept and only the most recent
# characters that still fit are shown after it. The shortened content is
# stored back in @state so it does not grow forever.
#
# The cursor is expected to already be on the right line.
#
# @param [Integer] line index of the line in @state
# @param [String] added_str text to append to that line
#
def update_line(line, added_str)
  # update internal state
  state = @state[line]
  state << added_str

  # screen_width shells out to tput, so only ask once per update
  width = Cursor.screen_width

  # the header is "<worker number> ", which is wider than 2 characters
  # from worker 10 onwards
  header = state[/\A\d+ /] || state[0, 2]

  # scroll the line if required (and only if the terminal leaves room
  # for something after the header)
  if state.size > width && width > header.size
    visible = width - header.size
    state = header + state[-visible..-1]
    @state[line] = state
  end

  # move to the first column and erase the line before redrawing it
  Cursor.col(0)
  Cursor.clear_line()

  print state
end
105
+
106
+
107
# Handler for the "success" events: appends "E" to the line of an
# examiner worker, "D" to the line of any other worker.
def work_completed(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)

  # move to the worker's line, update_line takes care of the column
  Cursor.down(row)

  marker = worker.type == :examiner ? "E" : "D"
  update_line(row, marker)
end
120
+
121
+
122
# Handler for the "failure" events: appends "x" to the worker's line.
def work_failed(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)

  # move to the worker's line, update_line takes care of the column
  Cursor.down(row)

  update_line(row, "x")
end
134
+
135
+
136
# Handler for the "skipped" events: appends "." to the worker's line.
def work_skipped(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)

  # move to the worker's line, update_line takes care of the column
  Cursor.down(row)

  update_line(row, ".")
end
148
+
149
+ private
150
# Index of the screen line (and @state entry) used by +worker+:
# examiner lines come first, the other workers follow them.
def worker_line(worker)
  return worker.index if worker.type == :examiner

  @examiners + worker.index
end
154
+ end
155
+ end
@@ -0,0 +1,138 @@
1
+
2
+ module GetThemAll
3
+ ##
4
+ # This extension will generate a textfile showing
5
+ # the hierarchy of the site which may be considered
6
+ # as a map.
7
+ #
8
+ # Its main purpose is to serve as a debugging tool to have a
9
+ # better view of what was going on inside.
10
+ #
11
+ # It was also capable of generating a dot file but this code
12
+ # is not up to date.
13
+ #
14
+ class GraphBuilder < Extension
15
##
# A node of the site map: a text (the url), a free-form prefix shown
# in front of it in the text dump, and an ordered list of child nodes.
#
class TreeNode
  attr_reader :text, :children

  # @param [String] prefix label written before the text in the text dump
  # @param [String] text the node text, also used to look the node up
  def initialize(prefix, text)
    @prefix = prefix
    @text = text
    @children = []
  end

  # Creates a node, appends it to this node's children and returns it.
  def add_child(prefix, text)
    ret = TreeNode.new(prefix, text)
    @children << ret
    ret
  end

  # Depth-first search for the first node whose text equals +text+.
  #
  # @return [TreeNode, nil] the matching node, nil when there is none
  def find_node(text)
    return self if @text == text

    # stop at the first match instead of searching every subtree
    @children.each do |child|
      found = child.find_node(text)
      return found if found
    end

    nil
  end

  def inspect
    "{#{@text}}"
  end

  # Writes the text dump of this node and its descendants to +path+.
  def dump_to_file(path)
    File.open(path, 'w') do |f|
      f.write( dump_node(0) )
    end
  end

  # One line per node: "<level - 1> <indentation><prefix> <text>",
  # children are indented one more space than their parent.
  def dump_node(level)
    ret = "#{level-1} #{' ' * level}#{@prefix} #{@text}\n"
    @children.each do |node|
      ret << node.dump_node(level + 1)
    end
    ret
  end
  protected :dump_node

  # Writes this node and its descendants to +path+ as a Graphviz digraph.
  def dump_to_dot_file(path)
    File.open(path, 'w') do |f|
      f.write(<<-EOF)
        digraph toto {
          node [shape=box];
          #{dump_node_dot()}
        }
      EOF
    end
  end

  # Dot statements for this subtree: the root is declared alone, every
  # other node is written as an edge coming from its parent.
  #
  # @param [String, nil] prefix quoted dot identifier of the parent node,
  #   nil for the root
  def dump_node_dot(prefix = nil)
    own_id = dot_id
    statement = prefix.nil? ? own_id : " #{prefix} -> #{own_id}"

    ret = [statement]

    @children.each do |node|
      # recurse with the dot dumper: the text dumper expects an integer
      # level and cannot be fed the parent identifier
      ret << node.dump_node_dot(own_id)
    end

    ret.join("\n")
  end
  protected :dump_node_dot

  private

  # The node text as a double-quoted dot identifier, inner quotes escaped.
  def dot_id
    "\"#{@text.to_s.gsub('"') { '\\"' }}\""
  end
end
92
+
93
+
94
+
95
+
96
##
# Subscribes to the downloader and action events needed to build the
# tree, and to write it out once the downloader completes.
#
# @param [String] path where the resulting file will be written.
#
def initialize(path)
  @path = path

  # the tree is rooted at the base url of the downloader
  register_handler('downloader.started') { |_name, downloader| @graph = TreeNode.new("", downloader.base_url) }

  # only the text dump is written (the dot output is not used)
  register_handler('downloader.completed') { |_name, _worker, _downloader| @graph.dump_to_file(@path) }

  register_handler('action.download.success') do |_name, _worker, action|
    add_to_graph("D", action.url, action.parent_url)
  end

  register_handler('action.examine.success') do |_name, _worker, action, returned_actions|
    label = "E[ret:#{returned_actions.size}]"
    add_to_graph(label, action.url, action.parent_url)
  end

  # NOTE(review): this handler reads action.referer while the two success
  # handlers read action.parent_url -- confirm this is intended
  register_handler('action.examine.failure') do |_name, _worker, action, error_status|
    add_to_graph(error_status, action.url, action.referer)
  end
end
124
+
125
+ private
126
+
127
# Adds a node for +url+ under the node whose text is +referer+, or
# directly under the root when +referer+ is nil.
#
# Fails with a RuntimeError when no node matches +referer+.
def add_to_graph(prefix, url, referer)
  return @graph.add_child(prefix, url) if referer.nil?

  # find referer
  parent = @graph.find_node(referer)
  fail("node not found: #{referer}") unless parent

  parent.add_child(prefix, url)
end
136
+
137
+ end
138
+ end
@@ -0,0 +1,23 @@
1
+ module GetThemAll
2
# Ordered list of history entries which can be read from, and written
# back to, a newline separated string.
class History
  # @param [Array] data the initial entries
  def initialize(data = [])
    @data = data
  end

  # Replaces the entries with the lines of +data+.
  def load(data)
    entries = data.split("\n")
    @data = entries
  end

  # All the entries as one string, one entry per line.
  def dump
    @data * "\n"
  end

  # Whether +line+ is one of the entries.
  def include?(line)
    @data.member?(line)
  end

  # Appends +url+ to the entries.
  def add(url)
    @data.push(url)
  end
end
23
+ end
@@ -0,0 +1,74 @@
1
+
2
+ require 'v8'
3
+
4
+ module GetThemAll
5
# Evaluates javascript source in a V8 context populated with stub
# document, window and jQuery objects.
class JavascriptLoader

  ##
  # The goal is just to have something acting like a dom
  # from the outside: most members return the object itself
  # (so chains keep working) or do nothing at all.
  class DOM
    # members without arguments returning the DOM itself
    %w[search location window].each do |name|
      define_method(name) { self }
    end

    # members ignoring their arguments and returning the DOM itself
    %w[appendChild createElement attr text animate empty hide show focus].each do |name|
      define_method(name) { |*| self }
    end

    # members ignoring their arguments and returning nothing
    %w[click change keydown keyup setInterval setTimeout].each do |name|
      define_method(name) { |*| }
    end

    def substr(*)
      ""
    end

    def protocol
      'http'
    end

    def getElementsByTagName(*)
      [self]
    end

    # The "ready" callback is called right away.
    def ready(f)
      f.call
    end
  end

  class JQuery

  end

  # Builds the context, defines the javascript side stubs and then
  # evaluates +source+ in it.
  def initialize(source)
    @context = V8::Context.new do |ctx|
      ctx[:document] = DOM.new
      ctx[:window] = DOM.new
      ctx[:jQuery] = JQuery.new
      ctx['setInterval'] = ctx[:window].method(:setInterval)
      ctx['setTimeout'] = ctx[:window].method(:setTimeout)
    end

    @context.eval(%{
      $ = function(){
        return document;
      };

      $.cookie = function(){};
      $.each = function(){};

      // hash method cannot be defined on ruby side...
      window.location.hash = window;
    })

    @context.eval(source)
  end

  # Evaluates +str+ in the same context and returns the result.
  def eval(str)
    @context.eval(str)
  end

end
74
+ end
@@ -0,0 +1,35 @@
1
+ require 'logger'
2
+
3
+ module GetThemAll
4
# Logger writing each message as "[LEVEL -- YYYY/MM/DD HH:MM:SS] message".
class SiteDownloaderLogger < Logger
  # The program name is accepted but not part of the output.
  def format_message(level, time, progname, msg)
    stamp = time.strftime("%Y/%m/%d %H:%M:%S")
    format("[%s -- %s] %s\n", level, stamp, msg)
  end
end
9
+
10
class SiteDownloader
  cattr_accessor :logger

  # by default only errors are logged, to standard output
  self.logger = SiteDownloaderLogger.new(STDOUT).tap do |default_logger|
    default_logger.level = Logger::ERROR
  end
end
16
+ end
17
+
18
+ # helpers
19
# Logs +msg+ at debug level on the SiteDownloader logger
# (+short_msg+ is accepted but not used).
def debug(msg, short_msg = nil)
  shared_logger = GetThemAll::SiteDownloader.logger
  shared_logger.debug(msg)
end
22
+
23
# Logs +msg+ at info level on the SiteDownloader logger.
def info(msg)
  shared_logger = GetThemAll::SiteDownloader.logger
  shared_logger.info(msg)
end
26
+
27
# Logs every given message at warn level on the SiteDownloader logger.
#
# Being defined at the top level, this method takes the place of
# Kernel#warn for the whole program, so it has to accept what callers
# of Kernel#warn may pass: any number of messages, optionally followed
# by the uplevel / category options (which are ignored here).
def warn(*msgs)
  options = msgs.last
  msgs.pop if options.is_a?(Hash) && (options.keys - [:uplevel, :category]).empty?

  msgs.each do |msg|
    GetThemAll::SiteDownloader.logger.warn(msg)
  end

  nil
end
30
+
31
# Logs +msg+ at error level on the SiteDownloader logger, then exits
# the program with status 1 when +fatal+ is set.
def error(msg, fatal = false)
  GetThemAll::SiteDownloader.logger.error(msg)
  return unless fatal

  exit(1)
end
35
+
@@ -0,0 +1,7 @@
1
+ module GetThemAll
2
# Mixin giving access to the event publisher.
module Notifier
  # Publishes the event +name+ through ActiveSupport::Notifications,
  # the extra arguments are forwarded untouched.
  def notify(name, *args)
    publisher = ActiveSupport::Notifications
    publisher.publish(name, *args)
  end
end
7
+ end
@@ -0,0 +1,332 @@
1
+
2
+ require 'addressable/uri'
3
+ require 'active_support/hash_with_indifferent_access'
4
+
5
+ require 'em-http-request'
6
+ require 'em-priority-queue'
7
+
8
+ module GetThemAll
9
+
10
+ ##
11
+ # The main class, all your crawlers will derive from this class
12
+ # see examples/standalone.rb file for an example.
13
+ #
14
+ class SiteDownloader
15
+ include Notifier
16
+
17
+ class_attribute :examiners_count, :downloaders_count
18
+ class_attribute :config
19
+
20
+ ##
21
+ # Determine what will be stored in the history file,
22
+ # the default is to store the last url before the download
23
+ # so we can ignore it sooner next time.
24
+ #
25
+ # The other mode is :download, in this mode the download
26
+ # url itself will be stored, it is meant for special cases as
27
+ # the default should work better most of the time.
28
+ #
29
+ class_attribute :history_tracking
30
+
31
+ self.examiners_count = 1
32
+ self.downloaders_count = 1
33
+
34
+ self.history_tracking = :default
35
+
36
+ attr_reader :base_url, :storage, :history
37
+
38
##
# Create the crawler. Nothing is fetched here: the crawl itself
# is run by #start.
#
# @param [Hash] args arguments
# @option args [String] :base_url The root url, every other
#   url will be relative to this.
# @option args [String] :start_url What is the very first url
#   to examine (level 0) relative to base_url, default is "/"
# @option args [String] :folder_name The root path where
#   downloaded files will be saved (appended to the storage root).
# @option args [Array] :extensions Array of Extension object.
#
# @option args [Hash] :storage Configure storage backend
#   :type is the backend name
#   :params is a hash with backend specific options
#
# @raise [RuntimeError] if the storage configuration is missing or
#   incomplete, if the storage type is unknown, or if an unknown
#   option was passed.
#
def initialize(args)
  # options are consumed with delete below: work on a copy so that
  # the caller's hash is left untouched
  args = args.dup

  @cookies = []
  @connections = {}
  @examine_queue = EM::PriorityQueue.new
  @download_queue = EM::PriorityQueue.new
  @history = History.new

  @base_url = args.delete(:base_url)
  @start_url = args.delete(:start_url) || '/'
  @folder_name = args.delete(:folder_name)

  # keep a pointer to each extension
  @extensions = args.delete(:extensions) || [ActionLogger]

  storage_options = args.delete(:storage)
  raise "storage required" unless storage_options
  raise "storage type required" unless storage_options[:type]
  raise "storage params required" unless storage_options[:params]

  # the backend class is GetThemAll::<Type>Storage; an unknown type is
  # reported with a readable message instead of a bare NameError
  storage_name = "#{storage_options[:type].to_s.camelize}Storage"
  begin
    storage_class = GetThemAll.const_get(storage_name)
  rescue NameError
    raise "unknown storage: #{storage_name}"
  end

  storage_params = ActiveSupport::HashWithIndifferentAccess.new( storage_options[:params] )
  @storage = storage_class.new(storage_params.merge(:folder_name => @folder_name))

  @parsed_base_url = Addressable::URI.parse(@base_url)

  # start_url is relative to base_url
  @start_url = File.join(@base_url, @start_url)

  # if any unknown option was passed, do not silently walk away
  # tell the user !
  unless args.empty?
    raise "unknown parameters: #{args.inspect}"
  end
end
96
+
97
##
# Start the crawler: load the history, run the event loop until
# nothing is left to do, then save the history.
#
# The 'downloader.started' event is published before the event loop
# starts, 'downloader.completed' once it has stopped.
#
# NOTE(review): no block is accepted or yielded by this method.
#
def start
  load_history

  notify('downloader.started', self)

  EM.run do
    # every 5 seconds, stop the loop if there is no open connection
    # and the storage is done working
    EM.add_periodic_timer(5) do
      if EM.connection_count == 0 && !@storage.working?
        debug("no connections, exiting")
        EM.stop_event_loop
      end
    end

    # errors raised inside the loop are logged: failed assertions with
    # their message only, anything else with its backtrace
    EM.error_handler do |err|
      if err.is_a?(AssertionFailed)
        error("Assertion failed: #{err.message}")
      else
        error("#{err.class}: #{err.message}")
        err.backtrace.each { |frame| error(frame) }
      end
    end

    # queue the first action to start crawling
    first_action = ExamineAction.new(self,
        :url => @start_url,
        :destination_folder => '/',
        :level => 0
      )
    @examine_queue.push(first_action, 0)

    # now that actions are queued, start handling them
    # start each "worker"
    # dequeuing is priority based, the download actions
    # first and then the higher the level the higher the
    # priority for examine actions, this is done this way
    # to give work to the download workers asap.
    #
    self.class.examiners_count.times do |index|
      Worker.new(:examiner, index, self, @examine_queue)
    end

    self.class.downloaders_count.times do |index|
      Worker.new(:downloader, index, self, @download_queue)
    end
  end

  save_history

  notify('downloader.completed', self)
end
157
+
158
# Error raised by #assert.
class AssertionFailed < RuntimeError
end

# Raises AssertionFailed with +msg+ unless +cond+ is truthy.
def assert(cond, msg = "")
  raise AssertionFailed, msg unless cond
end
165
+
166
##
# Check if two host names are from the same domain: the trailing
# labels of the longer name are compared with the whole shorter name,
# ignoring case.
#
#   same_domain?("toto.com", "sub.toto.com")  # => true
#   same_domain?("toto.com", "other.com")     # => false
#
# @param [String, nil] host1
# @param [String, nil] host2
# @return [Boolean] true if same domain, false otherwise (including
#   when one of the hosts is nil)
#
def same_domain?(host1, host2)
  # an url without host cannot be matched against anything
  return false if host1.nil? || host2.nil?

  # host names are case insensitive
  host1_parts = host1.downcase.split(".")
  host2_parts = host2.downcase.split(".")

  size = [host1_parts.size, host2_parts.size].min

  # an empty host only matches another empty host
  return host1_parts == host2_parts if size.zero?

  host1_parts.last(size) == host2_parts.last(size)
end
188
+
189
+ HISTORY_FILE_PATH = 'history.txt'.freeze
190
+
191
# Reads the history file from the storage into @history when the
# file exists, otherwise only logs a debug message.
def load_history
  unless @storage.exist?(HISTORY_FILE_PATH)
    return debug("History file not found: #{HISTORY_FILE_PATH}")
  end

  stored = @storage.read(HISTORY_FILE_PATH)
  @history.load(stored)
end
200
+
201
# Writes the dumped history to the history file of the storage.
def save_history
  snapshot = @history.dump
  @storage.write(HISTORY_FILE_PATH, snapshot)
end
204
+
205
+
206
+
207
+
208
##
# Plugin API: asynchronously request +url+.
#
# The returned deferrable succeeds with the request object on a 200
# response and fails on any other final status or on a connection
# error. 301 and 302 responses are followed, reusing the same
# deferrable. On success the block, if any, is called with the request
# and, when it takes two arguments, the Hpricot document of the response.
#
# Cookies received from pages of the base domain are remembered and
# sent with the following requests.
#
# @param [String] url absolute url, or url relative to the base url
# @param [String] method http verb
# @param [Hash, String, nil] params request body, sent with every
#   verb but GET
# @param [String, nil] referer defaults to the base url
# @param [EM::Deferrable, nil] deferrable used when following
#   redirections, a new one is created when nil
# @return [EM::Deferrable]
#
def open_url(url, method = "GET", params = nil, referer = nil, deferrable = nil, &block)
  deferrable ||= EM::DefaultDeferrable.new
  referer ||= @base_url

  url = Addressable::URI.parse( (url[0...4] == "http") ? url : URI.join(@base_url, url) )

  external = !same_domain?(@parsed_base_url.host, url.host)

  if external
    debug("Opening external page: #{url}")
  else
    debug("#{method.upcase} #{url}")
  end

  # a new connection is opened for every request, the last one opened
  # for each host is kept in @connections
  # NOTE(review): the scheme is always "http", whatever the url scheme is
  host_key = "#{url.host}:#{url.port}"
  http = EM::HttpRequest.new("http://#{url.host}:#{url.port}")
  @connections[host_key] = http

  verb = method.downcase.to_sym
  request_options = {
    :path   => url.path,
    :query  => url.query,
    :head   => {
      :cookie     => @cookies,
      :referer    => referer,
      'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1'
    }
  }

  # params used to be accepted but never sent
  request_options[:body] = params if params && verb != :get

  req = http.setup_request(verb, request_options)

  req.callback do
    case req.response_header.status
    when 200
      # handle cookies, only those set by the crawled site itself
      unless external
        # the cookies added by the response are read from the second entry
        # of req.cookies, each one looks like "PHPSESSID=abc; path=/"
        added_cookies = Array(req.cookies[1])

        added_cookies.each do |str|
          @cookies << str.split(';').first
        end

        # remove duplicates
        @cookies.uniq!
      end

      deferrable.set_deferred_status(:succeeded, req)
      if block
        if block.arity == 2
          doc = Hpricot(req.response)
          block.call(req, doc)
        else
          block.call(req)
        end
      end

    # em-http-request does not handle redirection between hosts
    # so handle them ourselves
    # NOTE(review): the number of redirections followed is not bounded
    when 301, 302
      location = req.response_header.location
      if location
        debug("Following redirection: #{location}")
        # reuse the same deferrable object
        open_url(location, method, params, referer, deferrable, &block)
      else
        # nothing to follow: fail now, otherwise the deferrable would
        # never be resolved
        deferrable.set_deferred_status(:failed, req.response_header.http_reason)
      end

    else
      puts "#{method} #{url} => Status: #{req.response_header.status}"
      deferrable.set_deferred_status(:failed, req.response_header.http_reason)
    end
  end

  req.errback do
    deferrable.set_deferred_status(:failed, -1)
  end

  deferrable
end
315
+
316
# Builds a JavascriptLoader from +data+ and returns it.
def eval_javascript(data)
  loader = JavascriptLoader.new(data)
  loader
end
319
+
320
+
321
+
322
# To be redefined in subclasses: this implementation always raises
# a RuntimeError naming the class missing the method.
def examine_page(doc, level)
  message = "Need to implement examine_page in #{self.class}"
  raise message
end
326
+
327
# Destination path of the file of +action+: the path of its uri
# appended to its destination folder.
def get_file_destpath_from_action(action)
  File.join(action.destination_folder, action.uri.path)
end
331
+ end
332
+ end