get_them_all 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+
2
+ module GetThemAll
3
+ ##
4
+ # This extension will show you a progress bar for each worker
5
+ # showing its current state:
6
+ # ~ = working
7
+ # D = url downloaded
8
+ # E = url examined
9
+ # . = url skipped
10
+ # x = url download failed
11
+ #
12
+ # The extension knows how to handle the terminal width nicely: if
13
+ # it reaches the right end of the terminal the line will scroll
14
+ # (characters will disappear on the left while new appear on the right)
15
+ #
16
+ class GaugeDisplay < Extension
17
# Thin wrappers that print ANSI escape sequences to drive the terminal
# cursor. Every method writes to standard output through Kernel#print.
module Cursor
  # Width used when `tput cols` gives no usable answer.
  DEFAULT_WIDTH = 80

  class << self
    # Relative moves; nothing is printed when n is zero or negative.
    def up(n); print "\e[#{n}A" if n > 0; end
    def down(n); print "\e[#{n}B" if n > 0; end
    def right(n); print "\e[#{n}C" if n > 0; end
    def left(n); print "\e[#{n}D" if n > 0; end

    # Jump to column n of the current line.
    def col(n); print "\e[#{n}G"; end
    # Erase from the cursor to the end of the line.
    def clear_line; print "\e[0K"; end

    # save / restore the cursor position
    def save; print "\e[s"; end
    def restore; print "\e[u"; end

    # hide / show the cursor
    def hide_cursor; print "\e[?25l"; end
    def show_cursor; print "\e[?25h"; end

    # @return [Integer] number of columns reported by `tput cols`, or
    #   DEFAULT_WIDTH when tput is missing or prints nothing usable
    #   (previously this returned 0 or raised in those cases).
    def screen_width
      cols = `tput cols`.strip.to_i
      cols > 0 ? cols : DEFAULT_WIDTH
    rescue SystemCallError
      DEFAULT_WIDTH
    end
  end
end
40
+
41
# Subscribes this extension to the downloader start event and to the
# started/success/failure/skipped events of both action kinds.
def initialize
  register_handler('downloader.started', &method(:downloader_started))

  callbacks = [
    ['started', :work_started],
    ['success', :work_completed],
    ['failure', :work_failed],
    ['skipped', :work_skipped]
  ]

  # examine actions first, then download actions
  %w[examine download].each do |kind|
    callbacks.each do |suffix, handler|
      register_handler("action.#{kind}.#{suffix}", &method(handler))
    end
  end
end
55
+
56
+
57
# Handler for 'downloader.started': remembers the worker counts, saves
# the cursor position and prints one labelled line per worker.
#
# @param event_name [String] unused
# @param downloader [#examiners_count, #downloaders_count]
def downloader_started(event_name, downloader)
  @examiners = downloader.examiners_count
  @downloaders = downloader.downloaders_count

  # the saved position is the anchor every later redraw returns to
  Cursor.save

  # one mutable string per worker line, mirroring what is on screen
  @state = []
  0.upto(@examiners + @downloaders - 1) do |row|
    puts "#{row} "
    @state[row] = "#{row} "
  end
end
72
+
73
# Handler for the *.started events: prints a "~" right after the text
# recorded for the worker's line.
def work_started(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)

  # go to the worker's line, then just past its recorded content
  Cursor.down(row)
  Cursor.right(@state[row].size)

  print "~"
end
85
+
86
# Appends a marker to a worker's line and redraws that line.
#
# When the line is wider than the terminal, only the "<index> " header
# and the most recent markers are kept. The trimmed text is stored back
# into @state so the recorded line never grows past the screen width
# (the original kept the full history and only trimmed a local copy).
#
# @param line [Integer] index into @state
# @param added_str [String] marker to append ("E", "D", "x", ".")
def update_line(line, added_str)
  state = @state[line]
  state << added_str

  # query the terminal once per redraw
  width = Cursor.screen_width

  if width > 0 && state.size > width
    # header is "<index> "; fall back to the first two characters
    header = state[/\A\d+ /] || state[0, 2]
    keep = width - header.size
    if keep > 0
      state = header + state[-keep..-1]
      @state[line] = state
    end
  end

  # move to column 0 and erase the line before printing it again
  Cursor.col(0)
  Cursor.clear_line

  print state
end
105
+
106
+
107
# Handler for the *.success events: appends "E" for an examiner worker
# and "D" for any other worker type.
def work_completed(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)
  Cursor.down(row)

  marker = (worker.type == :examiner) ? "E" : "D"
  update_line(row, marker)
end
120
+
121
+
122
# Handler for the *.failure events: appends "x" to the worker's line.
def work_failed(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)

  # update_line handles the column; only the line is selected here
  Cursor.down(row)
  update_line(row, "x")
end
134
+
135
+
136
# Handler for the *.skipped events: appends "." to the worker's line.
def work_skipped(event_name, worker, action, *args)
  Cursor.restore
  row = worker_line(worker)

  # update_line handles the column; only the line is selected here
  Cursor.down(row)
  update_line(row, ".")
end
148
+
149
+ private
150
# Maps a worker to its screen line: examiners come first, the other
# workers follow after @examiners lines.
#
# @param worker [#type, #index]
# @return [Integer]
def worker_line(worker)
  return 0 + worker.index if worker.type == :examiner

  @examiners + worker.index
end
154
+ end
155
+ end
@@ -0,0 +1,138 @@
1
+
2
+ module GetThemAll
3
+ ##
4
+ # This extension will generate a textfile showing
5
+ # the hierarchy of the site which may be considered
6
+ # as a map.
7
+ #
8
+ # Its main purpose is as a debugging tool to have a
9
+ # better view of what was going on inside.
10
+ #
11
+ # It was also capable of generating a dot file but this code
12
+ # is not up to date.
13
+ #
14
+ class GraphBuilder < Extension
15
# One node of the site map: a label (prefix), a text (the url) and an
# ordered list of child nodes.
class TreeNode
  attr_reader :text, :children

  # @param prefix [String] label printed before the text in dumps
  # @param text [String] node identity, matched by #find_node
  def initialize(prefix, text)
    @prefix = prefix
    @text = text
    @children = []
  end

  # Creates a node, appends it to this node's children and returns it.
  def add_child(prefix, text)
    child = TreeNode.new(prefix, text)
    @children << child
    child
  end

  # Depth-first search for the first node whose text equals +text+.
  # Stops at the first match instead of visiting every subtree.
  #
  # @return [TreeNode, nil]
  def find_node(text)
    return self if @text == text

    @children.each do |child|
      found = child.find_node(text)
      return found if found
    end
    nil
  end

  def inspect
    "{#{@text}}"
  end

  # Writes the indented text dump of the whole tree to +path+.
  def dump_to_file(path)
    File.open(path, 'w') do |f|
      f.write( dump_node(0) )
    end
  end

  # One line per node: "<level-1> <indent><prefix> <text>".
  def dump_node(level)
    ret = "#{level-1} #{' ' * level}#{@prefix} #{@text}\n"
    @children.each do |node|
      ret << node.dump_node(level + 1)
    end
    ret
  end
  protected :dump_node

  # Writes the tree as a Graphviz digraph to +path+.
  def dump_to_dot_file(path)
    File.open(path, 'w') do |f|
      f.write(<<-EOF)
        digraph toto {
          node [shape=box];
          #{dump_node_dot()}
        }
      EOF
    end
  end

  # Returns the dot statements for this subtree: the root as a bare
  # quoted node, every other node as an edge from its parent.
  #
  # @param prefix [String, nil] quoted text of the parent, nil for root
  def dump_node_dot(prefix = nil)
    if prefix.nil?
      prefix = "\"#{@text}\""
    else
      prefix = " #{prefix} -> \"#{@text}\""
    end

    ret = [prefix]

    @children.each do |node|
      # fixed: this used to call dump_node, which expects an Integer level
      ret << node.dump_node_dot("\"#{@text}\"")
    end

    ret.join("\n")
  end
  protected :dump_node_dot
end
92
+
93
+
94
+
95
+
96
+ ##
97
+ # @param [String] path where the resulting file will be written.
98
+ #
99
# Stores the output path and subscribes to the events used to build
# and finally dump the tree.
def initialize(path)
  @path = path

  # the tree is rooted at the downloader's base url
  register_handler('downloader.started') do |_name, crawler|
    @graph = TreeNode.new("", crawler.base_url)
  end

  register_handler('downloader.completed') do |_name, _worker, _downloader|
    @graph.dump_to_file(@path)
    # @graph.dump_to_dot_file(@path)
  end

  register_handler('action.download.success') do |_name, _worker, action|
    add_to_graph("D", action.url, action.parent_url)
  end

  register_handler('action.examine.success') do |_name, _worker, action, returned_actions|
    label = "E[ret:#{returned_actions.size}]"
    add_to_graph(label, action.url, action.parent_url)
  end

  # NOTE(review): this handler reads action.referer while the success
  # handlers read action.parent_url -- confirm both exist on the action.
  register_handler('action.examine.failure') do |_name, _worker, action, error_status|
    add_to_graph(error_status, action.url, action.referer)
  end
end
124
+
125
+ private
126
+
127
# Adds a node for +url+ under the node whose text is +referer+, or
# directly under the root when +referer+ is nil.
#
# @raise [RuntimeError] when no node matches +referer+
def add_to_graph(prefix, url, referer)
  return @graph.add_child(prefix, url) if referer.nil?

  parent = @graph.find_node(referer)
  fail("node not found: #{referer}") unless parent
  parent.add_child(prefix, url)
end
136
+
137
+ end
138
+ end
@@ -0,0 +1,23 @@
1
+ module GetThemAll
2
# Ordered list of lines with a newline-separated text serialization.
class History
  # @param data [Array<String>] initial entries
  def initialize(data = [])
    @data = data
  end

  # Replaces the entries with the lines of +data+.
  def load(data)
    lines = data.split("\n")
    @data = lines
  end

  # @return [String] the entries joined by newlines
  def dump
    @data * "\n"
  end

  # @return [Boolean] whether +line+ is one of the entries
  def include?(line)
    @data.member?(line)
  end

  # Appends +url+ to the entries.
  def add(url)
    @data.push(url)
  end
end
23
+ end
@@ -0,0 +1,74 @@
1
+
2
+ require 'v8'
3
+
4
+ module GetThemAll
5
# Evaluates a script inside a V8 context populated with stub
# document/window/jQuery objects.
class JavascriptLoader

  ##
  # Looks like a DOM from the outside: most calls return the receiver
  # itself or nothing.
  class DOM
    # zero-argument accessors
    def search
      self
    end

    def protocol
      'http'
    end

    def location
      self
    end

    def window
      self
    end

    def substr(*)
      ""
    end

    def getElementsByTagName(*)
      [self]
    end

    # Invokes the given callable immediately.
    def ready(f)
      f.call()
    end

    # calls that accept anything and return the receiver
    [:appendChild, :createElement, :attr, :text, :animate,
     :empty, :hide, :show, :focus].each do |chainable|
      define_method(chainable) { |*| self }
    end

    # calls that accept anything and return nil
    [:click, :change, :keydown, :keyup, :setInterval, :setTimeout].each do |noop|
      define_method(noop) { |*| }
    end
  end

  # Empty placeholder exposed to scripts as jQuery.
  class JQuery
  end

  # Builds the context, installs the stubs and evaluates +source+.
  #
  # NOTE(review): +source+ is evaluated as-is; presumably it comes from
  # crawled pages -- confirm how far the V8 context can be trusted.
  #
  # @param source [String] javascript code
  def initialize(source)
    @context = V8::Context.new do |js|
      js[:document] = DOM.new()
      js[:window] = DOM.new()
      js[:jQuery] = JQuery.new
      js['setInterval'] = js[:window].method(:setInterval)
      js['setTimeout'] = js[:window].method(:setTimeout)
    end

    @context.eval(%{
      $ = function(){
        return document;
      };

      $.cookie = function(){};
      $.each = function(){};

      // hash method cannot be defined on ruby side...
      window.location.hash = window;
    })

    @context.eval(source)
  end

  # Evaluates +str+ in the same context and returns the result.
  def eval(str)
    @context.eval(str)
  end

end
74
+ end
@@ -0,0 +1,35 @@
1
+ require 'logger'
2
+
3
+ module GetThemAll
4
# Logger whose lines look like "[LEVEL -- YYYY/MM/DD HH:MM:SS] message".
class SiteDownloaderLogger < Logger
  # progname is accepted but not part of the output.
  def format_message(level, time, progname, msg)
    stamp = time.strftime("%Y/%m/%d %H:%M:%S")
    "[" + level.to_s + " -- " + stamp + "] " + msg.to_s + "\n"
  end
end
9
+
10
class SiteDownloader
  # class-level logger shared by the helpers below
  cattr_accessor :logger

  # log to standard output, errors only
  self.logger = SiteDownloaderLogger.new(STDOUT)
  logger.level = Logger::ERROR
end
16
+ end
17
+
18
+ # helpers
19
# Logs +msg+ at DEBUG level on the shared crawler logger.
# +short_msg+ is accepted and ignored.
def debug(msg, short_msg = nil)
  GetThemAll::SiteDownloader.logger.debug(msg)
end

# Logs +msg+ at INFO level on the shared crawler logger.
def info(msg)
  GetThemAll::SiteDownloader.logger.info(msg)
end

# Logs each message at WARN level on the shared crawler logger.
#
# Defined at top level, this method replaces Kernel#warn for every
# object, so it accepts what Kernel#warn accepts: any number of
# messages (or arrays of them) and a trailing options hash such as
# `uplevel: 1`, which is ignored. The one-argument form behaves as
# before.
def warn(*msgs)
  msgs.pop if msgs.last.is_a?(Hash)
  msgs.flatten.inject(nil) do |_, msg|
    GetThemAll::SiteDownloader.logger.warn(msg)
  end
end

# Logs +msg+ at ERROR level; exits the process with status 1 when
# +fatal+ is true.
def error(msg, fatal = false)
  GetThemAll::SiteDownloader.logger.error(msg)
  exit(1) if fatal
end
35
+
@@ -0,0 +1,7 @@
1
+ module GetThemAll
2
# Mixin that publishes events through ActiveSupport::Notifications.
module Notifier
  # @param name [String] event name
  # @param args extra values passed on to the subscribers
  def notify(name, *args)
    bus = ActiveSupport::Notifications
    bus.publish(name, *args)
  end
end
7
+ end
@@ -0,0 +1,332 @@
1
+
2
+ require 'addressable/uri'
3
+ require 'active_support/hash_with_indifferent_access'
4
+
5
+ require 'em-http-request'
6
+ require 'em-priority-queue'
7
+
8
+ module GetThemAll
9
+
10
+ ##
11
+ # The main class, all your crawlers will derive from this class
12
+ # see examples/standalone.rb file for an example.
13
+ #
14
+ class SiteDownloader
15
+ include Notifier
16
+
17
+ class_attribute :examiners_count, :downloaders_count
18
+ class_attribute :config
19
+
20
+ ##
21
+ # Determine what will be stored in the history file,
22
+ # the default is to store the last url before the download
23
+ # so we can ignore it sooner next time.
24
+ #
25
+ # The other mode is :download, in this mode the download
26
+ # url itself will be stored, it is meant for special cases as
27
+ # the default should work better most of the time.
28
+ #
29
+ class_attribute :history_tracking
30
+
31
+ self.examiners_count = 1
32
+ self.downloaders_count = 1
33
+
34
+ self.history_tracking = :default
35
+
36
+ attr_reader :base_url, :storage, :history
37
+
38
+ ##
39
+ # Create and start the crawler.
40
+ #
41
+ # @param [Hash] args arguments
42
+ # @option args [String] :base_url The root url, every other
43
+ # url will be relative to this.
44
+ # @option args [String] :start_url What is the very first url
45
+ # to examine (level 0) relative to base_url, default is "/"
46
+ # @option args [String] :folder_name The root path where
47
+ # downloaded files will be saved (appended to the storage root).
48
+ # @option args [Array] :extensions Array of Extension object.
49
+ #
50
+ # @option args [Hash] :storage Configure storage backend
51
+ # :type is the backend name
52
+ # :params is a hash with backend specific options
53
+ #
54
# Builds the crawler state from the option hash; see the option list
# documented above. Raises RuntimeError when the storage configuration
# is incomplete, when the storage class cannot be resolved, or when an
# unknown option is left over.
def initialize(args)
  # consume a copy so the caller's hash is left untouched
  args = args.dup

  @cookies = []
  @connections = {}
  @examine_queue = EM::PriorityQueue.new
  @download_queue = EM::PriorityQueue.new
  @history = History.new

  @base_url = args.delete(:base_url)
  @start_url = args.delete(:start_url) || '/'
  @folder_name = args.delete(:folder_name)

  # keep a pointer to each extension
  @extensions = args.delete(:extensions) || [ActionLogger]

  storage_options = args.delete(:storage)
  raise "storage required" unless storage_options
  raise "storage type required" unless storage_options[:type]
  raise "storage params required" unless storage_options[:params]

  # "<Type>Storage" is looked up inside the GetThemAll namespace. The
  # previous `defined?` test on a local variable was always true, so a
  # wrong type surfaced as a bare NameError instead of this message.
  storage_class_name = "#{storage_options[:type].camelize}Storage"
  begin
    storage_class = GetThemAll.const_get(storage_class_name)
  rescue NameError
    raise "unknown storage: #{storage_class_name}"
  end

  storage_params = ActiveSupport::HashWithIndifferentAccess.new( storage_options[:params] )
  @storage = storage_class.new(storage_params.merge(:folder_name => @folder_name))

  @parsed_base_url = Addressable::URI.parse(@base_url)

  # start_url is relative to base_url
  @start_url = File.join(@base_url, @start_url)

  # if any unknown option was passed, do not silently walk away
  # tell the user !
  unless args.empty?
    raise "unknown parameters: #{args.inspect}"
  end
end
96
+
97
+ ##
98
+ # Start the crawler, if you pass a block it
99
+ # will be called after the engine is initialized, you can
100
+ # queue the level 0 urls here and handle authenticating if needed.
101
+ #
102
# Loads the history, runs the EventMachine loop until the periodic
# check stops it, then saves the history. Publishes
# 'downloader.started' before the loop and 'downloader.completed'
# after it.
#
# NOTE(review): no block is yielded to anywhere in this method.
def start
  load_history()

  notify('downloader.started', self)

  EM::run do
    # every 5 seconds: stop once there is no open connection and the
    # storage reports it is not working
    EM::add_periodic_timer(5) do
      if (EM::connection_count() == 0) && !@storage.working?
        debug("no connections, exiting")
        EM::stop_event_loop()
      end
    end

    EM::error_handler do |err|
      if err.is_a?(AssertionFailed)
        error("Assertion failed: #{err.message}")
      else
        error("#{err.class}: #{err.message}")
        # backtrace is nil for an exception that was never raised
        Array(err.backtrace).each do |line|
          error(line)
        end
      end
    end

    # queue the first action to start crawling
    first_action = ExamineAction.new(self,
      :url => @start_url,
      :destination_folder => '/',
      :level => 0
    )
    @examine_queue.push(first_action, 0)

    # now that an action is queued, start one worker per configured
    # slot; worker indexes start at 0
    self.class.examiners_count.times do |index|
      Worker.new(:examiner, index, self, @examine_queue)
    end

    self.class.downloaders_count.times do |index|
      Worker.new(:downloader, index, self, @download_queue)
    end
  end

  save_history()

  notify('downloader.completed', self)
end
157
+
158
# Raised by #assert when its condition does not hold.
class AssertionFailed < RuntimeError; end

# @param cond condition expected to be truthy
# @param msg [String] message carried by the exception
# @raise [AssertionFailed] when +cond+ is false or nil
def assert(cond, msg = "")
  raise AssertionFailed, msg unless cond
end
165
+
166
+ # toto.com
167
+ # sub.toto.com
168
+
169
+ ##
170
+ # Check if two urls are from the same domain.
171
+ #
172
+ # @return [Boolean] true if same domain
173
+ #
174
# Compares two host names on their trailing labels: the shorter host
# decides how many labels are compared, so "toto.com" and
# "sub.toto.com" match. The comparison ignores case, and a nil or
# empty host never matches (a nil host used to raise NoMethodError).
#
# @param host1 [String, nil]
# @param host2 [String, nil]
# @return [Boolean] true if same domain
def same_domain?(host1, host2)
  return false if host1.nil? || host2.nil?

  labels1 = host1.downcase.split(".")
  labels2 = host2.downcase.split(".")

  shared = [labels1.size, labels2.size].min
  return false if shared.zero?

  labels1.last(shared) == labels2.last(shared)
end
188
+
189
+ HISTORY_FILE_PATH = 'history.txt'.freeze
190
+
191
+ # load already downloaded pictures from disk
192
# Reads HISTORY_FILE_PATH from the storage into @history when the file
# exists; otherwise only emits a debug message.
def load_history
  return debug("History file not found: #{HISTORY_FILE_PATH}") unless @storage.exist?(HISTORY_FILE_PATH)

  saved = @storage.read(HISTORY_FILE_PATH)
  @history.load(saved)
end
200
+
201
# Writes the dumped history to HISTORY_FILE_PATH through the storage.
def save_history
  snapshot = @history.dump
  @storage.write(HISTORY_FILE_PATH, snapshot)
end
204
+
205
+
206
+
207
+
208
+ # Plugin API
209
# Plugin API: issues an HTTP request and reports the outcome through a
# deferrable.
#
# On status 200 the deferrable succeeds with the request object and the
# optional block is called, with (req) or, when the block takes two
# arguments, with (req, Hpricot document). Status 301/302 is followed
# by calling open_url again with the same deferrable. Any other status,
# a redirect without Location, or a connection error fails the
# deferrable.
#
# @param url [String] absolute url, or a path joined onto @base_url
# @param method [String] HTTP verb
# @param params unused here, only passed along when following redirects
# @param referer [String, nil] defaults to @base_url
# @param deferrable [EM::Deferrable, nil] reused across redirects
# @return [EM::Deferrable]
def open_url(url, method = "GET", params = nil, referer = nil, deferrable = nil, &block)
  deferrable ||= EM::DefaultDeferrable.new
  referer ||= @base_url

  url = Addressable::URI.parse( (url[0...4] == "http") ? url : URI.join(@base_url, url) )

  external = !same_domain?(@parsed_base_url.host, url.host)

  if external
    debug("Opening external page: #{url}")
  else
    debug("#{method.upcase} #{url}")
  end

  # A new connection is opened for every request; the last one per
  # host:port is remembered in @connections (the reuse branch was dead
  # code behind `if false`).
  # NOTE(review): the scheme is always "http://", even for https urls.
  host_key = "#{url.host}:#{url.port}"
  http = EM::HttpRequest.new("http://#{url.host}:#{url.port}")
  @connections[host_key] = http

  req = http.setup_request(method.downcase.to_sym,
    :path => url.path,
    :query => url.query,
    :head => {
      :cookie => @cookies,
      :referer => referer,
      'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1'
    }
  )

  req.callback do
    case req.response_header.status
    when 200
      # remember cookies set by the crawled site only
      unless external
        # keep the "name=value" part of each cookie string
        Array(req.cookies[1]).each do |str|
          @cookies << str.split(';').first
        end

        # remove duplicates
        @cookies.uniq!
      end

      deferrable.set_deferred_status(:succeeded, req)
      if block
        if block.arity == 2
          doc = Hpricot(req.response)
          block.call(req, doc)
        else
          block.call(req)
        end
      end

    # em-http-request does not handle redirection between hosts
    # so handle them ourselves
    when 301, 302
      location = req.response_header.location
      if location
        debug("Following redirection: #{location}")
        # reuse the same deferrable object
        open_url(location, method, params, referer, deferrable, &block)
      else
        # without a Location there is nothing to follow; fail instead
        # of leaving the deferrable pending forever
        deferrable.set_deferred_status(:failed, req.response_header.http_reason)
      end

    else
      puts "#{method} #{url} => Status: #{req.response_header.status}"
      deferrable.set_deferred_status(:failed, req.response_header.http_reason)
    end
  end

  req.errback do
    deferrable.set_deferred_status(:failed, -1)
  end

  deferrable
end
315
+
316
# Wraps +data+ in a new JavascriptLoader and returns the loader.
#
# @param data [String] javascript source handed to the loader
def eval_javascript(data)
  loader = JavascriptLoader.new(data)
  loader
end
319
+
320
+
321
+
322
+ # to be redefined in subclasses
323
# Placeholder: always raises, naming the class that lacks an override.
#
# @raise [RuntimeError]
def examine_page(doc, level)
  raise RuntimeError, "Need to implement examine_page in #{self.class}"
end
326
+
327
# Joins the action's destination folder with the path of its uri.
#
# @param action [#uri, #destination_folder]
# @return [String]
def get_file_destpath_from_action(action)
  remote_path = action.uri.path
  File.join(action.destination_folder, remote_path)
end
331
+ end
332
+ end