ronin-web 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,511 @@
1
+ #
2
+ #--
3
+ # Ronin Web - A Ruby library for Ronin that provides support for web
4
+ # scraping and spidering functionality.
5
+ #
6
+ # Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
7
+ #
8
+ # This program is free software; you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program; if not, write to the Free Software
20
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #++
22
+ #
23
+
24
+ require 'uri'
25
+ require 'cgi'
26
+
27
# Prefer Mongrel when it is installed, otherwise fall back to WEBrick.
# A failed require raises LoadError, which is a ScriptError and is not
# caught by a bare rescue, so it has to be named explicitly.
begin
  require 'mongrel'
rescue LoadError
  require 'webrick'
end
32
+
33
+ require 'rack'
34
+
35
+ module Ronin
36
+ module Web
37
class Server

  # Default interface to run the Web Server on
  HOST = '0.0.0.0'

  # Default port to run the Web Server on
  PORT = 8080

  # Directory index files
  INDICES = ['index.htm', 'index.html']

  # The host to bind to
  attr_accessor :host

  # The port to listen on
  attr_accessor :port

  # The Hash of configurable variables for the server
  attr_reader :config

  #
  # Creates a new Web Server using the given configuration _block_.
  # The _block_ is evaluated in the context of the new server, so the
  # routing methods (bind, map, file, mount, host, ...) can be called
  # directly inside of it.
  #
  # _options_ may contain the following keys:
  # <tt>:host</tt>:: The host to bind to.
  # <tt>:port</tt>:: The port to listen on.
  # <tt>:config</tt>:: A +Hash+ of configurable variables to be used
  #                    in responses.
  #
  def initialize(options={},&block)
    @host = options[:host]
    @port = options[:port]
    @config = {}

    # a :config key that is present but nil is treated as "no config"
    @config.merge!(options[:config]) if options[:config]

    # requests that match no route end up here
    @default = method(:not_found)

    @virtual_host_patterns = {}
    @virtual_hosts = {}

    @path_patterns = {}
    @paths = {}
    @directories = {}

    instance_eval(&block) if block
  end

  #
  # Returns the default host that the Web Server will be run on.
  #
  def self.default_host
    @@default_host ||= HOST
  end

  #
  # Sets the default host that the Web Server will run on to the
  # specified _host_.
  #
  def self.default_host=(host)
    @@default_host = host
  end

  #
  # Returns the default port that the Web Server will run on.
  #
  def self.default_port
    @@default_port ||= PORT
  end

  #
  # Sets the default port the Web Server will run on to the specified
  # _port_.
  #
  def self.default_port=(port)
    @@default_port = port
  end

  #
  # The Hash of the servers supported file extensions and their HTTP
  # Content-Types.
  #
  def self.content_types
    @@content_types ||= {}
  end

  #
  # Registers a new content _type_ for the specified file _extensions_.
  # Returns the Server class.
  #
  #   Server.content_type 'text/xml', ['xml', 'xsl']
  #
  def self.content_type(type,extensions)
    extensions.each { |ext| Server.content_types[ext] = type }

    return self
  end

  #
  # Runs the specified _server_ with the given _options_. Server.run
  # will use Mongrel to run the _server_, if it is installed. Otherwise
  # WEBrick will be used to run the _server_.
  #
  # _options_ can contain the following keys:
  # <tt>:host</tt>:: The host the server will bind to, defaults to
  #                  Server.default_host.
  # <tt>:port</tt>:: The port the server will listen on, defaults to
  #                  Server.default_port.
  #
  def self.run(server,options={})
    rack_options = {
      :Host => (options[:host] || Server.default_host),
      :Port => (options[:port] || Server.default_port)
    }

    if Object.const_defined?('Mongrel')
      Rack::Handler::Mongrel.run(server,rack_options)
    else
      Rack::Handler::WEBrick.run(server,rack_options)
    end
  end

  #
  # Creates a new Web Server object with the given _block_ and starts
  # it using the given _options_.
  #
  def self.start(options={},&block)
    self.new(options,&block).start
  end

  #
  # Returns the HTTP Content-Type for the specified file _extension_.
  # Unregistered extensions map to
  # <tt>application/x-unknown-content-type</tt>.
  #
  #   content_type('html')
  #   # => "text/html"
  #
  def content_type(extension)
    Server.content_types[extension] || 'application/x-unknown-content-type'
  end

  #
  # Returns the HTTP Content-Type for the specified _file_. The
  # extension is matched case-insensitively.
  #
  #   srv.content_type_for('file.html')
  #   # => "text/html"
  #
  def content_type_for(file)
    ext = File.extname(file).downcase

    # strip the leading '.'; yields nil when there is no extension
    return content_type(ext[1..-1])
  end

  #
  # Returns the index file contained within the _path_ of the specified
  # directory. If no index file can be found, +nil+ will be returned.
  #
  def index_of(path)
    path = File.expand_path(path)

    candidates = INDICES.map { |name| File.join(path,name) }

    return candidates.find { |index| File.file?(index) }
  end

  #
  # Returns the HTTP 404 Not Found message for the requested path.
  # The path is HTML escaped before being placed into the page.
  #
  def not_found(env)
    path = env['PATH_INFO'].to_s
    body = [
      '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">',
      '<html>',
      '<head>',
      '<title>404 Not Found</title>',
      '</head>',
      '<body>',
      '<h1>Not Found</h1>',
      "<p>The requested URL #{CGI.escapeHTML(path)} was not found on this server.</p>",
      '<hr>',
      '</body>',
      '</html>'
    ].join("\n")

    return response(body, :status => 404, :content_type => 'text/html')
  end

  #
  # Returns the contents of the file at the specified _path_. If the
  # _path_ points to a directory, the directory will be searched for
  # an index file. If no index file can be found or _path_ points to a
  # non-existent file, a "404 Not Found" response will be returned.
  #
  def return_file(path,env)
    return not_found(env) unless File.exist?(path)

    if File.directory?(path)
      path = index_of(path)

      return not_found(env) unless path
    end

    return file_response(path,content_type_for(path))
  end

  #
  # Returns a Rack Response object with the specified _body_, the given
  # _options_ and the given _block_. The _options_ Hash is not modified.
  #
  # _options_ may include the following keys:
  # <tt>:status</tt>:: The HTTP Response status code, defaults to 200.
  #
  # Every other key is turned into a HTTP header, so that
  # <tt>:content_type</tt> becomes <tt>Content-Type</tt>.
  #
  #   response("<data>lol</data>", :content_type => 'text/xml')
  #
  def response(body=[],options={},&block)
    # work on a copy so the caller's Hash keeps its :status key
    options = options.dup
    status = (options.delete(:status) || 200)
    headers = {}

    options.each do |name,value|
      header_name = name.to_s.split('_').map { |word|
        word.capitalize
      }.join('-')

      headers[header_name] = value.to_s
    end

    return Rack::Response.new(body,status,headers,&block)
  end

  #
  # Use the specified _block_ as the default route for all other
  # requests.
  #
  #   default do |env|
  #     [200, {'Content-Type' => 'text/html'}, 'lol train']
  #   end
  #
  def default(&block)
    @default = block
    return self
  end

  #
  # Connects the specified _server_ as a virtual host representing the
  # specified host _name_.
  #
  def connect(name,server)
    @virtual_hosts[name.to_s] = server
  end

  #
  # Returns the server that handles requests for the specified host
  # _name_, or +nil+ if no virtual host matches. The _name_ is tried
  # as given and, if it ends in a <tt>:port</tt> suffix (as the HTTP
  # Host header does for non-default ports), without that suffix.
  #
  def virtual_host(name)
    name = name.to_s
    candidates = [name, name.sub(/:\d+\z/,'')].uniq

    candidates.each do |candidate|
      if @virtual_hosts.has_key?(candidate)
        return @virtual_hosts[candidate]
      end
    end

    @virtual_host_patterns.each do |pattern,server|
      return server if candidates.any? { |candidate| candidate.match(pattern) }
    end

    return nil
  end

  #
  # Registers the specified _block_ to be called when receiving
  # requests to host names which match the specified _pattern_.
  # The _block_ configures a new Server that handles those requests.
  #
  #   hosts_like(/^a[0-9]\./) do
  #     map('/download/') do |env|
  #       ...
  #     end
  #   end
  #
  def hosts_like(pattern,&block)
    @virtual_host_patterns[pattern] = self.class.new(&block)
  end

  #
  # Registers the specified _block_ to be called when receiving
  # requests for paths which match the specified _pattern_.
  #
  #   paths_like(/\.xml$/) do |env|
  #     ...
  #   end
  #
  def paths_like(pattern,&block)
    @path_patterns[pattern] = block
    return self
  end

  #
  # Creates a new Server object using the specified _block_ and
  # connects it as a virtual host representing the specified host
  # _name_.
  #
  #   host('cdn.evil.com') do
  #     ...
  #   end
  #
  def host(name,&block)
    connect(name,self.class.new(&block))
  end

  #
  # Binds the specified URL _path_ to the given _block_.
  #
  #   bind '/secrets.xml' do |env|
  #     [200, {'Content-Type' => 'text/xml'}, "Made you look."]
  #   end
  #
  def bind(path,&block)
    @paths[path] = block
    return self
  end

  #
  # Binds the specified URL directory _path_ to the given _block_.
  #
  #   map '/downloads' do |env|
  #     response(
  #       "You're somewhere inside the downloads directory",
  #       :content_type => 'text/xml'
  #     )
  #   end
  #
  def map(path,&block)
    @directories[path] = block
    return self
  end

  #
  # Binds the contents of the specified _file_ to the specified URL
  # _path_, using the given _options_.
  #
  # _options_ may include the following keys:
  # <tt>:content_type</tt>:: The Content-Type to serve the file with,
  #                          defaults to the type registered for the
  #                          extension of _file_.
  #
  #   file '/robots.txt', '/path/to/my_robots.txt'
  #
  def file(path,file,options={})
    file = File.expand_path(file)
    content_type = (options[:content_type] || content_type_for(file))

    bind(path) do |env|
      # checked per request, the file may appear or vanish at any time
      if File.file?(file)
        file_response(file,content_type)
      else
        not_found(env)
      end
    end
  end

  #
  # Mounts the contents of the specified _directory_ to the given
  # prefix _path_.
  #
  #   mount '/download/', '/tmp/files/'
  #
  def mount(path,directory)
    sub_dirs = path.split('/')
    directory = File.expand_path(directory)

    map(path) do |env|
      http_dirs = normalize_path(env['PATH_INFO']).split('/')

      # everything below the mount point, relative to _directory_
      sub_path = (http_dirs[sub_dirs.length..-1] || []).join('/')

      return_file(File.join(directory,sub_path),env)
    end
  end

  #
  # Starts the server.
  #
  def start
    Server.run(self, :host => @host, :port => @port)
    return self
  end

  #
  # The method which receives all requests. Routes are tried in the
  # following order: virtual hosts, exactly bound paths, path
  # patterns, mapped directories and finally the default route.
  #
  def call(env)
    http_host = env['HTTP_HOST']
    http_path = normalize_path(env['PATH_INFO'])

    if http_host && (server = virtual_host(http_host))
      return server.call(env)
    end

    if (handler = @paths[http_path])
      return handler.call(env)
    end

    @path_patterns.each do |pattern,pattern_handler|
      return pattern_handler.call(env) if http_path.match(pattern)
    end

    http_dirs = http_path.split('/')

    # all matching directories are prefixes of one another, so the
    # last one in sort order is the most specific
    sub_dir = @directories.keys.select { |path|
      dirs = path.split('/')

      http_dirs[0...dirs.length] == dirs
    }.sort.last

    if (sub_dir && (handler = @directories[sub_dir]))
      return handler.call(env)
    end

    return @default.call(env)
  end

  #
  # Routes the specified _url_ to the call method.
  #
  def route(url)
    url = URI(url.to_s)

    return call(
      'HTTP_HOST' => url.host,
      'HTTP_PORT' => url.port,
      'SERVER_PORT' => url.port,
      'PATH_INFO' => url.path,
      'QUERY_STRING' => url.query
    )
  end

  #
  # Routes the specified _path_ to the call method. The _path_ is
  # URI unescaped and may carry a query string.
  #
  def route_path(path)
    path, query = unescape_uri(path.to_s).split('?',2)

    return route(URI::HTTP.build(
      :host => @host,
      :port => @port,
      :path => path,
      :query => query
    ))
  end

  content_type 'text/html', ['html', 'htm', 'xhtml']
  content_type 'text/css', ['css']
  content_type 'image/gif', ['gif']
  content_type 'image/jpeg', ['jpeg', 'jpg']
  content_type 'image/png', ['png']
  content_type 'image/x-icon', ['ico']
  content_type 'text/javascript', ['js']
  content_type 'text/xml', ['xml', 'xsl']
  content_type 'application/rss+xml', ['rss']
  content_type 'application/rdf+xml', ['rdf']
  content_type 'application/pdf', ['pdf']
  content_type 'application/msword', ['doc']
  content_type 'application/zip', ['zip']
  content_type 'text/plain', ['txt', 'conf', 'rb', 'py', 'h', 'c', 'hh', 'cc', 'hpp', 'cpp']

  protected

  #
  # Returns the absolute, normalized form of the requested HTTP _path_.
  # The _path_ is anchored at <tt>/</tt> before being expanded, so that
  # a missing, empty or relative path is never resolved against the
  # working directory of the process.
  #
  def normalize_path(path)
    File.expand_path(File.join('/',path.to_s))
  end

  #
  # Returns a response containing the contents of the file at the
  # specified _path_, served with the given _content_type_. The file is
  # read in binary mode and closed before the response is built.
  #
  def file_response(path,content_type)
    contents = File.open(path,'rb') { |io| io.read }

    return response(contents, :content_type => content_type)
  end

  #
  # Returns the specified _string_ with its <tt>%XX</tt> escapes
  # decoded. Stands in for URI.decode, which was removed from Ruby.
  #
  def unescape_uri(string)
    # CGI.unescape also turns '+' into a space, so protect it first
    CGI.unescape(string.gsub('+','%2B'))
  end

end
510
+ end
511
+ end
@@ -0,0 +1,89 @@
1
+ #
2
+ #--
3
+ # Ronin Web - A Ruby library for Ronin that provides support for web
4
+ # scraping and spidering functionality.
5
+ #
6
+ # Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
7
+ #
8
+ # This program is free software; you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program; if not, write to the Free Software
20
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #++
22
+ #
23
+
24
+ require 'ronin/web/web'
25
+
26
+ require 'spidr/agent'
27
+
28
+ module Ronin
29
+ module Web
30
class Spider < Spidr::Agent
  class << self
    #
    # Builds a new Spider from the default options merged with the given
    # _options_ (given values win) and hands the _block_ on to the
    # constructor.
    #
    # _options_ may contain the following keys:
    # <tt>:proxy</tt>:: The proxy to use while spidering. Defaults to
    #                   Web.proxy.
    # <tt>:user_agent</tt>:: The User-Agent string to send. Defaults to
    #                        Web.user_agent.
    # <tt>:referer</tt>:: The referer URL to send.
    # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
    #                   link. Defaults to 0.
    # <tt>:host</tt>:: The host-name to visit.
    # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
    # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
    # <tt>:ports</tt>:: An +Array+ of port patterns to visit.
    # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
    # <tt>:links</tt>:: An +Array+ of link patterns to visit.
    # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
    # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
    # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
    #                         visit.
    #
    def agent(options={},&block)
      new(default_options.merge(options),&block)
    end

    #
    # Delegates to the inherited +host+ class method for the specified
    # host _name_, after merging the given _options_ over the default
    # options. The _block_ is passed along unchanged.
    #
    def host(name,options={},&block)
      super(name,default_options.merge(options),&block)
    end

    #
    # Delegates to the inherited +site+ class method for the specified
    # _url_, after merging the given _options_ over the default options.
    # The _block_ is passed along unchanged.
    #
    def site(url,options={},&block)
      super(url,default_options.merge(options),&block)
    end

    #
    # Returns the default options for Spider: the proxy and User-Agent
    # configured on Web.
    #
    # This method is publicly callable. Visibility keywords such as
    # +protected+ do not apply to methods defined on the class itself.
    #
    def default_options
      {:proxy => Web.proxy, :user_agent => Web.user_agent}
    end
  end
end
88
+ end
89
+ end
@@ -0,0 +1,29 @@
1
+ #
2
+ #--
3
+ # Ronin Web - A Ruby library for Ronin that provides support for web
4
+ # scraping and spidering functionality.
5
+ #
6
+ # Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
7
+ #
8
+ # This program is free software; you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program; if not, write to the Free Software
20
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #++
22
+ #
23
+
24
module Ronin
  module Web
    # Ronin Web Version. Frozen so the shared constant cannot be
    # modified in place.
    VERSION = '0.1.0'.freeze
  end
end