ronin-web 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,511 @@
1
+ #
2
+ #--
3
+ # Ronin Web - A Ruby library for Ronin that provides support for web
4
+ # scraping and spidering functionality.
5
+ #
6
+ # Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
7
+ #
8
+ # This program is free software; you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program; if not, write to the Free Software
20
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #++
22
+ #
23
+
24
+ require 'uri'
25
+ require 'cgi'
26
+
27
# Prefer Mongrel when it is installed, otherwise fall back to WEBrick.
# A missing library raises LoadError, which is a ScriptError and therefore
# NOT caught by a bare `rescue` (that only handles StandardError), so the
# exception class has to be named explicitly for the fallback to run.
begin
  require 'mongrel'
rescue LoadError
  require 'webrick'
end
32
+
33
+ require 'rack'
34
+
35
module Ronin
  module Web
    #
    # A small Rack application that routes requests to blocks registered
    # for virtual hosts, exact paths, path patterns and directory prefixes,
    # falling back to a default handler (a "404 Not Found" response unless
    # overridden with #default).
    #
    class Server

      # Default interface to run the Web Server on
      HOST = '0.0.0.0'

      # Default port to run the Web Server on
      PORT = 8080

      # Directory index files
      INDICES = ['index.htm', 'index.html']

      # The host to bind to
      attr_accessor :host

      # The port to listen on
      attr_accessor :port

      # The Hash of configurable variables for the server
      attr_reader :config

      #
      # Creates a new Web Server using the given configuration _block_.
      # The _block_, if given, is evaluated in the context of the new
      # server, so routing methods such as +bind+ or +mount+ can be called
      # directly inside of it.
      #
      # _options_ may contain the following keys:
      # <tt>:host</tt>:: The host to bind to.
      # <tt>:port</tt>:: The port to listen on.
      # <tt>:config</tt>:: A +Hash+ of configurable variables to be used
      #                    in responses.
      #
      def initialize(options={},&block)
        @host = options[:host]
        @port = options[:port]
        @config = {}

        @config.merge!(options[:config]) if options.has_key?(:config)

        # unless overridden via #default, unmatched requests get a 404
        @default = method(:not_found)

        @virtual_host_patterns = {}
        @virtual_hosts = {}

        @path_patterns = {}
        @paths = {}
        @directories = {}

        instance_eval(&block) if block
      end

      #
      # Returns the default host that the Web Server will be run on.
      #
      def self.default_host
        @@default_host ||= HOST
      end

      #
      # Sets the default host that the Web Server will run on to the
      # specified _host_.
      #
      def self.default_host=(host)
        @@default_host = host
      end

      #
      # Returns the default port that the Web Server will run on.
      #
      def self.default_port
        @@default_port ||= PORT
      end

      #
      # Sets the default port the Web Server will run on to the specified
      # _port_.
      #
      def self.default_port=(port)
        @@default_port = port
      end

      #
      # The Hash of the servers supported file extensions and their HTTP
      # Content-Types.
      #
      def self.content_types
        @@content_types ||= {}
      end

      #
      # Registers a new content _type_ for the specified file _extensions_.
      # Returns the class, so that calls may be chained.
      #
      #   Server.content_type 'text/xml', ['xml', 'xsl']
      #
      def self.content_type(type,extensions)
        extensions.each { |ext| Server.content_types[ext] = type }

        return self
      end

      #
      # Runs the specified _server_ with the given _options_. Server.run
      # will use Mongrel to run the _server_, if the +Mongrel+ constant is
      # defined. Otherwise WEBrick will be used to run the _server_.
      #
      # _options_ can contain the following keys:
      # <tt>:host</tt>:: The host the server will bind to, defaults to
      #                  Server.default_host.
      # <tt>:port</tt>:: The port the server will listen on, defaults to
      #                  Server.default_port.
      #
      def self.run(server,options={})
        rack_options = {
          :Host => (options[:host] || Server.default_host),
          :Port => (options[:port] || Server.default_port)
        }

        if Object.const_defined?('Mongrel')
          Rack::Handler::Mongrel.run(server,rack_options)
        else
          Rack::Handler::WEBrick.run(server,rack_options)
        end
      end

      #
      # Creates a new Web Server object with the given _block_ and starts
      # it using the given _options_.
      #
      def self.start(options={},&block)
        self.new(options,&block).start
      end

      #
      # Returns the HTTP Content-Type for the specified file _extension_.
      # Unregistered extensions yield
      # <tt>application/x-unknown-content-type</tt>.
      #
      #   content_type('html')
      #   # => "text/html"
      #
      def content_type(extension)
        Server.content_types[extension] || 'application/x-unknown-content-type'
      end

      #
      # Returns the HTTP Content-Type for the specified _file_, based on
      # its lower-cased file extension.
      #
      #   srv.content_type_for('file.html')
      #   # => "text/html"
      #
      def content_type_for(file)
        ext = File.extname(file).downcase

        # drop the leading "."; yields nil when there is no extension
        return content_type(ext[1..-1])
      end

      #
      # Returns the index file contained within the _path_ of the specified
      # directory. If no index file can be found, +nil+ will be returned.
      #
      def index_of(path)
        path = File.expand_path(path)

        INDICES.each do |name|
          index = File.join(path,name)

          return index if File.file?(index)
        end

        return nil
      end

      #
      # Returns the HTTP 404 Not Found response for the path requested in
      # the specified _env_. The path is HTML-escaped before being embedded
      # into the page.
      #
      def not_found(env)
        # PATH_INFO may be absent in hand-built envs; CGI.escapeHTML
        # requires a String
        path = env['PATH_INFO'].to_s
        body = %{<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html>
<head>
<title>404 Not Found</title>
</head>
<body>
<h1>Not Found</h1>
<p>The requested URL #{CGI.escapeHTML(path)} was not found on this server.</p>
<hr>
</body>
</html>}

        return response(body, :status => 404, :content_type => 'text/html')
      end

      #
      # Returns the contents of the file at the specified _path_. If the
      # _path_ points to a directory, the directory will be searched for
      # an index file. If no index file can be found or _path_ points to a
      # non-existent file, a "404 Not Found" response will be returned.
      #
      # If _content_type_ is given it is used as the Content-Type of the
      # response, otherwise the type is derived from the file extension.
      #
      def return_file(path,env,content_type=nil)
        return not_found(env) unless File.exist?(path)

        if File.directory?(path)
          path = index_of(path)

          return not_found(env) unless path
        end

        # read in binary mode and close the handle right away, instead of
        # handing an open File to the response and leaking the descriptor
        body = File.open(path,'rb') { |io| io.read }

        return response(
          body,
          :content_type => (content_type || content_type_for(path))
        )
      end

      #
      # Returns a Rack Response object with the specified _body_, the given
      # _options_ and the given _block_. The _options_ Hash passed in by
      # the caller is not modified.
      #
      # _options_ may include the following keys:
      # <tt>:status</tt>:: The HTTP Response status code, defaults to 200.
      #
      # Every other key is turned into a response header, by capitalizing
      # the words of the key and joining them with dashes
      # (<tt>:content_type</tt> becomes <tt>Content-Type</tt>).
      #
      #   response("<data>lol</data>", :content_type => 'text/xml')
      #
      def response(body=[],options={},&block)
        # work on a copy, so the caller's Hash keeps its :status key
        options = options.dup
        status = (options.delete(:status) || 200)
        headers = {}

        options.each do |name,value|
          header_name = name.to_s.split('_').map { |word|
            word.capitalize
          }.join('-')

          headers[header_name] = value.to_s
        end

        return Rack::Response.new(body,status,headers,&block)
      end

      #
      # Use the specified _block_ as the default route for all other
      # requests.
      #
      #   default do |env|
      #     [200, {'Content-Type' => 'text/html'}, 'lol train']
      #   end
      #
      def default(&block)
        @default = block
        return self
      end

      #
      # Connects the specified _server_ as a virtual host representing the
      # specified host _name_.
      #
      def connect(name,server)
        @virtual_hosts[name.to_s] = server
      end

      #
      # Returns the server that handles requests for the specified host
      # _name_, or +nil+ if there is none.
      #
      # The _name_ is first looked up exactly as given (connected hosts,
      # then host patterns). If nothing matches and the _name_ ends with a
      # <tt>:port</tt> suffix, as the HTTP Host header does for
      # non-standard ports, the lookup is repeated without the port.
      #
      def virtual_host(name)
        name = name.to_s
        candidates = [name, name.sub(/:\d+\z/,'')].uniq

        candidates.each do |candidate|
          if @virtual_hosts.has_key?(candidate)
            return @virtual_hosts[candidate]
          end

          @virtual_host_patterns.each do |pattern,server|
            return server if candidate.match(pattern)
          end
        end

        return nil
      end

      #
      # Creates a new Server object using the specified _block_ and
      # registers it to receive requests to host names which match the
      # specified _pattern_.
      #
      #   hosts_like(/^a[0-9]\./) do
      #     map('/download/') do |env|
      #       ...
      #     end
      #   end
      #
      def hosts_like(pattern,&block)
        @virtual_host_patterns[pattern] = self.class.new(&block)
      end

      #
      # Registers the specified _block_ to be called when receiving
      # requests for paths which match the specified _pattern_.
      #
      #   paths_like(/\.xml$/) do |env|
      #     ...
      #   end
      #
      def paths_like(pattern,&block)
        @path_patterns[pattern] = block
        return self
      end

      #
      # Creates a new Server object using the specified _block_ and
      # connects it as a virtual host representing the specified host
      # _name_.
      #
      #   host('cdn.evil.com') do
      #     ...
      #   end
      #
      def host(name,&block)
        connect(name,self.class.new(&block))
      end

      #
      # Binds the specified URL _path_ to the given _block_.
      #
      #   bind '/secrets.xml' do |env|
      #     [200, {'Content-Type' => 'text/xml'}, "Made you look."]
      #   end
      #
      def bind(path,&block)
        @paths[path] = block
        return self
      end

      #
      # Binds the specified URL directory _path_ to the given _block_.
      #
      #   map '/downloads' do |env|
      #     response(
      #       "Your somewhere inside the downloads directory",
      #       :content_type => 'text/xml'
      #     )
      #   end
      #
      def map(path,&block)
        @directories[path] = block
        return self
      end

      #
      # Binds the contents of the specified _file_ to the specified URL
      # _path_, using the given _options_. A "404 Not Found" response is
      # returned if the _file_ does not exist at request time.
      #
      # _options_ may contain the following keys:
      # <tt>:content_type</tt>:: The Content-Type to serve the file with,
      #                          defaults to the type registered for the
      #                          extension of the _file_.
      #
      #   file '/robots.txt', '/path/to/my_robots.txt'
      #
      def file(path,file,options={})
        file = File.expand_path(file)
        type = (options[:content_type] || content_type_for(file))

        bind(path) do |env|
          if File.file?(file)
            return_file(file,env,type)
          else
            not_found(env)
          end
        end
      end

      #
      # Mounts the contents of the specified _directory_ to the given
      # prefix _path_.
      #
      #   mount '/download/', '/tmp/files/'
      #
      def mount(path,directory)
        sub_dirs = path.split('/')
        directory = File.expand_path(directory)

        map(path) do |env|
          http_dirs = normalized_path(env).split('/')

          # everything below the mount point; the slice is nil when the
          # request path is shorter than the mount prefix
          sub_path = (http_dirs[sub_dirs.length..-1] || []).join('/')
          absolute_path = File.join(directory,sub_path)

          return_file(absolute_path,env)
        end
      end

      #
      # Starts the server.
      #
      def start
        Server.run(self, :host => @host, :port => @port)
        return self
      end

      #
      # The method which receives all requests. Requests are dispatched,
      # in order, to: a matching virtual host, a block bound to the exact
      # path, the first matching path pattern, the mapped directory that
      # sorts last among those prefixing the path, and finally the default
      # handler.
      #
      def call(env)
        http_host = env['HTTP_HOST']
        http_path = normalized_path(env)

        if http_host && (server = virtual_host(http_host))
          return server.call(env)
        end

        if (handler = @paths[http_path])
          return handler.call(env)
        end

        @path_patterns.each do |pattern,pattern_handler|
          return pattern_handler.call(env) if http_path.match(pattern)
        end

        http_dirs = http_path.split('/')

        sub_dir = @directories.keys.select { |path|
          dirs = path.split('/')

          http_dirs[0...dirs.length] == dirs
        }.sort.last

        if (sub_dir && (handler = @directories[sub_dir]))
          return handler.call(env)
        end

        return @default.call(env)
      end

      #
      # Routes the specified _url_ to the call method.
      #
      def route(url)
        url = URI(url.to_s)

        return call(
          'HTTP_HOST' => url.host,
          'HTTP_PORT' => url.port,
          'SERVER_PORT' => url.port,
          'PATH_INFO' => url.path,
          'QUERY_STRING' => url.query
        )
      end

      #
      # Routes the specified _path_ to the call method, using the host and
      # port of the server.
      #
      def route_path(path)
        # URI.decode was removed in Ruby 3.0; the default parser still
        # provides the same percent-decoding
        decoded = URI::DEFAULT_PARSER.unescape(path.to_s)
        path, query = decoded.split('?',2)

        return route(URI::HTTP.build(
          :host => @host,
          :port => @port,
          :path => path,
          :query => query
        ))
      end

      protected

      #
      # Returns the requested path of the specified _env_ as an absolute,
      # normalized path ("." and ".." segments resolved). The path is
      # always anchored at "/", so a missing, empty or relative PATH_INFO
      # is never resolved against the working directory of the process.
      #
      def normalized_path(env)
        File.expand_path(File.join('/',env['PATH_INFO'].to_s))
      end

      content_type 'text/html', ['html', 'htm', 'xhtml']
      content_type 'text/css', ['css']
      content_type 'image/gif', ['gif']
      content_type 'image/jpeg', ['jpeg', 'jpg']
      content_type 'image/png', ['png']
      content_type 'image/x-icon', ['ico']
      content_type 'text/javascript', ['js']
      content_type 'text/xml', ['xml', 'xsl']
      content_type 'application/rss+xml', ['rss']
      content_type 'application/rdf+xml', ['rdf']
      content_type 'application/pdf', ['pdf']
      content_type 'application/msword', ['doc']
      content_type 'application/zip', ['zip']
      content_type 'text/plain', ['txt', 'conf', 'rb', 'py', 'h', 'c', 'hh', 'cc', 'hpp', 'cpp']

    end
  end
end
@@ -0,0 +1,89 @@
1
+ #
2
+ #--
3
+ # Ronin Web - A Ruby library for Ronin that provides support for web
4
+ # scraping and spidering functionality.
5
+ #
6
+ # Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
7
+ #
8
+ # This program is free software; you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program; if not, write to the Free Software
20
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #++
22
+ #
23
+
24
+ require 'ronin/web/web'
25
+
26
+ require 'spidr/agent'
27
+
28
module Ronin
  module Web
    #
    # A Spidr::Agent whose factory methods merge the caller's options on
    # top of the proxy and User-Agent settings taken from Ronin::Web.
    #
    class Spider < Spidr::Agent

      class << self

        #
        # Creates a new Spider object from the default options merged with
        # the given _options_, passing along the given _block_.
        #
        # _options_ may contain the following keys:
        # <tt>:proxy</tt>:: The proxy to use while spidering. Defaults to
        #                   Web.proxy.
        # <tt>:user_agent</tt>:: The User-Agent string to send. Defaults to
        #                        Web.user_agent.
        #
        # All other keys are handed to Spidr::Agent unchanged (presumably
        # its usual filtering options such as <tt>:hosts</tt> or
        # <tt>:ignore_links</tt>; see the Spidr documentation).
        #
        def agent(options={},&block)
          merged_options = default_options.merge(options)

          new(merged_options,&block)
        end

        #
        # Delegates to the inherited +host+ method with the host _name_,
        # the default options merged with the given _options_, and the
        # given _block_.
        #
        def host(name,options={},&block)
          merged_options = default_options.merge(options)

          super(name,merged_options,&block)
        end

        #
        # Delegates to the inherited +site+ method with the _url_, the
        # default options merged with the given _options_, and the given
        # _block_.
        #
        def site(url,options={},&block)
          merged_options = default_options.merge(options)

          super(url,merged_options,&block)
        end

        #
        # Returns the default options for Spider. This is a public class
        # method: a +protected+ keyword placed in the class body does not
        # apply to methods defined on the class itself.
        #
        def default_options
          {:proxy => Web.proxy, :user_agent => Web.user_agent}
        end

      end

    end
  end
end
@@ -0,0 +1,29 @@
1
+ #
2
+ #--
3
+ # Ronin Web - A Ruby library for Ronin that provides support for web
4
+ # scraping and spidering functionality.
5
+ #
6
+ # Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
7
+ #
8
+ # This program is free software; you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation; either version 2 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # This program is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with this program; if not, write to the Free Software
20
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21
+ #++
22
+ #
23
+
24
module Ronin
  module Web
    # Ronin Web Version (frozen, so the shared constant cannot be mutated
    # in place by accident)
    VERSION = '0.1.0'.freeze
  end
end