ronin-web 0.3.0.pre2 → 0.3.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
@@ -85,6 +85,9 @@ module Ronin
85
85
  # @option options [Array<String, Regexp, Proc>] :ignore_exts
86
86
  # The patterns which match the URI path extensions to not visit.
87
87
  #
88
+ # @option options [Boolean] :verbose (true)
89
+ # Specifies whether every URL will be printed.
90
+ #
88
91
  # @yield [spider]
89
92
  # If a block is given, it will be passed the newly created spider.
90
93
  #
@@ -103,8 +106,10 @@ module Ronin
103
106
 
104
107
  super(options)
105
108
 
106
- every_url do |url|
107
- print_info("Spidering #{url}")
109
+ if options.fetch(:verbose,true)
110
+ every_url do |url|
111
+ print_info("Spidering #{url}")
112
+ end
108
113
  end
109
114
 
110
115
  yield self if block_given?
@@ -0,0 +1,196 @@
1
+ #
2
+ # Ronin Web - A Ruby library for Ronin that provides support for web
3
+ # scraping and spidering functionality.
4
+ #
5
+ # Copyright (c) 2006-2011 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This file is part of Ronin Web.
8
+ #
9
+ # Ronin is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # Ronin is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with Ronin. If not, see <http://www.gnu.org/licenses/>.
21
+ #
22
+
23
+ require 'ronin/web/config'
24
+
25
+ require 'set'
26
+
27
+ module Ronin
28
+ module Web
29
+ #
30
+ # Represents the set of `User-Agent` strings loaded from all
31
+ # `data/ronin/web/user_agents.yml` files.
32
+ #
33
+ # ## ronin/web/user_agents.yml
34
+ #
35
+ # The `user_agents.yml` files are essentially YAML files listing
36
+ # `User-Agent` strings grouped by category:
37
+ #
38
+ # ---
39
+ # :googlebot:
40
+ # - "Googlebot/2.1 ( http://www.googlebot.com/bot.html)"
41
+ # - "Googlebot-Image/1.0 ( http://www.googlebot.com/bot.html)"
42
+ # - "Mediapartners-Google/2.1"
43
+ # - "Google-Sitemaps/1.0"
44
+ #
45
+ # These files can be added to Ronin Repositories or to Ronin libraries,
46
+ # and will be loaded by the {UserAgents} objects.
47
+ #
48
+ # @since 0.3.0
49
+ #
50
class UserAgents

  include Enumerable

  # Relative path to the User-Agents file.
  FILE = File.join('ronin','web','user_agents.yml')

  #
  # Creates a new, empty User-Agent set. Data files are loaded lazily,
  # the first time the set is queried or enumerated.
  #
  # @api semipublic
  #
  def initialize
    # paths of the data files which have already been loaded
    @files = Set[]

    # category name (Symbol) => Set of User-Agent strings
    @user_agents = Hash.new { |hash,key| hash[key] = Set[] }
  end

  #
  # The categories of `User-Agent` strings.
  #
  # @return [Array<Symbol>]
  #   The names of the categories.
  #
  # @api public
  #
  def categories
    reload!

    @user_agents.keys
  end

  #
  # Iterates over each User-Agent in the set.
  #
  # @yield [ua]
  #   The given block will be passed each User-Agent.
  #
  # @yieldparam [String] ua
  #   A User-Agent string within the set.
  #
  # @return [Enumerator]
  #   If no block is given, an Enumerator will be returned.
  #
  # @api public
  #
  def each(&block)
    return enum_for(:each) unless block_given?

    # load any pending data files, so that enumerating a fresh set
    # does not silently yield nothing
    reload!

    @user_agents.each_value do |strings|
      strings.each(&block)
    end
  end

  #
  # Selects a `User-Agent` string from the set.
  #
  # @param [Symbol, String, Regexp] key
  #   The User-Agents group name, sub-string or Regexp to search for.
  #   A Symbol selects a random string from the named group; a String
  #   or Regexp selects the first string that matches.
  #
  # @return [String, nil]
  #   The matching `User-Agent` string, or `nil` if nothing matched.
  #
  # @raise [TypeError]
  #   The key was not a Symbol, String or Regexp.
  #
  # @api public
  #
  def [](key)
    reload!

    case key
    when Symbol
      # check with has_key? first; a plain lookup would auto-create
      # an empty category via the Hash default block
      return nil unless @user_agents.has_key?(key)

      strings = @user_agents[key].to_a
      strings[rand(strings.length)] unless strings.empty?
    when String
      first_match { |string| string.include?(key) }
    when Regexp
      first_match { |string| string =~ key }
    else
      raise(TypeError,"key must be a Symbol, String or Regexp")
    end
  end

  #
  # Fetches a `User-Agent` string from the set.
  #
  # @param [Symbol, String, Regexp] key
  #   The User-Agents group name, sub-string or Regexp to search for.
  #
  # @param [String] default
  #   The `User-Agent` string to default to if no match is found.
  #
  # @return [String]
  #   The matching `User-Agent` string.
  #
  # @raise [ArgumentError]
  #   No matching `User-Agent` string was found, and no default value
  #   was given.
  #
  # @api public
  #
  def fetch(key,default=nil)
    string = (self[key] || default)

    unless string
      raise(ArgumentError,"no User-Agent strings match #{key.inspect}")
    end

    return string
  end

  protected

  #
  # Finds the first User-Agent string, across all categories, for which
  # the given block returns a true value.
  #
  # @yield [string]
  #   The block is passed each User-Agent string in turn.
  #
  # @return [String, nil]
  #   The first accepted string, or `nil` if none was accepted.
  #
  # @api private
  #
  def first_match
    @user_agents.each_value do |strings|
      strings.each do |string|
        return string if yield(string)
      end
    end

    return nil
  end

  #
  # Loads every User-Agents data file that has not been loaded yet.
  # Files are only parsed once; later calls skip them.
  #
  # @api private
  #
  def reload!
    Config.each_data_file(FILE) do |path|
      next if @files.include?(path)

      data = YAML.load_file(path)

      # remember the file, so it is neither re-parsed nor re-warned
      # about on every subsequent lookup
      @files << path

      unless data.kind_of?(Hash)
        warn "#{path.dump} did not contain a Hash"
        next
      end

      data.each do |name,strings|
        # Array() tolerates a category with no entries (nil) or with a
        # single scalar string instead of a list
        @user_agents[name.to_sym].merge(Array(strings))
      end
    end
  end

end
195
+ end
196
+ end
@@ -23,6 +23,6 @@
23
23
  module Ronin
24
24
  module Web
25
25
  # Ronin Web Version
26
- VERSION = '0.3.0.pre2'
26
+ VERSION = '0.3.0.rc1'
27
27
  end
28
28
  end
@@ -20,12 +20,13 @@
20
20
  # along with Ronin. If not, see <http://www.gnu.org/licenses/>.
21
21
  #
22
22
 
23
+ require 'ronin/web/user_agents'
24
+ require 'ronin/web/mechanize'
23
25
  require 'ronin/network/http/proxy'
24
26
  require 'ronin/network/http/http'
25
27
 
26
28
  require 'uri/http'
27
29
  require 'nokogiri'
28
- require 'mechanize'
29
30
  require 'open-uri'
30
31
 
31
32
  module Ronin
@@ -168,12 +169,29 @@ module Ronin
168
169
  @proxy = Network::HTTP::Proxy.create(new_proxy)
169
170
  end
170
171
 
172
+ #
173
+ # A set of common `User-Agent` strings.
174
+ #
175
+ # @return [UserAgents]
176
+ # The set of `User-Agent` strings.
177
+ #
178
+ # @since 0.3.0
179
+ #
180
+ # @api public
181
+ #
182
+ def Web.user_agents
183
+ @user_agents ||= UserAgents.new
184
+ end
185
+
171
186
  #
172
187
  # @return [Array]
173
188
  # The supported Web User-Agent Aliases.
174
189
  #
175
190
  # @see http://rubydoc.info/gems/mechanize/Mechanize#AGENT_ALIASES-constant
176
191
  #
192
+ # @deprecated
193
+ # Will be replaced by {user_agents} in 1.0.0.
194
+ #
177
195
  # @api public
178
196
  #
179
197
  def Web.user_agent_aliases
@@ -195,18 +213,30 @@ module Ronin
195
213
  end
196
214
 
197
215
  #
198
- # Sets the User-Agent string used by {Web}.
216
+ # Sets the `User-Agent` string used by {Web}.
199
217
  #
200
- # @param [String] new_agent
218
+ # @param [String, Symbol, Regexp, nil] value
201
219
  # The User-Agent string to use.
220
+ # Setting {user_agent} to `nil` will disable the `User-Agent` string.
202
221
  #
203
222
  # @return [String]
204
223
  # The new User-Agent string.
205
224
  #
225
+ # @raise [ArgumentError]
226
+ # Either no User-Agent group exists with the given `Symbol`,
227
+ # or no User-Agent string matched the given `Regexp`.
228
+ #
206
229
  # @api public
207
230
  #
208
- def Web.user_agent=(new_agent)
209
- @user_agent = new_agent
231
+ def Web.user_agent=(value)
232
+ @user_agent = case value
233
+ when String
234
+ user_agents.fetch(value,value)
235
+ when nil
236
+ nil
237
+ else
238
+ user_agents.fetch(value)
239
+ end
210
240
  end
211
241
 
212
242
  #
@@ -220,6 +250,10 @@ module Ronin
220
250
  #
221
251
  # @see user_agent_aliases
222
252
  #
253
+ # @deprecated
254
+ # Will be replaced by calling {user_agent=} with a `Symbol`
255
+ # and will be removed in 1.0.0.
256
+ #
223
257
  # @api public
224
258
  #
225
259
  def Web.user_agent_alias=(name)
@@ -232,12 +266,12 @@ module Ronin
232
266
  # @param [Hash] options
233
267
  # Additional options.
234
268
  #
235
- # @option options [String] :user_agent_alias
236
- # The User-Agent Alias to use.
237
- #
238
269
  # @option options [String] :user_agent
239
270
  # The User-Agent string to use.
240
271
  #
272
+ # @option options [String] :user_agent_alias
273
+ # The User-Agent Alias to use.
274
+ #
241
275
  # @option options [Network::HTTP::Proxy, Hash, String] :proxy
242
276
  # (Web.proxy)
243
277
  # Proxy information.
@@ -309,68 +343,21 @@ module Ronin
309
343
  end
310
344
 
311
345
  #
312
- # Creates a new Mechanize Agent.
313
- #
314
- # @param [Hash] options
315
- # Additional options.
316
- #
317
- # @option options [String] :user_agent_alias
318
- # The User-Agent Alias to use.
319
- #
320
- # @option options [String] :user_agent
321
- # The User-Agent string to use.
322
- #
323
- # @option options [Network::HTTP::Proxy, Hash, String] :proxy
324
- # (Web.proxy)
325
- # Proxy information.
326
- #
327
- # @yield [agent]
328
- # If a block is given, it will be passed the newly created Mechanize
329
- # agent.
330
- #
331
- # @yieldparam [Mechanize] agent
332
- # The new Mechanize agent.
346
+ # A persistent Mechanize Agent.
333
347
  #
334
348
  # @return [Mechanize]
335
- # The new Mechanize agent.
349
+ # The persistent Mechanize Agent.
336
350
  #
337
- # @example Create a new agent.
338
- # Web.agent
339
- #
340
- # @example Create a new agent, with a custom User-Agent alias.
341
- # Web.agent(:user_agent_alias => 'Linux Mozilla')
342
- #
343
- # @example Create a new agent, with a custom User-Agent string.
344
- # Web.agent(:user_agent => 'wooden pants')
345
- #
346
- # @see http://rubydoc.info/gems/mechanize/Mechanize
351
+ # @see Mechanize
347
352
  #
348
353
  # @api public
349
354
  #
350
355
  def Web.agent(options={})
351
- agent = Mechanize.new
352
-
353
- if options[:user_agent_alias]
354
- agent.user_agent_alias = options[:user_agent_alias]
355
- elsif options[:user_agent]
356
- agent.user_agent = options[:user_agent]
357
- elsif Web.user_agent
358
- agent.user_agent = Web.user_agent
359
- end
360
-
361
- proxy = Network::HTTP::Proxy.new(options[:proxy] || Web.proxy)
362
-
363
- if proxy[:host]
364
- agent.set_proxy(
365
- proxy[:host],
366
- proxy[:port],
367
- proxy[:user],
368
- proxy[:password]
369
- )
356
+ if options.empty?
357
+ @agent ||= Mechanize.new(options)
358
+ else
359
+ @agent = Mechanize.new(options)
370
360
  end
371
-
372
- yield agent if block_given?
373
- return agent
374
361
  end
375
362
 
376
363
  #
@@ -382,12 +369,12 @@ module Ronin
382
369
  # @param [Hash] options
383
370
  # Additional options.
384
371
  #
385
- # @option options [String] :user_agent_alias
386
- # The User-Agent Alias to use.
387
- #
388
372
  # @option options [String] :user_agent
389
373
  # The User-Agent string to use.
390
374
  #
375
+ # @option options [String] :user_agent_alias
376
+ # The User-Agent Alias to use.
377
+ #
391
378
  # @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
392
379
  # Proxy information.
393
380
  #
@@ -433,12 +420,12 @@ module Ronin
433
420
  # @param [Hash] options
434
421
  # Additional options.
435
422
  #
436
- # @option options [String] :user_agent_alias
437
- # The User-Agent Alias to use.
438
- #
439
423
  # @option options [String] :user_agent
440
424
  # The User-Agent string to use.
441
425
  #
426
+ # @option options [String] :user_agent_alias
427
+ # The User-Agent Alias to use.
428
+ #
442
429
  # @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
443
430
  # Proxy information.
444
431
  #
@@ -482,12 +469,12 @@ module Ronin
482
469
  # @option options [Hash] :query
483
470
  # Additional query parameters to post with.
484
471
  #
485
- # @option options [String] :user_agent_alias
486
- # The User-Agent Alia to use.
487
- #
488
472
  # @option options [String] :user_agent
489
473
  # The User-Agent string to use.
490
474
  #
475
+ # @option options [String] :user_agent_alias
476
+ # The User-Agent Alias to use.
477
+ #
491
478
  # @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
492
479
  # Proxy information.
493
480
  #
@@ -532,12 +519,12 @@ module Ronin
532
519
  # @option options [Hash] :query
533
520
  # Additional query parameters to post with.
534
521
  #
535
- # @option options [String] :user_agent_alias
536
- # The User-Agent Alias to use.
537
- #
538
522
  # @option options [String] :user_agent
539
523
  # The User-Agent string to use.
540
524
  #
525
+ # @option options [String] :user_agent_alias
526
+ # The User-Agent Alias to use.
527
+ #
541
528
  # @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
542
529
  # Proxy information.
543
530
  #