ronin-web 0.3.0.pre2 → 0.3.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -85,6 +85,9 @@ module Ronin
85
85
  # @option options [Array<String, Regexp, Proc>] :ignore_exts
86
86
  # The patterns which match the URI path extensions to not visit.
87
87
  #
88
+ # @option options [Boolean] :verbose (true)
89
+ # Specifies whether every URL will be printed.
90
+ #
88
91
  # @yield [spider]
89
92
  # If a block is given, it will be passed the newly created spider.
90
93
  #
@@ -103,8 +106,10 @@ module Ronin
103
106
 
104
107
  super(options)
105
108
 
106
- every_url do |url|
107
- print_info("Spidering #{url}")
109
+ if options.fetch(:verbose,true)
110
+ every_url do |url|
111
+ print_info("Spidering #{url}")
112
+ end
108
113
  end
109
114
 
110
115
  yield self if block_given?
@@ -0,0 +1,196 @@
1
+ #
2
+ # Ronin Web - A Ruby library for Ronin that provides support for web
3
+ # scraping and spidering functionality.
4
+ #
5
+ # Copyright (c) 2006-2011 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This file is part of Ronin Web.
8
+ #
9
+ # Ronin is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # Ronin is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with Ronin. If not, see <http://www.gnu.org/licenses/>.
21
+ #
22
+
23
+ require 'ronin/web/config'
24
+
25
+ require 'set'
26
+
27
+ module Ronin
28
+ module Web
29
+ #
30
+ # Represents the set of `User-Agent` strings loaded from all
31
+ # `data/ronin/web/user_agents.yml` files.
32
+ #
33
+ # ## ronin/web/user_agents.yml
34
+ #
35
+ # The `user_agents.yml` files are essentially YAML files listing
36
+ # `User-Agent` strings grouped by category:
37
+ #
38
+ # ---
39
+ # :googlebot:
40
+ # - "Googlebot/2.1 ( http://www.googlebot.com/bot.html)"
41
+ # - "Googlebot-Image/1.0 ( http://www.googlebot.com/bot.html)"
42
+ # - "Mediapartners-Google/2.1"
43
+ # - "Google-Sitemaps/1.0"
44
+ #
45
+ # These files can be added to Ronin Repositories or to Ronin libraries,
46
+ # and will be loaded by the {UserAgents} objects.
47
+ #
48
+ # @since 0.3.0
49
+ #
50
+ class UserAgents
51
+
52
+ include Enumerable
53
+
54
+ # Relative path to the User-Agents file.
55
+ FILE = File.join('ronin','web','user_agents.yml')
56
+
57
+ #
58
+ # Creates a new User-Agent set.
59
+ #
60
+ # @api semipublic
61
+ #
62
+ def initialize
63
+ @files = Set[]
64
+ @user_agents = Hash.new { |hash,key| hash[key] = Set[] }
65
+ end
66
+
67
+ #
68
+ # The categories of `User-Agent` strings.
69
+ #
70
+ # @return [Array<Symbol>]
71
+ # The names of the categories.
72
+ #
73
+ # @api public
74
+ #
75
+ def categories
76
+ reload!
77
+
78
+ @user_agents.keys
79
+ end
80
+
81
+ #
82
+ # Iterates over each User-Agent in the set.
83
+ #
84
+ # @yield [ua]
85
+ # The given block will be passed each User-Agent.
86
+ #
87
+ # @yieldparam [String] ua
88
+ # A User-Agent string within the set.
89
+ #
90
+ # @return [Enumerator]
91
+ # If no block is given, an Enumerator will be returned.
92
+ #
93
+ # @api public
94
+ #
95
+ def each(&block)
96
+ return enum_for(:each) unless block_given?
97
+
98
+ @user_agents.each do |name,strings|
99
+ strings.each(&block)
100
+ end
101
+ end
102
+
103
+ #
104
+ # Selects a `User-Agent` string from the set.
105
+ #
106
+ # @param [Symbol, String, Regexp] key
107
+ # The User-Agents group name, sub-string or Regexp to search for.
108
+ #
109
+ # @return [String, nil]
110
+ # The matching `User-Agent` string.
111
+ #
112
+ # @api public
113
+ #
114
+ def [](key)
115
+ reload!
116
+
117
+ case key
118
+ when Symbol
119
+ if @user_agents.has_key?(key)
120
+ strings = @user_agents[key]
121
+ return strings.entries[rand(strings.length)]
122
+ end
123
+ when String
124
+ @user_agents.each do |name,strings|
125
+ strings.each do |string|
126
+ return string if string.include?(key)
127
+ end
128
+ end
129
+
130
+ return nil
131
+ when Regexp
132
+ @user_agents.each do |name,strings|
133
+ strings.each do |string|
134
+ return string if string =~ key
135
+ end
136
+ end
137
+
138
+ return nil
139
+ else
140
+ raise(TypeError,"key must be a Symbol, String or Regexp")
141
+ end
142
+ end
143
+
144
+ #
145
+ # Fetches a `User-Agent` string from the set.
146
+ #
147
+ # @param [Symbol, String, Regexp] key
148
+ # The User-Agents group name, sub-string or Regexp to search for.
149
+ #
150
+ # @param [String] default
151
+ # The `User-Agent` string to default to if no match is found.
152
+ #
153
+ # @return [String]
154
+ # The matching `User-Agent` string.
155
+ #
156
+ # @raise [ArgumentError]
157
+ # No matching `User-Agent` string was found, and no default value
158
+ # was given.
159
+ #
160
+ # @api public
161
+ #
162
+ def fetch(key,default=nil)
163
+ unless (string = (self[key] || default))
164
+ raise(ArgumentError,"no User-Agent strings match #{key.inspect}")
165
+ end
166
+
167
+ return string
168
+ end
169
+
170
+ protected
171
+
172
+ #
173
+ # Reloads the set of User-Agents.
174
+ #
175
+ # @api private
176
+ #
177
+ def reload!
178
+ Config.each_data_file(FILE) do |path|
179
+ next if @files.include?(path)
180
+
181
+ data = YAML.load_file(path)
182
+
183
+ unless data.kind_of?(Hash)
184
+ warn "#{path.dump} did not contain a Hash"
185
+ next
186
+ end
187
+
188
+ data.each do |name,strings|
189
+ @user_agents[name.to_sym].merge(strings)
190
+ end
191
+ end
192
+ end
193
+
194
+ end
195
+ end
196
+ end
@@ -23,6 +23,6 @@
23
23
  module Ronin
24
24
  module Web
25
25
  # Ronin Web Version
26
- VERSION = '0.3.0.pre2'
26
+ VERSION = '0.3.0.rc1'
27
27
  end
28
28
  end
@@ -20,12 +20,13 @@
20
20
  # along with Ronin. If not, see <http://www.gnu.org/licenses/>.
21
21
  #
22
22
 
23
+ require 'ronin/web/user_agents'
24
+ require 'ronin/web/mechanize'
23
25
  require 'ronin/network/http/proxy'
24
26
  require 'ronin/network/http/http'
25
27
 
26
28
  require 'uri/http'
27
29
  require 'nokogiri'
28
- require 'mechanize'
29
30
  require 'open-uri'
30
31
 
31
32
  module Ronin
@@ -168,12 +169,29 @@ module Ronin
168
169
  @proxy = Network::HTTP::Proxy.create(new_proxy)
169
170
  end
170
171
 
172
+ #
173
+ # A set of common `User-Agent` strings.
174
+ #
175
+ # @return [UserAgents]
176
+ # The set of `User-Agent` strings.
177
+ #
178
+ # @since 0.3.0
179
+ #
180
+ # @api public
181
+ #
182
+ def Web.user_agents
183
+ @user_agents ||= UserAgents.new
184
+ end
185
+
171
186
  #
172
187
  # @return [Array]
173
188
  # The supported Web User-Agent Aliases.
174
189
  #
175
190
  # @see http://rubydoc.info/gems/mechanize/Mechanize#AGENT_ALIASES-constant
176
191
  #
192
+ # @deprecated
193
+ # Will be replaced by {user_agents} in 1.0.0.
194
+ #
177
195
  # @api public
178
196
  #
179
197
  def Web.user_agent_aliases
@@ -195,18 +213,30 @@ module Ronin
195
213
  end
196
214
 
197
215
  #
198
- # Sets the User-Agent string used by {Web}.
216
+ # Sets the `User-Agent` string used by {Web}.
199
217
  #
200
- # @param [String] new_agent
218
+ # @param [String, Symbol, Regexp, nil] value
201
219
  # The User-Agent string to use.
220
+ # Setting {user_agent} to `nil` will disable the `User-Agent` string.
202
221
  #
203
222
  # @return [String]
204
223
  # The new User-Agent string.
205
224
  #
225
+ # @raise [ArgumentError]
226
+ # Either no User-Agent group exists with the given `Symbol`,
227
+ # or no User-Agent string matched the given `Regexp`.
228
+ #
206
229
  # @api public
207
230
  #
208
- def Web.user_agent=(new_agent)
209
- @user_agent = new_agent
231
+ def Web.user_agent=(value)
232
+ @user_agent = case value
233
+ when String
234
+ user_agents.fetch(value,value)
235
+ when nil
236
+ nil
237
+ else
238
+ user_agents.fetch(value)
239
+ end
210
240
  end
211
241
 
212
242
  #
@@ -220,6 +250,10 @@ module Ronin
220
250
  #
221
251
  # @see user_agent_aliases
222
252
  #
253
+ # @deprecated
254
+ # Will be replaced by calling {user_agent=} with a `Symbol`
255
+ # and will be removed in 1.0.0.
256
+ #
223
257
  # @api public
224
258
  #
225
259
  def Web.user_agent_alias=(name)
@@ -232,12 +266,12 @@ module Ronin
232
266
  # @param [Hash] options
233
267
  # Additional options.
234
268
  #
235
- # @option options [String] :user_agent_alias
236
- # The User-Agent Alias to use.
237
- #
238
269
  # @option options [String] :user_agent
239
270
  # The User-Agent string to use.
240
271
  #
272
+ # @option options [String] :user_agent_alias
273
+ # The User-Agent Alias to use.
274
+ #
241
275
  # @option options [Network::HTTP::Proxy, Hash, String] :proxy
242
276
  # (Web.proxy)
243
277
  # Proxy information.
@@ -309,68 +343,21 @@ module Ronin
309
343
  end
310
344
 
311
345
  #
312
- # Creates a new Mechanize Agent.
313
- #
314
- # @param [Hash] options
315
- # Additional options.
316
- #
317
- # @option options [String] :user_agent_alias
318
- # The User-Agent Alias to use.
319
- #
320
- # @option options [String] :user_agent
321
- # The User-Agent string to use.
322
- #
323
- # @option options [Network::HTTP::Proxy, Hash, String] :proxy
324
- # (Web.proxy)
325
- # Proxy information.
326
- #
327
- # @yield [agent]
328
- # If a block is given, it will be passed the newly created Mechanize
329
- # agent.
330
- #
331
- # @yieldparam [Mechanize] agent
332
- # The new Mechanize agent.
346
+ # A persistent Mechanize Agent.
333
347
  #
334
348
  # @return [Mechanize]
335
- # The new Mechanize agent.
349
+ # The persistent Mechanize Agent.
336
350
  #
337
- # @example Create a new agent.
338
- # Web.agent
339
- #
340
- # @example Create a new agent, with a custom User-Agent alias.
341
- # Web.agent(:user_agent_alias => 'Linux Mozilla')
342
- #
343
- # @example Create a new agent, with a custom User-Agent string.
344
- # Web.agent(:user_agent => 'wooden pants')
345
- #
346
- # @see http://rubydoc.info/gems/mechanize/Mechanize
351
+ # @see Mechanize
347
352
  #
348
353
  # @api public
349
354
  #
350
355
  def Web.agent(options={})
351
- agent = Mechanize.new
352
-
353
- if options[:user_agent_alias]
354
- agent.user_agent_alias = options[:user_agent_alias]
355
- elsif options[:user_agent]
356
- agent.user_agent = options[:user_agent]
357
- elsif Web.user_agent
358
- agent.user_agent = Web.user_agent
359
- end
360
-
361
- proxy = Network::HTTP::Proxy.new(options[:proxy] || Web.proxy)
362
-
363
- if proxy[:host]
364
- agent.set_proxy(
365
- proxy[:host],
366
- proxy[:port],
367
- proxy[:user],
368
- proxy[:password]
369
- )
356
+ if options.empty?
357
+ @agent ||= Mechanize.new(options)
358
+ else
359
+ @agent = Mechanize.new(options)
370
360
  end
371
-
372
- yield agent if block_given?
373
- return agent
374
361
  end
375
362
 
376
363
  #
@@ -382,12 +369,12 @@ module Ronin
382
369
  # @param [Hash] options
383
370
  # Additional options.
384
371
  #
385
- # @option options [String] :user_agent_alias
386
- # The User-Agent Alias to use.
387
- #
388
372
  # @option options [String] :user_agent
389
373
  # The User-Agent string to use.
390
374
  #
375
+ # @option options [String] :user_agent_alias
376
+ # The User-Agent Alias to use.
377
+ #
391
378
  # @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
392
379
  # Proxy information.
393
380
  #
@@ -433,12 +420,12 @@ module Ronin
433
420
  # @param [Hash] options
434
421
  # Additional options.
435
422
  #
436
- # @option options [String] :user_agent_alias
437
- # The User-Agent Alias to use.
438
- #
439
423
  # @option options [String] :user_agent
440
424
  # The User-Agent string to use.
441
425
  #
426
+ # @option options [String] :user_agent_alias
427
+ # The User-Agent Alias to use.
428
+ #
442
429
  # @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
443
430
  # Proxy information.
444
431
  #
@@ -482,12 +469,12 @@ module Ronin
482
469
  # @option options [Hash] :query
483
470
  # Additional query parameters to post with.
484
471
  #
485
- # @option options [String] :user_agent_alias
486
- # The User-Agent Alia to use.
487
- #
488
472
  # @option options [String] :user_agent
489
473
  # The User-Agent string to use.
490
474
  #
475
+ # @option options [String] :user_agent_alias
476
+ # The User-Agent Alias to use.
477
+ #
491
478
  # @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
492
479
  # Proxy information.
493
480
  #
@@ -532,12 +519,12 @@ module Ronin
532
519
  # @option options [Hash] :query
533
520
  # Additional query parameters to post with.
534
521
  #
535
- # @option options [String] :user_agent_alias
536
- # The User-Agent Alias to use.
537
- #
538
522
  # @option options [String] :user_agent
539
523
  # The User-Agent string to use.
540
524
  #
525
+ # @option options [String] :user_agent_alias
526
+ # The User-Agent Alias to use.
527
+ #
541
528
  # @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
542
529
  # Proxy information.
543
530
  #