ronin-web 0.3.0.pre2 → 0.3.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog.md +3 -0
- data/Gemfile +1 -1
- data/README.md +2 -1
- data/Rakefile +4 -4
- data/bin/ronin-web +2 -2
- data/data/ronin/web/user_agents.yml +247 -0
- data/gemspec.yml +3 -6
- data/lib/ronin/network/mixins/web.rb +3 -1
- data/lib/ronin/web/config.rb +34 -0
- data/lib/ronin/web/mechanize.rb +81 -0
- data/lib/ronin/web/spider.rb +7 -2
- data/lib/ronin/web/user_agents.rb +196 -0
- data/lib/ronin/web/version.rb +1 -1
- data/lib/ronin/web/web.rb +61 -74
- data/ronin-web.gemspec +129 -13
- data/spec/web/helpers/rack_app.rb +1 -8
- data/spec/web/mechanize_spec.rb +62 -0
- data/spec/web/user_agents_spec.rb +56 -0
- data/spec/web/web_spec.rb +2 -58
- metadata +14 -6
data/lib/ronin/web/spider.rb
CHANGED
@@ -85,6 +85,9 @@ module Ronin
|
|
85
85
|
# @option options [Array<String, Regexp, Proc>] :ignore_exts
|
86
86
|
# The patterns which match the URI path extensions to not visit.
|
87
87
|
#
|
88
|
+
# @option options [Boolean] :verbose (true)
|
89
|
+
# Specifies whether every URL will be printed.
|
90
|
+
#
|
88
91
|
# @yield [spider]
|
89
92
|
# If a block is given, it will be passed the newly created spider.
|
90
93
|
#
|
@@ -103,8 +106,10 @@ module Ronin
|
|
103
106
|
|
104
107
|
super(options)
|
105
108
|
|
106
|
-
|
107
|
-
|
109
|
+
if options.fetch(:verbose,true)
|
110
|
+
every_url do |url|
|
111
|
+
print_info("Spidering #{url}")
|
112
|
+
end
|
108
113
|
end
|
109
114
|
|
110
115
|
yield self if block_given?
|
@@ -0,0 +1,196 @@
|
|
1
|
+
#
|
2
|
+
# Ronin Web - A Ruby library for Ronin that provides support for web
|
3
|
+
# scraping and spidering functionality.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2006-2011 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This file is part of Ronin Web.
|
8
|
+
#
|
9
|
+
# Ronin is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# Ronin is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with Ronin. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'ronin/web/config'
|
24
|
+
|
25
|
+
require 'set'
|
26
|
+
|
27
|
+
module Ronin
|
28
|
+
module Web
|
29
|
+
#
|
30
|
+
# Represents the set of `User-Agent` strings loaded from all
|
31
|
+
# `data/ronin/web/user_agents.yml` files.
|
32
|
+
#
|
33
|
+
# ## ronin/web/user_agents.yml
|
34
|
+
#
|
35
|
+
# The `user_agents.yml` files are essentially YAML files listing
|
36
|
+
# `User-Agent` strings grouped by category:
|
37
|
+
#
|
38
|
+
# ---
|
39
|
+
# :googlebot:
|
40
|
+
# - "Googlebot/2.1 ( http://www.googlebot.com/bot.html)"
|
41
|
+
# - "Googlebot-Image/1.0 ( http://www.googlebot.com/bot.html)"
|
42
|
+
# - "Mediapartners-Google/2.1"
|
43
|
+
# - "Google-Sitemaps/1.0"
|
44
|
+
#
|
45
|
+
# These files can be added to Ronin Repositories or to Ronin libraries,
|
46
|
+
# and will be loaded by the {UserAgents} objects.
|
47
|
+
#
|
48
|
+
# @since 0.3.0
|
49
|
+
#
|
50
|
+
class UserAgents
|
51
|
+
|
52
|
+
include Enumerable
|
53
|
+
|
54
|
+
# Relative path to the User-Agents file.
|
55
|
+
FILE = File.join('ronin','web','user_agents.yml')
|
56
|
+
|
57
|
+
#
|
58
|
+
# Creates a new User-Agent set.
|
59
|
+
#
|
60
|
+
# @api semipublic
|
61
|
+
#
|
62
|
+
def initialize
|
63
|
+
@files = Set[]
|
64
|
+
@user_agents = Hash.new { |hash,key| hash[key] = Set[] }
|
65
|
+
end
|
66
|
+
|
67
|
+
#
|
68
|
+
# The categories of `User-Agent` strings.
|
69
|
+
#
|
70
|
+
# @return [Array<Symbol>]
|
71
|
+
# The names of the categories.
|
72
|
+
#
|
73
|
+
# @api public
|
74
|
+
#
|
75
|
+
def categories
|
76
|
+
reload!
|
77
|
+
|
78
|
+
@user_agents.keys
|
79
|
+
end
|
80
|
+
|
81
|
+
#
|
82
|
+
# Iterates over each User-Agent in the set.
|
83
|
+
#
|
84
|
+
# @yield [ua]
|
85
|
+
# The given block will be passed each User-Agent.
|
86
|
+
#
|
87
|
+
# @yieldparam [String] ua
|
88
|
+
# A User-Agent string within the set.
|
89
|
+
#
|
90
|
+
# @return [Enumerator]
|
91
|
+
# If no block is given, an Enumerator will be returned.
|
92
|
+
#
|
93
|
+
# @api public
|
94
|
+
#
|
95
|
+
def each(&block)
|
96
|
+
return enum_for(:each) unless block_given?
|
97
|
+
|
98
|
+
@user_agents.each do |name,strings|
|
99
|
+
strings.each(&block)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
#
|
104
|
+
# Selects a `User-Agent` string from the set.
|
105
|
+
#
|
106
|
+
# @param [Symbol, String, Regexp] key
|
107
|
+
# The User-Agents group name, sub-string or Regexp to search for.
|
108
|
+
#
|
109
|
+
# @return [String, nil]
|
110
|
+
# The matching `User-Agent` string.
|
111
|
+
#
|
112
|
+
# @api public
|
113
|
+
#
|
114
|
+
def [](key)
|
115
|
+
reload!
|
116
|
+
|
117
|
+
case key
|
118
|
+
when Symbol
|
119
|
+
if @user_agents.has_key?(key)
|
120
|
+
strings = @user_agents[key]
|
121
|
+
return strings.entries[rand(strings.length)]
|
122
|
+
end
|
123
|
+
when String
|
124
|
+
@user_agents.each do |name,strings|
|
125
|
+
strings.each do |string|
|
126
|
+
return string if string.include?(key)
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
return nil
|
131
|
+
when Regexp
|
132
|
+
@user_agents.each do |name,strings|
|
133
|
+
strings.each do |string|
|
134
|
+
return string if string =~ key
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
return nil
|
139
|
+
else
|
140
|
+
raise(TypeError,"key must be a Symbol, String or Regexp")
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
#
|
145
|
+
# Fetches a `User-Agent` string from the set.
|
146
|
+
#
|
147
|
+
# @param [Symbol, String, Regexp] key
|
148
|
+
# The User-Agents group name, sub-string or Regexp to search for.
|
149
|
+
#
|
150
|
+
# @param [String] default
|
151
|
+
# The `User-Agent` string to default to if no match is found.
|
152
|
+
#
|
153
|
+
# @return [String]
|
154
|
+
# The matching `User-Agent` string.
|
155
|
+
#
|
156
|
+
# @raise [ArgumentError]
|
157
|
+
# No matching `User-Agent` string was found, and no default value
|
158
|
+
# was given.
|
159
|
+
#
|
160
|
+
# @api public
|
161
|
+
#
|
162
|
+
def fetch(key,default=nil)
|
163
|
+
unless (string = (self[key] || default))
|
164
|
+
raise(ArgumentError,"no User-Agent strings match #{key.inspect}")
|
165
|
+
end
|
166
|
+
|
167
|
+
return string
|
168
|
+
end
|
169
|
+
|
170
|
+
protected
|
171
|
+
|
172
|
+
#
|
173
|
+
# Reloads the set of User-Agents.
|
174
|
+
#
|
175
|
+
# @api private
|
176
|
+
#
|
177
|
+
def reload!
|
178
|
+
Config.each_data_file(FILE) do |path|
|
179
|
+
next if @files.include?(path)
|
180
|
+
|
181
|
+
data = YAML.load_file(path)
|
182
|
+
|
183
|
+
unless data.kind_of?(Hash)
|
184
|
+
warn "#{path.dump} did not contain a Hash"
|
185
|
+
next
|
186
|
+
end
|
187
|
+
|
188
|
+
data.each do |name,strings|
|
189
|
+
@user_agents[name.to_sym].merge(strings)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
data/lib/ronin/web/version.rb
CHANGED
data/lib/ronin/web/web.rb
CHANGED
@@ -20,12 +20,13 @@
|
|
20
20
|
# along with Ronin. If not, see <http://www.gnu.org/licenses/>.
|
21
21
|
#
|
22
22
|
|
23
|
+
require 'ronin/web/user_agents'
|
24
|
+
require 'ronin/web/mechanize'
|
23
25
|
require 'ronin/network/http/proxy'
|
24
26
|
require 'ronin/network/http/http'
|
25
27
|
|
26
28
|
require 'uri/http'
|
27
29
|
require 'nokogiri'
|
28
|
-
require 'mechanize'
|
29
30
|
require 'open-uri'
|
30
31
|
|
31
32
|
module Ronin
|
@@ -168,12 +169,29 @@ module Ronin
|
|
168
169
|
@proxy = Network::HTTP::Proxy.create(new_proxy)
|
169
170
|
end
|
170
171
|
|
172
|
+
#
|
173
|
+
# A set of common `User-Agent` strings.
|
174
|
+
#
|
175
|
+
# @return [UserAgents]
|
176
|
+
# The set of `User-Agent` strings.
|
177
|
+
#
|
178
|
+
# @since 0.3.0
|
179
|
+
#
|
180
|
+
# @api public
|
181
|
+
#
|
182
|
+
def Web.user_agents
|
183
|
+
@user_agents ||= UserAgents.new
|
184
|
+
end
|
185
|
+
|
171
186
|
#
|
172
187
|
# @return [Array]
|
173
188
|
# The supported Web User-Agent Aliases.
|
174
189
|
#
|
175
190
|
# @see http://rubydoc.info/gems/mechanize/Mechanize#AGENT_ALIASES-constant
|
176
191
|
#
|
192
|
+
# @deprecated
|
193
|
+
# Will be replaced by {user_agents} in 1.0.0.
|
194
|
+
#
|
177
195
|
# @api public
|
178
196
|
#
|
179
197
|
def Web.user_agent_aliases
|
@@ -195,18 +213,30 @@ module Ronin
|
|
195
213
|
end
|
196
214
|
|
197
215
|
#
|
198
|
-
# Sets the User-Agent string used by {Web}.
|
216
|
+
# Sets the `User-Agent` string used by {Web}.
|
199
217
|
#
|
200
|
-
# @param [String]
|
218
|
+
# @param [String, Symbol, Regexp, nil] value
|
201
219
|
# The User-Agent string to use.
|
220
|
+
# Setting {user_agent} to `nil` will disable the `User-Agent` string.
|
202
221
|
#
|
203
222
|
# @return [String]
|
204
223
|
# The new User-Agent string.
|
205
224
|
#
|
225
|
+
# @raise [ArgumentError]
|
226
|
+
# Either no User-Agent group exists with the given `Symbol`,
|
227
|
+
# or no User-Agent string matched the given `Regexp`.
|
228
|
+
#
|
206
229
|
# @api public
|
207
230
|
#
|
208
|
-
def Web.user_agent=(
|
209
|
-
@user_agent =
|
231
|
+
def Web.user_agent=(value)
|
232
|
+
@user_agent = case value
|
233
|
+
when String
|
234
|
+
user_agents.fetch(value,value)
|
235
|
+
when nil
|
236
|
+
nil
|
237
|
+
else
|
238
|
+
user_agents.fetch(value)
|
239
|
+
end
|
210
240
|
end
|
211
241
|
|
212
242
|
#
|
@@ -220,6 +250,10 @@ module Ronin
|
|
220
250
|
#
|
221
251
|
# @see user_agent_aliases
|
222
252
|
#
|
253
|
+
# @deprecated
|
254
|
+
# Will be replaced by calling {user_agent=} with a `Symbol`
|
255
|
+
# and will be removed in 1.0.0.
|
256
|
+
#
|
223
257
|
# @api public
|
224
258
|
#
|
225
259
|
def Web.user_agent_alias=(name)
|
@@ -232,12 +266,12 @@ module Ronin
|
|
232
266
|
# @param [Hash] options
|
233
267
|
# Additional options.
|
234
268
|
#
|
235
|
-
# @option options [String] :user_agent_alias
|
236
|
-
# The User-Agent Alias to use.
|
237
|
-
#
|
238
269
|
# @option options [String] :user_agent
|
239
270
|
# The User-Agent string to use.
|
240
271
|
#
|
272
|
+
# @option options [String] :user_agent_alias
|
273
|
+
# The User-Agent Alias to use.
|
274
|
+
#
|
241
275
|
# @option options [Network::HTTP::Proxy, Hash, String] :proxy
|
242
276
|
# (Web.proxy)
|
243
277
|
# Proxy information.
|
@@ -309,68 +343,21 @@ module Ronin
|
|
309
343
|
end
|
310
344
|
|
311
345
|
#
|
312
|
-
#
|
313
|
-
#
|
314
|
-
# @param [Hash] options
|
315
|
-
# Additional options.
|
316
|
-
#
|
317
|
-
# @option options [String] :user_agent_alias
|
318
|
-
# The User-Agent Alias to use.
|
319
|
-
#
|
320
|
-
# @option options [String] :user_agent
|
321
|
-
# The User-Agent string to use.
|
322
|
-
#
|
323
|
-
# @option options [Network::HTTP::Proxy, Hash, String] :proxy
|
324
|
-
# (Web.proxy)
|
325
|
-
# Proxy information.
|
326
|
-
#
|
327
|
-
# @yield [agent]
|
328
|
-
# If a block is given, it will be passed the newly created Mechanize
|
329
|
-
# agent.
|
330
|
-
#
|
331
|
-
# @yieldparam [Mechanize] agent
|
332
|
-
# The new Mechanize agent.
|
346
|
+
# A persistent Mechanize Agent.
|
333
347
|
#
|
334
348
|
# @return [Mechanize]
|
335
|
-
# The
|
349
|
+
# The persistent Mechanize Agent.
|
336
350
|
#
|
337
|
-
# @
|
338
|
-
# Web.agent
|
339
|
-
#
|
340
|
-
# @example Create a new agent, with a custom User-Agent alias.
|
341
|
-
# Web.agent(:user_agent_alias => 'Linux Mozilla')
|
342
|
-
#
|
343
|
-
# @example Create a new agent, with a custom User-Agent string.
|
344
|
-
# Web.agent(:user_agent => 'wooden pants')
|
345
|
-
#
|
346
|
-
# @see http://rubydoc.info/gems/mechanize/Mechanize
|
351
|
+
# @see Mechanize
|
347
352
|
#
|
348
353
|
# @api public
|
349
354
|
#
|
350
355
|
def Web.agent(options={})
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
agent
|
355
|
-
elsif options[:user_agent]
|
356
|
-
agent.user_agent = options[:user_agent]
|
357
|
-
elsif Web.user_agent
|
358
|
-
agent.user_agent = Web.user_agent
|
359
|
-
end
|
360
|
-
|
361
|
-
proxy = Network::HTTP::Proxy.new(options[:proxy] || Web.proxy)
|
362
|
-
|
363
|
-
if proxy[:host]
|
364
|
-
agent.set_proxy(
|
365
|
-
proxy[:host],
|
366
|
-
proxy[:port],
|
367
|
-
proxy[:user],
|
368
|
-
proxy[:password]
|
369
|
-
)
|
356
|
+
if options.empty?
|
357
|
+
@agent ||= Mechanize.new(options)
|
358
|
+
else
|
359
|
+
@agent = Mechanize.new(options)
|
370
360
|
end
|
371
|
-
|
372
|
-
yield agent if block_given?
|
373
|
-
return agent
|
374
361
|
end
|
375
362
|
|
376
363
|
#
|
@@ -382,12 +369,12 @@ module Ronin
|
|
382
369
|
# @param [Hash] options
|
383
370
|
# Additional options.
|
384
371
|
#
|
385
|
-
# @option options [String] :user_agent_alias
|
386
|
-
# The User-Agent Alias to use.
|
387
|
-
#
|
388
372
|
# @option options [String] :user_agent
|
389
373
|
# The User-Agent string to use.
|
390
374
|
#
|
375
|
+
# @option options [String] :user_agent_alias
|
376
|
+
# The User-Agent Alias to use.
|
377
|
+
#
|
391
378
|
# @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
|
392
379
|
# Proxy information.
|
393
380
|
#
|
@@ -433,12 +420,12 @@ module Ronin
|
|
433
420
|
# @param [Hash] options
|
434
421
|
# Additional options.
|
435
422
|
#
|
436
|
-
# @option options [String] :user_agent_alias
|
437
|
-
# The User-Agent Alias to use.
|
438
|
-
#
|
439
423
|
# @option options [String] :user_agent
|
440
424
|
# The User-Agent string to use.
|
441
425
|
#
|
426
|
+
# @option options [String] :user_agent_alias
|
427
|
+
# The User-Agent Alias to use.
|
428
|
+
#
|
442
429
|
# @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
|
443
430
|
# Proxy information.
|
444
431
|
#
|
@@ -482,12 +469,12 @@ module Ronin
|
|
482
469
|
# @option options [Hash] :query
|
483
470
|
# Additional query parameters to post with.
|
484
471
|
#
|
485
|
-
# @option options [String] :user_agent_alias
|
486
|
-
# The User-Agent Alia to use.
|
487
|
-
#
|
488
472
|
# @option options [String] :user_agent
|
489
473
|
# The User-Agent string to use.
|
490
474
|
#
|
475
|
+
# @option options [String] :user_agent_alias
|
476
|
+
# The User-Agent Alias to use.
|
477
|
+
#
|
491
478
|
# @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
|
492
479
|
# Proxy information.
|
493
480
|
#
|
@@ -532,12 +519,12 @@ module Ronin
|
|
532
519
|
# @option options [Hash] :query
|
533
520
|
# Additional query parameters to post with.
|
534
521
|
#
|
535
|
-
# @option options [String] :user_agent_alias
|
536
|
-
# The User-Agent Alias to use.
|
537
|
-
#
|
538
522
|
# @option options [String] :user_agent
|
539
523
|
# The User-Agent string to use.
|
540
524
|
#
|
525
|
+
# @option options [String] :user_agent_alias
|
526
|
+
# The User-Agent Alias to use.
|
527
|
+
#
|
541
528
|
# @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
|
542
529
|
# Proxy information.
|
543
530
|
#
|