ronin-web 0.3.0.pre2 → 0.3.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog.md +3 -0
- data/Gemfile +1 -1
- data/README.md +2 -1
- data/Rakefile +4 -4
- data/bin/ronin-web +2 -2
- data/data/ronin/web/user_agents.yml +247 -0
- data/gemspec.yml +3 -6
- data/lib/ronin/network/mixins/web.rb +3 -1
- data/lib/ronin/web/config.rb +34 -0
- data/lib/ronin/web/mechanize.rb +81 -0
- data/lib/ronin/web/spider.rb +7 -2
- data/lib/ronin/web/user_agents.rb +196 -0
- data/lib/ronin/web/version.rb +1 -1
- data/lib/ronin/web/web.rb +61 -74
- data/ronin-web.gemspec +129 -13
- data/spec/web/helpers/rack_app.rb +1 -8
- data/spec/web/mechanize_spec.rb +62 -0
- data/spec/web/user_agents_spec.rb +56 -0
- data/spec/web/web_spec.rb +2 -58
- metadata +14 -6
data/lib/ronin/web/spider.rb
CHANGED
@@ -85,6 +85,9 @@ module Ronin
|
|
85
85
|
# @option options [Array<String, Regexp, Proc>] :ignore_exts
|
86
86
|
# The patterns which match the URI path extensions to not visit.
|
87
87
|
#
|
88
|
+
# @option options [Boolean] :verbose (true)
|
89
|
+
# Specifies whether every URL will be printed.
|
90
|
+
#
|
88
91
|
# @yield [spider]
|
89
92
|
# If a block is given, it will be passed the newly created spider.
|
90
93
|
#
|
@@ -103,8 +106,10 @@ module Ronin
|
|
103
106
|
|
104
107
|
super(options)
|
105
108
|
|
106
|
-
|
107
|
-
|
109
|
+
if options.fetch(:verbose,true)
|
110
|
+
every_url do |url|
|
111
|
+
print_info("Spidering #{url}")
|
112
|
+
end
|
108
113
|
end
|
109
114
|
|
110
115
|
yield self if block_given?
|
@@ -0,0 +1,196 @@
|
|
1
|
+
#
|
2
|
+
# Ronin Web - A Ruby library for Ronin that provides support for web
|
3
|
+
# scraping and spidering functionality.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2006-2011 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This file is part of Ronin Web.
|
8
|
+
#
|
9
|
+
# Ronin is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# Ronin is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with Ronin. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'ronin/web/config'
|
24
|
+
|
25
|
+
require 'set'
|
26
|
+
|
27
|
+
module Ronin
|
28
|
+
module Web
|
29
|
+
#
|
30
|
+
# Represents the set of `User-Agent` strings loaded from all
|
31
|
+
# `data/ronin/web/user_agents.yml` files.
|
32
|
+
#
|
33
|
+
# ## ronin/web/user_agents.yml
|
34
|
+
#
|
35
|
+
# The `user_agents.yml` files are essentially YAML files listing
|
36
|
+
# `User-Agent` strings grouped by category:
|
37
|
+
#
|
38
|
+
# ---
|
39
|
+
# :googlebot:
|
40
|
+
# - "Googlebot/2.1 ( http://www.googlebot.com/bot.html)"
|
41
|
+
# - "Googlebot-Image/1.0 ( http://www.googlebot.com/bot.html)"
|
42
|
+
# - "Mediapartners-Google/2.1"
|
43
|
+
# - "Google-Sitemaps/1.0"
|
44
|
+
#
|
45
|
+
# These files can be added to Ronin Repositories or to Ronin libraries,
|
46
|
+
# and will be loaded by the {UserAgents} objects.
|
47
|
+
#
|
48
|
+
# @since 0.3.0
|
49
|
+
#
|
50
|
+
class UserAgents

  include Enumerable

  # Relative path to the User-Agents file.
  FILE = File.join('ronin','web','user_agents.yml')

  #
  # Creates a new, empty User-Agent set. Data files are loaded lazily,
  # the first time the set is queried or enumerated.
  #
  # @api semipublic
  #
  def initialize
    # paths of the data files which have already been parsed
    @files = Set[]
    # category name (Symbol) => Set of User-Agent strings
    @user_agents = Hash.new { |hash,key| hash[key] = Set[] }
  end

  #
  # The categories of `User-Agent` strings.
  #
  # @return [Array<Symbol>]
  #   The names of the categories.
  #
  # @api public
  #
  def categories
    reload!

    @user_agents.keys
  end

  #
  # Iterates over each User-Agent in the set.
  #
  # @yield [ua]
  #   The given block will be passed each User-Agent.
  #
  # @yieldparam [String] ua
  #   A User-Agent string within the set.
  #
  # @return [Enumerator]
  #   If no block is given, an Enumerator will be returned.
  #
  # @api public
  #
  def each(&block)
    return enum_for(:each) unless block_given?

    # load any pending data files, so that enumerating a fresh set
    # sees the same data as #[] and #categories
    reload!

    @user_agents.each do |name,strings|
      strings.each(&block)
    end
  end

  #
  # Selects a `User-Agent` string from the set.
  #
  # @param [Symbol, String, Regexp] key
  #   The User-Agents group name, sub-string or Regexp to search for.
  #   A `Symbol` selects a random string from the named group, while a
  #   `String` or `Regexp` selects the first string that matches.
  #
  # @return [String, nil]
  #   The matching `User-Agent` string, or `nil` if nothing matched.
  #
  # @raise [TypeError]
  #   The key was not a `Symbol`, `String` or `Regexp`.
  #
  # @api public
  #
  def [](key)
    reload!

    case key
    when Symbol
      # check has_key? first, the Hash default block would otherwise
      # create an empty category for every unknown name
      return nil unless @user_agents.has_key?(key)

      strings = @user_agents[key].to_a
      strings[rand(strings.length)] unless strings.empty?
    when String
      find { |string| string.include?(key) }
    when Regexp
      find { |string| string =~ key }
    else
      raise(TypeError,"key must be a Symbol, String or Regexp")
    end
  end

  #
  # Fetches a `User-Agent` string from the set.
  #
  # @param [Symbol, String, Regexp] key
  #   The User-Agents group name, sub-string or Regexp to search for.
  #
  # @param [String] default
  #   The `User-Agent` string to default to if no match is found.
  #
  # @return [String]
  #   The matching `User-Agent` string.
  #
  # @raise [ArgumentError]
  #   No matching `User-Agent` string was found, and no default value
  #   was given.
  #
  # @api public
  #
  def fetch(key,default=nil)
    unless (string = (self[key] || default))
      raise(ArgumentError,"no User-Agent strings match #{key.inspect}")
    end

    return string
  end

  protected

  #
  # Loads every User-Agents data file which has not been loaded yet.
  # Each file is parsed at most once; files yielded for the first time
  # on a later call are merged into the existing set.
  #
  # @api private
  #
  def reload!
    Config.each_data_file(FILE) do |path|
      next if @files.include?(path)

      # NOTE(review): these files may come from third-party repositories;
      # consider YAML.safe_load (permitting Symbol) once the supported
      # Ruby versions are confirmed.
      data = YAML.load_file(path)

      # record the file only after it parsed, so that it is neither
      # re-read nor re-reported on every lookup
      @files << path

      unless data.kind_of?(Hash)
        warn "#{path.dump} did not contain a Hash"
        next
      end

      data.each do |name,strings|
        # Array() tolerates an empty category (nil) or a single bare String
        @user_agents[name.to_sym].merge(Array(strings))
      end
    end
  end

end
|
195
|
+
end
|
196
|
+
end
|
data/lib/ronin/web/version.rb
CHANGED
data/lib/ronin/web/web.rb
CHANGED
@@ -20,12 +20,13 @@
|
|
20
20
|
# along with Ronin. If not, see <http://www.gnu.org/licenses/>.
|
21
21
|
#
|
22
22
|
|
23
|
+
require 'ronin/web/user_agents'
|
24
|
+
require 'ronin/web/mechanize'
|
23
25
|
require 'ronin/network/http/proxy'
|
24
26
|
require 'ronin/network/http/http'
|
25
27
|
|
26
28
|
require 'uri/http'
|
27
29
|
require 'nokogiri'
|
28
|
-
require 'mechanize'
|
29
30
|
require 'open-uri'
|
30
31
|
|
31
32
|
module Ronin
|
@@ -168,12 +169,29 @@ module Ronin
|
|
168
169
|
@proxy = Network::HTTP::Proxy.create(new_proxy)
|
169
170
|
end
|
170
171
|
|
172
|
+
#
|
173
|
+
# A set of common `User-Agent` strings.
|
174
|
+
#
|
175
|
+
# @return [UserAgents]
|
176
|
+
# The set of `User-Agent` strings.
|
177
|
+
#
|
178
|
+
# @since 0.3.0
|
179
|
+
#
|
180
|
+
# @api public
|
181
|
+
#
|
182
|
+
def Web.user_agents
  # reuse the memoized set once it has been built
  return @user_agents if @user_agents

  @user_agents = UserAgents.new
end
|
185
|
+
|
171
186
|
#
|
172
187
|
# @return [Array]
|
173
188
|
# The supported Web User-Agent Aliases.
|
174
189
|
#
|
175
190
|
# @see http://rubydoc.info/gems/mechanize/Mechanize#AGENT_ALIASES-constant
|
176
191
|
#
|
192
|
+
# @deprecated
|
193
|
+
# Will be replaced by {user_agents} in 1.0.0.
|
194
|
+
#
|
177
195
|
# @api public
|
178
196
|
#
|
179
197
|
def Web.user_agent_aliases
|
@@ -195,18 +213,30 @@ module Ronin
|
|
195
213
|
end
|
196
214
|
|
197
215
|
#
|
198
|
-
# Sets the User-Agent string used by {Web}.
|
216
|
+
# Sets the `User-Agent` string used by {Web}.
|
199
217
|
#
|
200
|
-
# @param [String]
|
218
|
+
# @param [String, Symbol, Regexp, nil] value
|
201
219
|
# The User-Agent string to use.
|
220
|
+
# Setting {user_agent} to `nil` will disable the `User-Agent` string.
|
202
221
|
#
|
203
222
|
# @return [String]
|
204
223
|
# The new User-Agent string.
|
205
224
|
#
|
225
|
+
# @raise [ArgumentError]
|
226
|
+
# Either no User-Agent group exists with the given `Symbol`,
|
227
|
+
# or no User-Agent string matched the given `Regexp`.
|
228
|
+
#
|
206
229
|
# @api public
|
207
230
|
#
|
208
|
-
def Web.user_agent=(
|
209
|
-
@user_agent =
|
231
|
+
def Web.user_agent=(value)
  @user_agent = if value.nil?
                  # nil disables the User-Agent string
                  nil
                elsif String === value
                  # use a known User-Agent containing the given text,
                  # falling back to the given String itself
                  user_agents.fetch(value,value)
                else
                  # any other key must match a known User-Agent
                  user_agents.fetch(value)
                end
end
|
211
241
|
|
212
242
|
#
|
@@ -220,6 +250,10 @@ module Ronin
|
|
220
250
|
#
|
221
251
|
# @see user_agent_aliases
|
222
252
|
#
|
253
|
+
# @deprecated
|
254
|
+
# Will be replaced by calling {user_agent=} with a `Symbol`
|
255
|
+
# and will be removed in 1.0.0.
|
256
|
+
#
|
223
257
|
# @api public
|
224
258
|
#
|
225
259
|
def Web.user_agent_alias=(name)
|
@@ -232,12 +266,12 @@ module Ronin
|
|
232
266
|
# @param [Hash] options
|
233
267
|
# Additional options.
|
234
268
|
#
|
235
|
-
# @option options [String] :user_agent_alias
|
236
|
-
# The User-Agent Alias to use.
|
237
|
-
#
|
238
269
|
# @option options [String] :user_agent
|
239
270
|
# The User-Agent string to use.
|
240
271
|
#
|
272
|
+
# @option options [String] :user_agent_alias
|
273
|
+
# The User-Agent Alias to use.
|
274
|
+
#
|
241
275
|
# @option options [Network::HTTP::Proxy, Hash, String] :proxy
|
242
276
|
# (Web.proxy)
|
243
277
|
# Proxy information.
|
@@ -309,68 +343,21 @@ module Ronin
|
|
309
343
|
end
|
310
344
|
|
311
345
|
#
|
312
|
-
#
|
313
|
-
#
|
314
|
-
# @param [Hash] options
|
315
|
-
# Additional options.
|
316
|
-
#
|
317
|
-
# @option options [String] :user_agent_alias
|
318
|
-
# The User-Agent Alias to use.
|
319
|
-
#
|
320
|
-
# @option options [String] :user_agent
|
321
|
-
# The User-Agent string to use.
|
322
|
-
#
|
323
|
-
# @option options [Network::HTTP::Proxy, Hash, String] :proxy
|
324
|
-
# (Web.proxy)
|
325
|
-
# Proxy information.
|
326
|
-
#
|
327
|
-
# @yield [agent]
|
328
|
-
# If a block is given, it will be passed the newly created Mechanize
|
329
|
-
# agent.
|
330
|
-
#
|
331
|
-
# @yieldparam [Mechanize] agent
|
332
|
-
# The new Mechanize agent.
|
346
|
+
# A persistent Mechanize Agent.
|
333
347
|
#
|
334
348
|
# @return [Mechanize]
|
335
|
-
# The
|
349
|
+
# The persistent Mechanize Agent.
|
336
350
|
#
|
337
|
-
# @
|
338
|
-
# Web.agent
|
339
|
-
#
|
340
|
-
# @example Create a new agent, with a custom User-Agent alias.
|
341
|
-
# Web.agent(:user_agent_alias => 'Linux Mozilla')
|
342
|
-
#
|
343
|
-
# @example Create a new agent, with a custom User-Agent string.
|
344
|
-
# Web.agent(:user_agent => 'wooden pants')
|
345
|
-
#
|
346
|
-
# @see http://rubydoc.info/gems/mechanize/Mechanize
|
351
|
+
# @see Mechanize
|
347
352
|
#
|
348
353
|
# @api public
|
349
354
|
#
|
350
355
|
def Web.agent(options={})
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
agent
|
355
|
-
elsif options[:user_agent]
|
356
|
-
agent.user_agent = options[:user_agent]
|
357
|
-
elsif Web.user_agent
|
358
|
-
agent.user_agent = Web.user_agent
|
359
|
-
end
|
360
|
-
|
361
|
-
proxy = Network::HTTP::Proxy.new(options[:proxy] || Web.proxy)
|
362
|
-
|
363
|
-
if proxy[:host]
|
364
|
-
agent.set_proxy(
|
365
|
-
proxy[:host],
|
366
|
-
proxy[:port],
|
367
|
-
proxy[:user],
|
368
|
-
proxy[:password]
|
369
|
-
)
|
356
|
+
if options.empty?
|
357
|
+
@agent ||= Mechanize.new(options)
|
358
|
+
else
|
359
|
+
@agent = Mechanize.new(options)
|
370
360
|
end
|
371
|
-
|
372
|
-
yield agent if block_given?
|
373
|
-
return agent
|
374
361
|
end
|
375
362
|
|
376
363
|
#
|
@@ -382,12 +369,12 @@ module Ronin
|
|
382
369
|
# @param [Hash] options
|
383
370
|
# Additional options.
|
384
371
|
#
|
385
|
-
# @option options [String] :user_agent_alias
|
386
|
-
# The User-Agent Alias to use.
|
387
|
-
#
|
388
372
|
# @option options [String] :user_agent
|
389
373
|
# The User-Agent string to use.
|
390
374
|
#
|
375
|
+
# @option options [String] :user_agent_alias
|
376
|
+
# The User-Agent Alias to use.
|
377
|
+
#
|
391
378
|
# @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
|
392
379
|
# Proxy information.
|
393
380
|
#
|
@@ -433,12 +420,12 @@ module Ronin
|
|
433
420
|
# @param [Hash] options
|
434
421
|
# Additional options.
|
435
422
|
#
|
436
|
-
# @option options [String] :user_agent_alias
|
437
|
-
# The User-Agent Alias to use.
|
438
|
-
#
|
439
423
|
# @option options [String] :user_agent
|
440
424
|
# The User-Agent string to use.
|
441
425
|
#
|
426
|
+
# @option options [String] :user_agent_alias
|
427
|
+
# The User-Agent Alias to use.
|
428
|
+
#
|
442
429
|
# @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
|
443
430
|
# Proxy information.
|
444
431
|
#
|
@@ -482,12 +469,12 @@ module Ronin
|
|
482
469
|
# @option options [Hash] :query
|
483
470
|
# Additional query parameters to post with.
|
484
471
|
#
|
485
|
-
# @option options [String] :user_agent_alias
|
486
|
-
# The User-Agent Alia to use.
|
487
|
-
#
|
488
472
|
# @option options [String] :user_agent
|
489
473
|
# The User-Agent string to use.
|
490
474
|
#
|
475
|
+
# @option options [String] :user_agent_alias
|
476
|
+
# The User-Agent Alias to use.
|
477
|
+
#
|
491
478
|
# @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
|
492
479
|
# Proxy information.
|
493
480
|
#
|
@@ -532,12 +519,12 @@ module Ronin
|
|
532
519
|
# @option options [Hash] :query
|
533
520
|
# Additional query parameters to post with.
|
534
521
|
#
|
535
|
-
# @option options [String] :user_agent_alias
|
536
|
-
# The User-Agent Alias to use.
|
537
|
-
#
|
538
522
|
# @option options [String] :user_agent
|
539
523
|
# The User-Agent string to use.
|
540
524
|
#
|
525
|
+
# @option options [String] :user_agent_alias
|
526
|
+
# The User-Agent Alias to use.
|
527
|
+
#
|
541
528
|
# @option options [Network::HTTP::Proxy, Hash] :proxy (Web.proxy)
|
542
529
|
# Proxy information.
|
543
530
|
#
|