ronin-web-spider 0.1.0.beta2 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +17 -5
- data/.rubocop.yml +11 -0
- data/.yardopts +1 -1
- data/ChangeLog.md +23 -1
- data/Gemfile +3 -0
- data/README.md +303 -32
- data/Rakefile +2 -2
- data/gemspec.yml +4 -4
- data/lib/ronin/web/spider/agent.rb +123 -7
- data/lib/ronin/web/spider/archive.rb +4 -0
- data/lib/ronin/web/spider/exceptions.rb +2 -1
- data/lib/ronin/web/spider/git_archive.rb +3 -2
- data/lib/ronin/web/spider/version.rb +3 -2
- data/lib/ronin/web/spider.rb +290 -1
- data/ronin-web-spider.gemspec +5 -4
- metadata +10 -19
- data/spec/agent_spec.rb +0 -585
- data/spec/archive_spec.rb +0 -91
- data/spec/example_app.rb +0 -27
- data/spec/git_archive_spec.rb +0 -137
- data/spec/spec_helper.rb +0 -4
- data/spec/spider_spec.rb +0 -252
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -122,6 +123,8 @@ module Ronin
|
|
122
123
|
# The visited host names.
|
123
124
|
#
|
124
125
|
# @return [Set<String>, nil]
|
126
|
+
#
|
127
|
+
# @api public
|
125
128
|
attr_reader :visited_hosts
|
126
129
|
|
127
130
|
#
|
@@ -132,6 +135,13 @@ module Ronin
|
|
132
135
|
#
|
133
136
|
# @yieldparam [String] host
|
134
137
|
#
|
138
|
+
# @example
|
139
|
+
# spider.every_host do |host|
|
140
|
+
# puts "Spidering #{host} ..."
|
141
|
+
# end
|
142
|
+
#
|
143
|
+
# @api public
|
144
|
+
#
|
135
145
|
def every_host
|
136
146
|
@visited_hosts ||= Set.new
|
137
147
|
|
@@ -147,6 +157,8 @@ module Ronin
|
|
147
157
|
# All certificates encountered while spidering.
|
148
158
|
#
|
149
159
|
# @return [Array<Ronin::Support::Crypto::Cert>]
|
160
|
+
#
|
161
|
+
# @api public
|
150
162
|
attr_reader :collected_certs
|
151
163
|
|
152
164
|
#
|
@@ -157,6 +169,13 @@ module Ronin
|
|
157
169
|
#
|
158
170
|
# @yieldparam [Ronin::Support::Crypto::Cert]
|
159
171
|
#
|
172
|
+
# @example
|
173
|
+
# spider.every_cert do |cert|
|
174
|
+
# puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
175
|
+
# end
|
176
|
+
#
|
177
|
+
# @api public
|
178
|
+
#
|
160
179
|
def every_cert
|
161
180
|
@collected_certs ||= []
|
162
181
|
|
@@ -185,8 +204,15 @@ module Ronin
|
|
185
204
|
# @yieldparam [Spidr::Page] favicon
|
186
205
|
# An encountered `.ico` file.
|
187
206
|
#
|
207
|
+
# @example
|
208
|
+
# spider.every_favicon do |page|
|
209
|
+
# # ...
|
210
|
+
# end
|
211
|
+
#
|
188
212
|
# @see https://rubydoc.info/gems/spidr/Spidr/Page
|
189
213
|
#
|
214
|
+
# @api public
|
215
|
+
#
|
190
216
|
def every_favicon
|
191
217
|
every_page do |page|
|
192
218
|
yield page if page.icon?
|
@@ -197,14 +223,23 @@ module Ronin
|
|
197
223
|
# Passes every non-empty HTML comment to the given block.
|
198
224
|
#
|
199
225
|
# @yield [comment]
|
200
|
-
# The given block will be
|
226
|
+
# The given block will be passed every HTML comment.
|
201
227
|
#
|
202
228
|
# @yieldparam [String] comment
|
203
229
|
# The HTML comment inner text, with leading and trailing whitespace
|
204
230
|
# stripped.
|
205
231
|
#
|
232
|
+
# @example
|
233
|
+
# spider.every_html_comment do |comment|
|
234
|
+
# puts comment
|
235
|
+
# end
|
236
|
+
#
|
237
|
+
# @api public
|
238
|
+
#
|
206
239
|
def every_html_comment
|
207
240
|
every_html_page do |page|
|
241
|
+
next unless page.doc
|
242
|
+
|
208
243
|
page.doc.xpath('//comment()').each do |comment|
|
209
244
|
comment_text = comment.inner_text.strip
|
210
245
|
|
@@ -224,24 +259,71 @@ module Ronin
|
|
224
259
|
# @yieldparam [String] js
|
225
260
|
# The JavaScript source code.
|
226
261
|
#
|
262
|
+
# @example
|
263
|
+
# spider.every_javascript do |js|
|
264
|
+
# puts js
|
265
|
+
# end
|
266
|
+
#
|
267
|
+
# @api public
|
268
|
+
#
|
227
269
|
def every_javascript
|
228
270
|
# yield inner text of every `<script type="text/javascript">` tag
|
229
271
|
# and every `.js` URL.
|
230
272
|
every_html_page do |page|
|
273
|
+
next unless page.doc
|
274
|
+
|
231
275
|
page.doc.xpath('//script[@type="text/javascript"]').each do |script|
|
232
|
-
|
233
|
-
|
276
|
+
source = script.inner_text
|
277
|
+
source.force_encoding(Encoding::UTF_8)
|
278
|
+
|
279
|
+
unless source.empty?
|
280
|
+
yield source
|
234
281
|
end
|
235
282
|
end
|
236
283
|
end
|
237
284
|
|
238
285
|
every_javascript_page do |page|
|
239
|
-
|
286
|
+
source = page.body
|
287
|
+
source.force_encoding(Encoding::UTF_8)
|
288
|
+
|
289
|
+
yield source
|
240
290
|
end
|
241
291
|
end
|
242
292
|
|
243
293
|
alias every_js every_javascript
|
244
294
|
|
295
|
+
# Regex to match and skip JavaScript inline regexes.
|
296
|
+
#
|
297
|
+
# @api private
|
298
|
+
#
|
299
|
+
# @since 0.1.1
|
300
|
+
JAVASCRIPT_INLINE_REGEX = %r{
|
301
|
+
(?# match before the regex to avoid matching division operators )
|
302
|
+
(?:[\{\[\(;:,]\s*|=\s*)
|
303
|
+
/
|
304
|
+
(?# inline regex contents )
|
305
|
+
(?:
|
306
|
+
\[ (?:\\. | [^\]]) \] (?# [...] ) |
|
307
|
+
\\. (?# backslash escaped characters ) |
|
308
|
+
[^/] (?# everything else )
|
309
|
+
)+
|
310
|
+
/[dgimsuvy]* (?# also match any regex flags )
|
311
|
+
}mx
|
312
|
+
|
313
|
+
# Regex to match and skip JavaScript template literals.
|
314
|
+
#
|
315
|
+
# @note
|
316
|
+
# This regex will not properly match nested template literals:
|
317
|
+
#
|
318
|
+
# ```javascript
|
319
|
+
# `foo ${`bar ${1+1}`}`
|
320
|
+
# ```
|
321
|
+
#
|
322
|
+
# @api private
|
323
|
+
#
|
324
|
+
# @since 0.1.1
|
325
|
+
JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
|
326
|
+
|
245
327
|
#
|
246
328
|
# Passes every JavaScript string value to the given block.
|
247
329
|
#
|
@@ -252,10 +334,30 @@ module Ronin
|
|
252
334
|
# @yieldparam [String] string
|
253
335
|
# The parsed contents of a JavaScript string.
|
254
336
|
#
|
337
|
+
# @example
|
338
|
+
# spider.every_javascript_string do |str|
|
339
|
+
# puts str
|
340
|
+
# end
|
341
|
+
#
|
342
|
+
# @api public
|
343
|
+
#
|
255
344
|
def every_javascript_string
|
256
345
|
every_javascript do |js|
|
257
|
-
|
258
|
-
|
346
|
+
scanner = StringScanner.new(js)
|
347
|
+
|
348
|
+
until scanner.eos?
|
349
|
+
# NOTE: this is a naive JavaScript string scanner and should
|
350
|
+
# eventually be replaced with a real JavaScript lexer or parser.
|
351
|
+
case scanner.peek(1)
|
352
|
+
when '"', "'" # beginning of a quoted string
|
353
|
+
js_string = scanner.scan(Support::Text::Patterns::STRING)
|
354
|
+
|
355
|
+
yield Support::Encoding::JS.unquote(js_string)
|
356
|
+
else
|
357
|
+
scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
|
358
|
+
scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
|
359
|
+
scanner.getch
|
360
|
+
end
|
259
361
|
end
|
260
362
|
end
|
261
363
|
end
|
@@ -271,6 +373,13 @@ module Ronin
|
|
271
373
|
# @yieldparam [String] comment
|
272
374
|
# The contents of a JavaScript comment.
|
273
375
|
#
|
376
|
+
# @example
|
377
|
+
# spider.every_javascript_comment do |comment|
|
378
|
+
# puts comment
|
379
|
+
# end
|
380
|
+
#
|
381
|
+
# @api public
|
382
|
+
#
|
274
383
|
def every_javascript_comment(&block)
|
275
384
|
every_javascript do |js|
|
276
385
|
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
|
@@ -288,9 +397,16 @@ module Ronin
|
|
288
397
|
# @yieldparam [String] comment
|
289
398
|
# The contents of a HTML or JavaScript comment.
|
290
399
|
#
|
400
|
+
# @example
|
401
|
+
# spider.every_comment do |comment|
|
402
|
+
# puts comment
|
403
|
+
# end
|
404
|
+
#
|
291
405
|
# @see #every_html_comment
|
292
406
|
# @see #every_javascript_comment
|
293
407
|
#
|
408
|
+
# @api public
|
409
|
+
#
|
294
410
|
def every_comment(&block)
|
295
411
|
every_html_comment(&block)
|
296
412
|
every_javascript_comment(&block)
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
@@ -29,6 +30,9 @@ module Ronin
|
|
29
30
|
#
|
30
31
|
# Spider a host and archive every web page:
|
31
32
|
#
|
33
|
+
# require 'ronin/web/spider'
|
34
|
+
# require 'ronin/web/spider/archive'
|
35
|
+
#
|
32
36
|
# Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
|
33
37
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
34
38
|
# archive.write(page.url,page.body)
|
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
@@ -30,10 +31,10 @@ module Ronin
|
|
30
31
|
#
|
31
32
|
# Spider a host and archive every web page to a Git repository:
|
32
33
|
#
|
33
|
-
# require 'ronin/web/spider/git_archive'
|
34
34
|
# require 'ronin/web/spider'
|
35
|
+
# require 'ronin/web/spider/git_archive'
|
35
36
|
# require 'date'
|
36
|
-
#
|
37
|
+
#
|
37
38
|
# Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
|
38
39
|
# archive.commit("Updated #{Date.today}") do
|
39
40
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -21,7 +22,7 @@ module Ronin
|
|
21
22
|
module Web
|
22
23
|
module Spider
|
23
24
|
# ronin-web-spider version
|
24
|
-
VERSION = '0.1.
|
25
|
+
VERSION = '0.1.1'
|
25
26
|
end
|
26
27
|
end
|
27
28
|
end
|
data/lib/ronin/web/spider.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -22,6 +23,286 @@ require 'ronin/web/spider/version'
|
|
22
23
|
|
23
24
|
module Ronin
|
24
25
|
module Web
|
26
|
+
#
|
27
|
+
# A collection of common web spidering routines using the [spidr] gem.
|
28
|
+
#
|
29
|
+
# [spidr]: https://github.com/postmodern/spidr#readme
|
30
|
+
#
|
31
|
+
# ## Examples
|
32
|
+
#
|
33
|
+
# Start spidering from a URL:
|
34
|
+
#
|
35
|
+
# ```ruby
|
36
|
+
# require 'ronin/web/spider'
|
37
|
+
#
|
38
|
+
# Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
|
39
|
+
# # ...
|
40
|
+
# end
|
41
|
+
# ```
|
42
|
+
#
|
43
|
+
# Spider a host:
|
44
|
+
#
|
45
|
+
# ```ruby
|
46
|
+
# Ronin::Web::Spider.host('solnic.eu') do |agent|
|
47
|
+
# # ...
|
48
|
+
# end
|
49
|
+
# ```
|
50
|
+
#
|
51
|
+
# Spider a domain (and any sub-domains):
|
52
|
+
#
|
53
|
+
# ```ruby
|
54
|
+
# Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
|
55
|
+
# # ...
|
56
|
+
# end
|
57
|
+
# ```
|
58
|
+
#
|
59
|
+
# Spider a site:
|
60
|
+
#
|
61
|
+
# ```ruby
|
62
|
+
# Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
|
63
|
+
# # ...
|
64
|
+
# end
|
65
|
+
# ```
|
66
|
+
#
|
67
|
+
# Spider multiple hosts:
|
68
|
+
#
|
69
|
+
# ```ruby
|
70
|
+
# Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
71
|
+
# # ...
|
72
|
+
# end
|
73
|
+
# ```
|
74
|
+
#
|
75
|
+
# Do not spider certain links:
|
76
|
+
#
|
77
|
+
# ```ruby
|
78
|
+
# Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
|
79
|
+
# # ...
|
80
|
+
# end
|
81
|
+
# ```
|
82
|
+
#
|
83
|
+
# Do not spider links on certain ports:
|
84
|
+
#
|
85
|
+
# ```ruby
|
86
|
+
# Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
87
|
+
# # ...
|
88
|
+
# end
|
89
|
+
# ```
|
90
|
+
#
|
91
|
+
# Do not spider links blacklisted in robots.txt:
|
92
|
+
#
|
93
|
+
# ```ruby
|
94
|
+
# Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
|
95
|
+
# # ...
|
96
|
+
# end
|
97
|
+
# ```
|
98
|
+
#
|
99
|
+
# Print out visited URLs:
|
100
|
+
#
|
101
|
+
# ```ruby
|
102
|
+
# Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
|
103
|
+
# spider.every_url { |url| puts url }
|
104
|
+
# end
|
105
|
+
# ```
|
106
|
+
#
|
107
|
+
# Build a URL map of a site:
|
108
|
+
#
|
109
|
+
# ```ruby
|
110
|
+
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
111
|
+
#
|
112
|
+
# Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
113
|
+
# spider.every_link do |origin,dest|
|
114
|
+
# url_map[dest] << origin
|
115
|
+
# end
|
116
|
+
# end
|
117
|
+
# ```
|
118
|
+
#
|
119
|
+
# Print out the URLs that could not be requested:
|
120
|
+
#
|
121
|
+
# ```ruby
|
122
|
+
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
123
|
+
# spider.every_failed_url { |url| puts url }
|
124
|
+
# end
|
125
|
+
# ```
|
126
|
+
#
|
127
|
+
# Find all pages which have broken links:
|
128
|
+
#
|
129
|
+
# ```ruby
|
130
|
+
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
131
|
+
#
|
132
|
+
# spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
133
|
+
# spider.every_link do |origin,dest|
|
134
|
+
# url_map[dest] << origin
|
135
|
+
# end
|
136
|
+
# end
|
137
|
+
#
|
138
|
+
# spider.failures.each do |url|
|
139
|
+
# puts "Broken link #{url} found in:"
|
140
|
+
#
|
141
|
+
# url_map[url].each { |page| puts " #{page}" }
|
142
|
+
# end
|
143
|
+
# ```
|
144
|
+
#
|
145
|
+
# Search HTML and XML pages:
|
146
|
+
#
|
147
|
+
# ```ruby
|
148
|
+
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
149
|
+
# spider.every_page do |page|
|
150
|
+
# puts ">>> #{page.url}"
|
151
|
+
#
|
152
|
+
# page.search('//meta').each do |meta|
|
153
|
+
# name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
154
|
+
# value = meta.attributes['content']
|
155
|
+
#
|
156
|
+
# puts " #{name} = #{value}"
|
157
|
+
# end
|
158
|
+
# end
|
159
|
+
# end
|
160
|
+
# ```
|
161
|
+
#
|
162
|
+
# Print out the titles from every page:
|
163
|
+
#
|
164
|
+
# ```ruby
|
165
|
+
# Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
|
166
|
+
# spider.every_html_page do |page|
|
167
|
+
# puts page.title
|
168
|
+
# end
|
169
|
+
# end
|
170
|
+
# ```
|
171
|
+
#
|
172
|
+
# Print out every HTTP redirect:
|
173
|
+
#
|
174
|
+
# ```ruby
|
175
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
176
|
+
# spider.every_redirect_page do |page|
|
177
|
+
# puts "#{page.url} -> #{page.headers['Location']}"
|
178
|
+
# end
|
179
|
+
# end
|
180
|
+
# ```
|
181
|
+
#
|
182
|
+
# Find what kinds of web servers a host is using, by accessing the headers:
|
183
|
+
#
|
184
|
+
# ```ruby
|
185
|
+
# servers = Set[]
|
186
|
+
#
|
187
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
188
|
+
# spider.all_headers do |headers|
|
189
|
+
# servers << headers['server']
|
190
|
+
# end
|
191
|
+
# end
|
192
|
+
# ```
|
193
|
+
#
|
194
|
+
# Pause the spider on a forbidden page:
|
195
|
+
#
|
196
|
+
# ```ruby
|
197
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
198
|
+
# spider.every_forbidden_page do |page|
|
199
|
+
# spider.pause!
|
200
|
+
# end
|
201
|
+
# end
|
202
|
+
# ```
|
203
|
+
#
|
204
|
+
# Skip the processing of a page:
|
205
|
+
#
|
206
|
+
# ```ruby
|
207
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
208
|
+
# spider.every_missing_page do |page|
|
209
|
+
# spider.skip_page!
|
210
|
+
# end
|
211
|
+
# end
|
212
|
+
# ```
|
213
|
+
#
|
214
|
+
# Skip the processing of links:
|
215
|
+
#
|
216
|
+
# ```ruby
|
217
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
218
|
+
# spider.every_url do |url|
|
219
|
+
# if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
220
|
+
# spider.skip_link!
|
221
|
+
# end
|
222
|
+
# end
|
223
|
+
# end
|
224
|
+
# ```
|
225
|
+
#
|
226
|
+
# Detect when a new host name is spidered:
|
227
|
+
#
|
228
|
+
# ```ruby
|
229
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
230
|
+
# spider.every_host do |host|
|
231
|
+
# puts "Spidering #{host} ..."
|
232
|
+
# end
|
233
|
+
# end
|
234
|
+
# ```
|
235
|
+
#
|
236
|
+
# Detect when a new SSL/TLS certificate is encountered:
|
237
|
+
#
|
238
|
+
# ```ruby
|
239
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
240
|
+
# spider.every_cert do |cert|
|
241
|
+
# puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
242
|
+
# end
|
243
|
+
# end
|
244
|
+
# ```
|
245
|
+
#
|
246
|
+
# Print the MD5 checksum of every `favicon.ico` file:
|
247
|
+
#
|
248
|
+
# ```ruby
|
249
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
250
|
+
# spider.every_favicon do |page|
|
251
|
+
# puts "#{page.url}: #{page.body.md5}"
|
252
|
+
# end
|
253
|
+
# end
|
254
|
+
# ```
|
255
|
+
#
|
256
|
+
# Print every HTML comment:
|
257
|
+
#
|
258
|
+
# ```ruby
|
259
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
260
|
+
# spider.every_html_comment do |comment|
|
261
|
+
# puts comment
|
262
|
+
# end
|
263
|
+
# end
|
264
|
+
# ```
|
265
|
+
#
|
266
|
+
# Print all JavaScript source code:
|
267
|
+
#
|
268
|
+
# ```ruby
|
269
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
270
|
+
# spider.every_javascript do |js|
|
271
|
+
# puts js
|
272
|
+
# end
|
273
|
+
# end
|
274
|
+
# ```
|
275
|
+
#
|
276
|
+
# Print every JavaScript string literal:
|
277
|
+
#
|
278
|
+
# ```ruby
|
279
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
280
|
+
# spider.every_javascript_string do |str|
|
281
|
+
# puts str
|
282
|
+
# end
|
283
|
+
# end
|
284
|
+
# ```
|
285
|
+
#
|
286
|
+
# Print every JavaScript comment:
|
287
|
+
#
|
288
|
+
# ```ruby
|
289
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
290
|
+
# spider.every_javascript_comment do |comment|
|
291
|
+
# puts comment
|
292
|
+
# end
|
293
|
+
# end
|
294
|
+
# ```
|
295
|
+
#
|
296
|
+
# Print every HTML and JavaScript comment:
|
297
|
+
#
|
298
|
+
# ```ruby
|
299
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
300
|
+
# spider.every_comment do |comment|
|
301
|
+
# puts comment
|
302
|
+
# end
|
303
|
+
# end
|
304
|
+
# ```
|
305
|
+
#
|
25
306
|
module Spider
|
26
307
|
#
|
27
308
|
# Creates a new agent and begin spidering at the given URL.
|
@@ -41,6 +322,8 @@ module Ronin
|
|
41
322
|
#
|
42
323
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#start_at-class_method
|
43
324
|
#
|
325
|
+
# @api public
|
326
|
+
#
|
44
327
|
def self.start_at(url,**kwargs,&block)
|
45
328
|
Agent.start_at(url,**kwargs,&block)
|
46
329
|
end
|
@@ -63,6 +346,8 @@ module Ronin
|
|
63
346
|
#
|
64
347
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#host-class_method
|
65
348
|
#
|
349
|
+
# @api public
|
350
|
+
#
|
66
351
|
def self.host(name,**kwargs,&block)
|
67
352
|
Agent.host(name,**kwargs,&block)
|
68
353
|
end
|
@@ -85,6 +370,8 @@ module Ronin
|
|
85
370
|
#
|
86
371
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#site-class_method
|
87
372
|
#
|
373
|
+
# @api public
|
374
|
+
#
|
88
375
|
def self.site(url,**kwargs,&block)
|
89
376
|
Agent.site(url,**kwargs,&block)
|
90
377
|
end
|
@@ -107,6 +394,8 @@ module Ronin
|
|
107
394
|
#
|
108
395
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#domain-class_method
|
109
396
|
#
|
397
|
+
# @api public
|
398
|
+
#
|
110
399
|
def self.domain(name,**kwargs,&block)
|
111
400
|
Agent.domain(name,**kwargs,&block)
|
112
401
|
end
|
data/ronin-web-spider.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'yaml'
|
4
4
|
|
@@ -22,18 +22,19 @@ Gem::Specification.new do |gem|
|
|
22
22
|
gem.homepage = gemspec['homepage']
|
23
23
|
gem.metadata = gemspec['metadata'] if gemspec['metadata']
|
24
24
|
|
25
|
-
glob =
|
25
|
+
glob = ->(patterns) { gem.files & Dir[*patterns] }
|
26
26
|
|
27
27
|
gem.files = `git ls-files`.split($/)
|
28
28
|
gem.files = glob[gemspec['files']] if gemspec['files']
|
29
29
|
gem.files += Array(gemspec['generated_files'])
|
30
|
+
# exclude test files from the packaged gem
|
31
|
+
gem.files -= glob[gemspec['test_files'] || 'spec/{**/}*']
|
30
32
|
|
31
33
|
gem.executables = gemspec.fetch('executables') do
|
32
34
|
glob['bin/*'].map { |path| File.basename(path) }
|
33
35
|
end
|
34
36
|
|
35
37
|
gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
|
36
|
-
gem.test_files = glob[gemspec['test_files'] || 'spec/{**/}*_spec.rb']
|
37
38
|
gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
|
38
39
|
|
39
40
|
gem.require_paths = Array(gemspec.fetch('require_paths') {
|
@@ -45,7 +46,7 @@ Gem::Specification.new do |gem|
|
|
45
46
|
gem.required_rubygems_version = gemspec['required_rubygems_version']
|
46
47
|
gem.post_install_message = gemspec['post_install_message']
|
47
48
|
|
48
|
-
split =
|
49
|
+
split = ->(string) { string.split(/,\s*/) }
|
49
50
|
|
50
51
|
if gemspec['dependencies']
|
51
52
|
gemspec['dependencies'].each do |name,versions|
|