ronin-web-spider 0.1.0.beta2 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +17 -5
- data/.rubocop.yml +11 -0
- data/.yardopts +1 -1
- data/ChangeLog.md +23 -1
- data/Gemfile +3 -0
- data/README.md +303 -32
- data/Rakefile +2 -2
- data/gemspec.yml +4 -4
- data/lib/ronin/web/spider/agent.rb +123 -7
- data/lib/ronin/web/spider/archive.rb +4 -0
- data/lib/ronin/web/spider/exceptions.rb +2 -1
- data/lib/ronin/web/spider/git_archive.rb +3 -2
- data/lib/ronin/web/spider/version.rb +3 -2
- data/lib/ronin/web/spider.rb +290 -1
- data/ronin-web-spider.gemspec +5 -4
- metadata +10 -19
- data/spec/agent_spec.rb +0 -585
- data/spec/archive_spec.rb +0 -91
- data/spec/example_app.rb +0 -27
- data/spec/git_archive_spec.rb +0 -137
- data/spec/spec_helper.rb +0 -4
- data/spec/spider_spec.rb +0 -252
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -122,6 +123,8 @@ module Ronin
|
|
122
123
|
# The visited host names.
|
123
124
|
#
|
124
125
|
# @return [Set<String>, nil]
|
126
|
+
#
|
127
|
+
# @api public
|
125
128
|
attr_reader :visited_hosts
|
126
129
|
|
127
130
|
#
|
@@ -132,6 +135,13 @@ module Ronin
|
|
132
135
|
#
|
133
136
|
# @yieldparam [String] host
|
134
137
|
#
|
138
|
+
# @example
|
139
|
+
# spider.every_host do |host|
|
140
|
+
#     puts "Spidering #{host} ..."
|
141
|
+
# end
|
142
|
+
#
|
143
|
+
# @api public
|
144
|
+
#
|
135
145
|
def every_host
|
136
146
|
@visited_hosts ||= Set.new
|
137
147
|
|
@@ -147,6 +157,8 @@ module Ronin
|
|
147
157
|
# All certificates encountered while spidering.
|
148
158
|
#
|
149
159
|
# @return [Array<Ronin::Support::Crypto::Cert>]
|
160
|
+
#
|
161
|
+
# @api public
|
150
162
|
attr_reader :collected_certs
|
151
163
|
|
152
164
|
#
|
@@ -157,6 +169,13 @@ module Ronin
|
|
157
169
|
#
|
158
170
|
# @yieldparam [Ronin::Support::Crypto::Cert] cert
|
159
171
|
#
|
172
|
+
# @example
|
173
|
+
# spider.every_cert do |cert|
|
174
|
+
#     puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
175
|
+
# end
|
176
|
+
#
|
177
|
+
# @api public
|
178
|
+
#
|
160
179
|
def every_cert
|
161
180
|
@collected_certs ||= []
|
162
181
|
|
@@ -185,8 +204,15 @@ module Ronin
|
|
185
204
|
# @yieldparam [Spidr::Page] favicon
|
186
205
|
# An encountered `.ico` file.
|
187
206
|
#
|
207
|
+
# @example
|
208
|
+
# spider.every_favicon do |page|
|
209
|
+
# # ...
|
210
|
+
# end
|
211
|
+
#
|
188
212
|
# @see https://rubydoc.info/gems/spidr/Spidr/Page
|
189
213
|
#
|
214
|
+
# @api public
|
215
|
+
#
|
190
216
|
def every_favicon
|
191
217
|
every_page do |page|
|
192
218
|
yield page if page.icon?
|
@@ -197,14 +223,23 @@ module Ronin
|
|
197
223
|
# Passes every non-empty HTML comment to the given block.
|
198
224
|
#
|
199
225
|
# @yield [comment]
|
200
|
-
# The given block will be
|
226
|
+
#   The given block will be passed every non-empty HTML comment.
|
201
227
|
#
|
202
228
|
# @yieldparam [String] comment
|
203
229
|
# The HTML comment inner text, with leading and trailing whitespace
|
204
230
|
# stripped.
|
205
231
|
#
|
232
|
+
# @example
|
233
|
+
# spider.every_html_comment do |comment|
|
234
|
+
# puts comment
|
235
|
+
# end
|
236
|
+
#
|
237
|
+
# @api public
|
238
|
+
#
|
206
239
|
def every_html_comment
|
207
240
|
every_html_page do |page|
|
241
|
+
next unless page.doc
|
242
|
+
|
208
243
|
page.doc.xpath('//comment()').each do |comment|
|
209
244
|
comment_text = comment.inner_text.strip
|
210
245
|
|
@@ -224,24 +259,71 @@ module Ronin
|
|
224
259
|
# @yieldparam [String] js
|
225
260
|
# The JavaScript source code.
|
226
261
|
#
|
262
|
+
# @example
|
263
|
+
# spider.every_javascript do |js|
|
264
|
+
# puts js
|
265
|
+
# end
|
266
|
+
#
|
267
|
+
# @api public
|
268
|
+
#
|
227
269
|
def every_javascript
|
228
270
|
# yield inner text of every `<script type="text/javascript">` tag
|
229
271
|
# and every `.js` URL.
|
230
272
|
every_html_page do |page|
|
273
|
+
next unless page.doc
|
274
|
+
|
231
275
|
page.doc.xpath('//script[@type="text/javascript"]').each do |script|
|
232
|
-
|
233
|
-
|
276
|
+
source = script.inner_text
|
277
|
+
source.force_encoding(Encoding::UTF_8)
|
278
|
+
|
279
|
+
unless source.empty?
|
280
|
+
yield source
|
234
281
|
end
|
235
282
|
end
|
236
283
|
end
|
237
284
|
|
238
285
|
every_javascript_page do |page|
|
239
|
-
|
286
|
+
source = page.body
|
287
|
+
source.force_encoding(Encoding::UTF_8)
|
288
|
+
|
289
|
+
yield source
|
240
290
|
end
|
241
291
|
end
|
242
292
|
|
243
293
|
alias every_js every_javascript
|
244
294
|
|
295
|
+
# Regex to match and skip JavaScript inline regexes.
|
296
|
+
#
|
297
|
+
# @api private
|
298
|
+
#
|
299
|
+
# @since 0.1.1
|
300
|
+
JAVASCRIPT_INLINE_REGEX = %r{
|
301
|
+
(?# match before the regex to avoid matching division operators )
|
302
|
+
(?:[\{\[\(;:,]\s*|=\s*)
|
303
|
+
/
|
304
|
+
(?# inline regex contents )
|
305
|
+
(?:
|
306
|
+
\[ (?:\\. | [^\]]) \] (?# [...] ) |
|
307
|
+
\\. (?# backslash escaped characters ) |
|
308
|
+
[^/] (?# everything else )
|
309
|
+
)+
|
310
|
+
/[dgimsuvy]* (?# also match any regex flags )
|
311
|
+
}mx
|
312
|
+
|
313
|
+
# Regex to match and skip JavaScript template literals.
|
314
|
+
#
|
315
|
+
# @note
|
316
|
+
# This regex will not properly match nested template literals:
|
317
|
+
#
|
318
|
+
# ```javascript
|
319
|
+
# `foo ${`bar ${1+1}`}`
|
320
|
+
# ```
|
321
|
+
#
|
322
|
+
# @api private
|
323
|
+
#
|
324
|
+
# @since 0.1.1
|
325
|
+
JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
|
326
|
+
|
245
327
|
#
|
246
328
|
# Passes every JavaScript string value to the given block.
|
247
329
|
#
|
@@ -252,10 +334,30 @@ module Ronin
|
|
252
334
|
# @yieldparam [String] string
|
253
335
|
# The parsed contents of a JavaScript string.
|
254
336
|
#
|
337
|
+
# @example
|
338
|
+
# spider.every_javascript_string do |str|
|
339
|
+
# puts str
|
340
|
+
# end
|
341
|
+
#
|
342
|
+
# @api public
|
343
|
+
#
|
255
344
|
def every_javascript_string
|
256
345
|
every_javascript do |js|
|
257
|
-
|
258
|
-
|
346
|
+
scanner = StringScanner.new(js)
|
347
|
+
|
348
|
+
until scanner.eos?
|
349
|
+
# NOTE: this is a naive JavaScript string scanner and should
|
350
|
+
# eventually be replaced with a real JavaScript lexer or parser.
|
351
|
+
case scanner.peek(1)
|
352
|
+
when '"', "'" # beginning of a quoted string
|
353
|
+
js_string = scanner.scan(Support::Text::Patterns::STRING)
|
354
|
+
|
355
|
+
yield Support::Encoding::JS.unquote(js_string)
|
356
|
+
else
|
357
|
+
scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
|
358
|
+
scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
|
359
|
+
scanner.getch
|
360
|
+
end
|
259
361
|
end
|
260
362
|
end
|
261
363
|
end
|
@@ -271,6 +373,13 @@ module Ronin
|
|
271
373
|
# @yieldparam [String] comment
|
272
374
|
# The contents of a JavaScript comment.
|
273
375
|
#
|
376
|
+
# @example
|
377
|
+
# spider.every_javascript_comment do |comment|
|
378
|
+
# puts comment
|
379
|
+
# end
|
380
|
+
#
|
381
|
+
# @api public
|
382
|
+
#
|
274
383
|
def every_javascript_comment(&block)
|
275
384
|
every_javascript do |js|
|
276
385
|
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
|
@@ -288,9 +397,16 @@ module Ronin
|
|
288
397
|
# @yieldparam [String] comment
|
289
398
|
# The contents of a HTML or JavaScript comment.
|
290
399
|
#
|
400
|
+
# @example
|
401
|
+
# spider.every_comment do |comment|
|
402
|
+
# puts comment
|
403
|
+
# end
|
404
|
+
#
|
291
405
|
# @see #every_html_comment
|
292
406
|
# @see #every_javascript_comment
|
293
407
|
#
|
408
|
+
# @api public
|
409
|
+
#
|
294
410
|
def every_comment(&block)
|
295
411
|
every_html_comment(&block)
|
296
412
|
every_javascript_comment(&block)
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
@@ -29,6 +30,9 @@ module Ronin
|
|
29
30
|
#
|
30
31
|
# Spider a host and archive every web page:
|
31
32
|
#
|
33
|
+
# require 'ronin/web/spider'
|
34
|
+
# require 'ronin/web/spider/archive'
|
35
|
+
#
|
32
36
|
# Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
|
33
37
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
34
38
|
# archive.write(page.url,page.body)
|
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
@@ -30,10 +31,10 @@ module Ronin
|
|
30
31
|
#
|
31
32
|
# Spider a host and archive every web page to a Git repository:
|
32
33
|
#
|
33
|
-
# require 'ronin/web/spider/git_archive'
|
34
34
|
# require 'ronin/web/spider'
|
35
|
+
# require 'ronin/web/spider/git_archive'
|
35
36
|
# require 'date'
|
36
|
-
#
|
37
|
+
#
|
37
38
|
# Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
|
38
39
|
# archive.commit("Updated #{Date.today}") do
|
39
40
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -21,7 +22,7 @@ module Ronin
|
|
21
22
|
module Web
|
22
23
|
module Spider
|
23
24
|
# ronin-web-spider version
|
24
|
-
VERSION = '0.1.
|
25
|
+
VERSION = '0.1.1'
|
25
26
|
end
|
26
27
|
end
|
27
28
|
end
|
data/lib/ronin/web/spider.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
#
|
2
3
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
4
|
#
|
4
|
-
# Copyright (c) 2006-
|
5
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
6
|
#
|
6
7
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
8
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -22,6 +23,286 @@ require 'ronin/web/spider/version'
|
|
22
23
|
|
23
24
|
module Ronin
|
24
25
|
module Web
|
26
|
+
#
|
27
|
+
# A collection of common web spidering routines using the [spidr] gem.
|
28
|
+
#
|
29
|
+
# [spidr]: https://github.com/postmodern/spidr#readme
|
30
|
+
#
|
31
|
+
# ## Examples
|
32
|
+
#
|
33
|
+
# Start spidering from an initial URL:
|
34
|
+
#
|
35
|
+
# ```ruby
|
36
|
+
# require 'ronin/web/spider'
|
37
|
+
#
|
38
|
+
# Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
|
39
|
+
# # ...
|
40
|
+
# end
|
41
|
+
# ```
|
42
|
+
#
|
43
|
+
# Spider a host:
|
44
|
+
#
|
45
|
+
# ```ruby
|
46
|
+
# Ronin::Web::Spider.host('solnic.eu') do |agent|
|
47
|
+
# # ...
|
48
|
+
# end
|
49
|
+
# ```
|
50
|
+
#
|
51
|
+
# Spider a domain (and any sub-domains):
|
52
|
+
#
|
53
|
+
# ```ruby
|
54
|
+
# Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
|
55
|
+
# # ...
|
56
|
+
# end
|
57
|
+
# ```
|
58
|
+
#
|
59
|
+
# Spider a site:
|
60
|
+
#
|
61
|
+
# ```ruby
|
62
|
+
# Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
|
63
|
+
# # ...
|
64
|
+
# end
|
65
|
+
# ```
|
66
|
+
#
|
67
|
+
# Spider multiple hosts:
|
68
|
+
#
|
69
|
+
# ```ruby
|
70
|
+
# Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
71
|
+
# # ...
|
72
|
+
# end
|
73
|
+
# ```
|
74
|
+
#
|
75
|
+
# Do not spider certain links:
|
76
|
+
#
|
77
|
+
# ```ruby
|
78
|
+
# Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
|
79
|
+
# # ...
|
80
|
+
# end
|
81
|
+
# ```
|
82
|
+
#
|
83
|
+
# Do not spider links on certain ports:
|
84
|
+
#
|
85
|
+
# ```ruby
|
86
|
+
# Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
87
|
+
# # ...
|
88
|
+
# end
|
89
|
+
# ```
|
90
|
+
#
|
91
|
+
# Do not spider links blacklisted in robots.txt:
|
92
|
+
#
|
93
|
+
# ```ruby
|
94
|
+
# Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
|
95
|
+
# # ...
|
96
|
+
# end
|
97
|
+
# ```
|
98
|
+
#
|
99
|
+
# Print out visited URLs:
|
100
|
+
#
|
101
|
+
# ```ruby
|
102
|
+
# Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
|
103
|
+
# spider.every_url { |url| puts url }
|
104
|
+
# end
|
105
|
+
# ```
|
106
|
+
#
|
107
|
+
# Build a URL map of a site:
|
108
|
+
#
|
109
|
+
# ```ruby
|
110
|
+
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
111
|
+
#
|
112
|
+
# Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
113
|
+
# spider.every_link do |origin,dest|
|
114
|
+
# url_map[dest] << origin
|
115
|
+
# end
|
116
|
+
# end
|
117
|
+
# ```
|
118
|
+
#
|
119
|
+
# Print out the URLs that could not be requested:
|
120
|
+
#
|
121
|
+
# ```ruby
|
122
|
+
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
123
|
+
# spider.every_failed_url { |url| puts url }
|
124
|
+
# end
|
125
|
+
# ```
|
126
|
+
#
|
127
|
+
# Find all pages which have broken links:
|
128
|
+
#
|
129
|
+
# ```ruby
|
130
|
+
# url_map = Hash.new { |hash,key| hash[key] = [] }
|
131
|
+
#
|
132
|
+
# spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
133
|
+
# spider.every_link do |origin,dest|
|
134
|
+
# url_map[dest] << origin
|
135
|
+
# end
|
136
|
+
# end
|
137
|
+
#
|
138
|
+
# spider.failures.each do |url|
|
139
|
+
# puts "Broken link #{url} found in:"
|
140
|
+
#
|
141
|
+
# url_map[url].each { |page| puts " #{page}" }
|
142
|
+
# end
|
143
|
+
# ```
|
144
|
+
#
|
145
|
+
# Search HTML and XML pages:
|
146
|
+
#
|
147
|
+
# ```ruby
|
148
|
+
# Ronin::Web::Spider.site('http://company.com/') do |spider|
|
149
|
+
# spider.every_page do |page|
|
150
|
+
# puts ">>> #{page.url}"
|
151
|
+
#
|
152
|
+
# page.search('//meta').each do |meta|
|
153
|
+
# name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
154
|
+
# value = meta.attributes['content']
|
155
|
+
#
|
156
|
+
# puts " #{name} = #{value}"
|
157
|
+
# end
|
158
|
+
# end
|
159
|
+
# end
|
160
|
+
# ```
|
161
|
+
#
|
162
|
+
# Print out the titles from every page:
|
163
|
+
#
|
164
|
+
# ```ruby
|
165
|
+
# Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
|
166
|
+
# spider.every_html_page do |page|
|
167
|
+
# puts page.title
|
168
|
+
# end
|
169
|
+
# end
|
170
|
+
# ```
|
171
|
+
#
|
172
|
+
# Print out every HTTP redirect:
|
173
|
+
#
|
174
|
+
# ```ruby
|
175
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
176
|
+
# spider.every_redirect_page do |page|
|
177
|
+
# puts "#{page.url} -> #{page.headers['Location']}"
|
178
|
+
# end
|
179
|
+
# end
|
180
|
+
# ```
|
181
|
+
#
|
182
|
+
# Find what kinds of web servers a host is using, by accessing the headers:
|
183
|
+
#
|
184
|
+
# ```ruby
|
185
|
+
# servers = Set[]
|
186
|
+
#
|
187
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
188
|
+
# spider.all_headers do |headers|
|
189
|
+
# servers << headers['server']
|
190
|
+
# end
|
191
|
+
# end
|
192
|
+
# ```
|
193
|
+
#
|
194
|
+
# Pause the spider on a forbidden page:
|
195
|
+
#
|
196
|
+
# ```ruby
|
197
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
198
|
+
# spider.every_forbidden_page do |page|
|
199
|
+
# spider.pause!
|
200
|
+
# end
|
201
|
+
# end
|
202
|
+
# ```
|
203
|
+
#
|
204
|
+
# Skip the processing of a page:
|
205
|
+
#
|
206
|
+
# ```ruby
|
207
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
208
|
+
# spider.every_missing_page do |page|
|
209
|
+
# spider.skip_page!
|
210
|
+
# end
|
211
|
+
# end
|
212
|
+
# ```
|
213
|
+
#
|
214
|
+
# Skip the processing of links:
|
215
|
+
#
|
216
|
+
# ```ruby
|
217
|
+
# Ronin::Web::Spider.host('company.com') do |spider|
|
218
|
+
# spider.every_url do |url|
|
219
|
+
# if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
220
|
+
# spider.skip_link!
|
221
|
+
# end
|
222
|
+
# end
|
223
|
+
# end
|
224
|
+
# ```
|
225
|
+
#
|
226
|
+
# Detect when a new host name is spidered:
|
227
|
+
#
|
228
|
+
# ```ruby
|
229
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
230
|
+
# spider.every_host do |host|
|
231
|
+
#     puts "Spidering #{host} ..."
|
232
|
+
# end
|
233
|
+
# end
|
234
|
+
# ```
|
235
|
+
#
|
236
|
+
# Detect when a new SSL/TLS certificate is encountered:
|
237
|
+
#
|
238
|
+
# ```ruby
|
239
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
240
|
+
# spider.every_cert do |cert|
|
241
|
+
#     puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
242
|
+
# end
|
243
|
+
# end
|
244
|
+
# ```
|
245
|
+
#
|
246
|
+
# Print the MD5 checksum of every `favicon.ico` file:
|
247
|
+
#
|
248
|
+
# ```ruby
|
249
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
250
|
+
# spider.every_favicon do |page|
|
251
|
+
# puts "#{page.url}: #{page.body.md5}"
|
252
|
+
# end
|
253
|
+
# end
|
254
|
+
# ```
|
255
|
+
#
|
256
|
+
# Print every HTML comment:
|
257
|
+
#
|
258
|
+
# ```ruby
|
259
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
260
|
+
# spider.every_html_comment do |comment|
|
261
|
+
# puts comment
|
262
|
+
# end
|
263
|
+
# end
|
264
|
+
# ```
|
265
|
+
#
|
266
|
+
# Print all JavaScript source code:
|
267
|
+
#
|
268
|
+
# ```ruby
|
269
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
270
|
+
# spider.every_javascript do |js|
|
271
|
+
# puts js
|
272
|
+
# end
|
273
|
+
# end
|
274
|
+
# ```
|
275
|
+
#
|
276
|
+
# Print every JavaScript string literal:
|
277
|
+
#
|
278
|
+
# ```ruby
|
279
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
280
|
+
# spider.every_javascript_string do |str|
|
281
|
+
# puts str
|
282
|
+
# end
|
283
|
+
# end
|
284
|
+
# ```
|
285
|
+
#
|
286
|
+
# Print every JavaScript comment:
|
287
|
+
#
|
288
|
+
# ```ruby
|
289
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
290
|
+
# spider.every_javascript_comment do |comment|
|
291
|
+
# puts comment
|
292
|
+
# end
|
293
|
+
# end
|
294
|
+
# ```
|
295
|
+
#
|
296
|
+
# Print every HTML and JavaScript comment:
|
297
|
+
#
|
298
|
+
# ```ruby
|
299
|
+
# Ronin::Web::Spider.domain('example.com') do |spider|
|
300
|
+
# spider.every_comment do |comment|
|
301
|
+
# puts comment
|
302
|
+
# end
|
303
|
+
# end
|
304
|
+
# ```
|
305
|
+
#
|
25
306
|
module Spider
|
26
307
|
#
|
27
308
|
# Creates a new agent and begin spidering at the given URL.
|
@@ -41,6 +322,8 @@ module Ronin
|
|
41
322
|
#
|
42
323
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#start_at-class_method
|
43
324
|
#
|
325
|
+
# @api public
|
326
|
+
#
|
44
327
|
def self.start_at(url,**kwargs,&block)
|
45
328
|
Agent.start_at(url,**kwargs,&block)
|
46
329
|
end
|
@@ -63,6 +346,8 @@ module Ronin
|
|
63
346
|
#
|
64
347
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#host-class_method
|
65
348
|
#
|
349
|
+
# @api public
|
350
|
+
#
|
66
351
|
def self.host(name,**kwargs,&block)
|
67
352
|
Agent.host(name,**kwargs,&block)
|
68
353
|
end
|
@@ -85,6 +370,8 @@ module Ronin
|
|
85
370
|
#
|
86
371
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#site-class_method
|
87
372
|
#
|
373
|
+
# @api public
|
374
|
+
#
|
88
375
|
def self.site(url,**kwargs,&block)
|
89
376
|
Agent.site(url,**kwargs,&block)
|
90
377
|
end
|
@@ -107,6 +394,8 @@ module Ronin
|
|
107
394
|
#
|
108
395
|
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#domain-class_method
|
109
396
|
#
|
397
|
+
# @api public
|
398
|
+
#
|
110
399
|
def self.domain(name,**kwargs,&block)
|
111
400
|
Agent.domain(name,**kwargs,&block)
|
112
401
|
end
|
data/ronin-web-spider.gemspec
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'yaml'
|
4
4
|
|
@@ -22,18 +22,19 @@ Gem::Specification.new do |gem|
|
|
22
22
|
gem.homepage = gemspec['homepage']
|
23
23
|
gem.metadata = gemspec['metadata'] if gemspec['metadata']
|
24
24
|
|
25
|
-
glob =
|
25
|
+
glob = ->(patterns) { gem.files & Dir[*patterns] }
|
26
26
|
|
27
27
|
gem.files = `git ls-files`.split($/)
|
28
28
|
gem.files = glob[gemspec['files']] if gemspec['files']
|
29
29
|
gem.files += Array(gemspec['generated_files'])
|
30
|
+
# exclude test files from the packaged gem
|
31
|
+
gem.files -= glob[gemspec['test_files'] || 'spec/{**/}*']
|
30
32
|
|
31
33
|
gem.executables = gemspec.fetch('executables') do
|
32
34
|
glob['bin/*'].map { |path| File.basename(path) }
|
33
35
|
end
|
34
36
|
|
35
37
|
gem.extensions = glob[gemspec['extensions'] || 'ext/**/extconf.rb']
|
36
|
-
gem.test_files = glob[gemspec['test_files'] || 'spec/{**/}*_spec.rb']
|
37
38
|
gem.extra_rdoc_files = glob[gemspec['extra_doc_files'] || '*.{txt,md}']
|
38
39
|
|
39
40
|
gem.require_paths = Array(gemspec.fetch('require_paths') {
|
@@ -45,7 +46,7 @@ Gem::Specification.new do |gem|
|
|
45
46
|
gem.required_rubygems_version = gemspec['required_rubygems_version']
|
46
47
|
gem.post_install_message = gemspec['post_install_message']
|
47
48
|
|
48
|
-
split =
|
49
|
+
split = ->(string) { string.split(/,\s*/) }
|
49
50
|
|
50
51
|
if gemspec['dependencies']
|
51
52
|
gemspec['dependencies'].each do |name,versions|
|