ronin-web-spider 0.1.0 → 0.1.1
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +16 -1
- data/.rubocop.yml +11 -0
- data/ChangeLog.md +20 -1
- data/Gemfile +3 -0
- data/README.md +1 -2
- data/Rakefile +2 -2
- data/gemspec.yml +1 -1
- data/lib/ronin/web/spider/agent.rb +63 -7
- data/lib/ronin/web/spider/archive.rb +2 -1
- data/lib/ronin/web/spider/exceptions.rb +1 -0
- data/lib/ronin/web/spider/git_archive.rb +2 -1
- data/lib/ronin/web/spider/version.rb +2 -1
- data/lib/ronin/web/spider.rb +63 -62
- data/ronin-web-spider.gemspec +3 -3
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dab34842325a731e13f23303b1dec66c7ab9d78b4805b982d518ba01024c9352
+  data.tar.gz: '0668f126b3e828c6409cc7b7adef2d74cd527f48de334b10a3d54f3767fe3afd'
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d474705a601b7fe27be2a9c5f5e5485ed39b38dec1581db295b0e1ff524c987c9e74522afd24569829bc7e64f6930767104f9bd927a3007a17f627de003492f7
+  data.tar.gz: 397b84308ec62d51e1dba64cff37c75eda8ebd8c2e6a792487468158fb774aae566b8be5cd0388521774d62873e5ba6b751423bc1138ac9e3d47804bdd877a81
data/.github/workflows/ruby.yml
CHANGED
@@ -12,11 +12,12 @@ jobs:
           - '3.0'
           - '3.1'
           - '3.2'
+          - '3.3'
           - jruby
           - truffleruby
     name: Ruby ${{ matrix.ruby }}
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v4
       - name: Set up Ruby
         uses: ruby/setup-ruby@v1
         with:
@@ -26,3 +27,17 @@ jobs:
         run: bundle install --jobs 4 --retry 3
       - name: Run tests
         run: bundle exec rake test
+
+  # rubocop linting
+  rubocop:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 3.0
+      - name: Install dependencies
+        run: bundle install --jobs 4 --retry 3
+      - name: Run rubocop
+        run: bundle exec rubocop --parallel
data/.rubocop.yml
ADDED
data/ChangeLog.md
CHANGED
@@ -1,4 +1,22 @@
-### 0.1.
+### 0.1.1 / 2024-06-19
+
+* Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
+  {Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
+  header included `text/html` but lacked a response body, causing `page.doc` to
+  be `nil`.
+* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
+  JavaScript source code strings containing UTF-8 characters were being
+  incorrectly encoded as ASCII-8bit strings if the page's `Content-Type` header
+  did not include a `charset=` attribute.
+* Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
+  inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
+  would incorrectly be treated as the beginning or end of JavaScript string
+  literals. Note that while this greatly improves the accuracy of
+  {Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
+  support parsing JavaScript template literals that may also contain string
+  literals (ex: `` `Hello \"World\"` `` or `` `Hello ${myFunc("string literal")}` ``).
+
+### 0.1.0 / 2023-02-01
 
 * Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
 * Relicensed as LGPL-3.0.
@@ -20,3 +38,4 @@
 * `every_comment` - yields every HTML or JavaScript comment.
 * Supports archiving spidered pages to a directory or git repository.
 
+[spidr]: https://github.com/postmodern/spidr#readme
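The third fix above can be exercised with the same usage pattern the library documents for `every_javascript_string`; the sketch below assumes a reachable target (`example.com` is only a placeholder) and simply prints what the improved scanner yields:

```ruby
require 'ronin/web/spider'

# Print every JavaScript string literal found while spidering a domain.
# As of 0.1.1, inline regexes such as /["'=]/ and template literals are
# skipped instead of being mistaken for string delimiters.
Ronin::Web::Spider.domain('example.com') do |spider|
  spider.every_javascript_string do |str|
    puts str
  end
end
```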
data/Gemfile
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 source 'https://rubygems.org'
 
 gemspec
@@ -28,4 +29,6 @@ group :development do
   gem 'dead_end', require: false
   gem 'sord', require: false, platform: :mri
   gem 'stackprof', require: false, platform: :mri
+  gem 'rubocop', require: false, platform: :mri
+  gem 'rubocop-ronin', require: false, platform: :mri
 end
data/README.md
CHANGED
@@ -9,7 +9,6 @@
 * [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
 * [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
 * [Discord](https://discord.gg/6WAb3PsVX9) |
-  [Twitter](https://twitter.com/ronin_rb) |
   [Mastodon](https://infosec.exchange/@ronin_rb)
 
 ## Description
@@ -38,7 +37,7 @@ ronin-web-spider is a collection of common web spidering routines using the
 * [every_comment][docs-every_comment] - yields every HTML or JavaScript
   comment.
 * Supports archiving spidered pages to a directory or git repository.
-* Has
+* Has 97% documentation coverage.
 * Has 94% test coverage.
 
 [docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
data/Rakefile
CHANGED
data/gemspec.yml
CHANGED
data/lib/ronin/web/spider/agent.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -237,6 +238,8 @@ module Ronin
       #
       def every_html_comment
         every_html_page do |page|
+          next unless page.doc
+
           page.doc.xpath('//comment()').each do |comment|
             comment_text = comment.inner_text.strip
 
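The added `next unless page.doc` guard covers the case called out in the ChangeLog: a response whose `Content-Type` includes `text/html` but whose body is empty leaves `page.doc` as `nil`. A minimal usage sketch, mirroring the doc-comment examples elsewhere in this diff (the spidered domain is a placeholder):

```ruby
require 'ronin/web/spider'

# Print every HTML comment seen while spidering a domain. Empty text/html
# responses are now skipped by the guard instead of raising on a nil document.
Ronin::Web::Spider.domain('example.com') do |spider|
  spider.every_html_comment do |comment|
    puts comment
  end
end
```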
@@ -267,20 +270,60 @@ module Ronin
         # yield inner text of every `<script type="text/javascript">` tag
         # and every `.js` URL.
         every_html_page do |page|
+          next unless page.doc
+
           page.doc.xpath('//script[@type="text/javascript"]').each do |script|
-
-
+            source = script.inner_text
+            source.force_encoding(Encoding::UTF_8)
+
+            unless source.empty?
+              yield source
             end
           end
         end
 
         every_javascript_page do |page|
-
+          source = page.body
+          source.force_encoding(Encoding::UTF_8)
+
+          yield source
         end
       end
 
       alias every_js every_javascript
 
+      # Regex to match and skip JavaScript inline regexes.
+      #
+      # @api private
+      #
+      # @since 0.1.1
+      JAVASCRIPT_INLINE_REGEX = %r{
+        (?# match before the regex to avoid matching division operators )
+        (?:[\{\[\(;:,]\s*|=\s*)
+        /
+        (?# inline regex contents )
+        (?:
+          \[ (?:\\. | [^\]]) \] (?# [...] ) |
+          \\.                   (?# backslash escaped characters ) |
+          [^/]                  (?# everything else )
+        )+
+        /[dgimsuvy]* (?# also match any regex flags )
+      }mx
+
+      # Regex to match and skip JavaScript template literals.
+      #
+      # @note
+      #   This regex will not properly match nested template literals:
+      #
+      #   ```javascript
+      #   `foo ${`bar ${1+1}`}`
+      #   ```
+      #
+      # @api private
+      #
+      # @since 0.1.1
+      JAVASCRIPT_TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m
+
       #
       # Passes every JavaScript string value to the given block.
       #
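The two `force_encoding(Encoding::UTF_8)` calls address the encoding bug from the ChangeLog: when a response declares no `charset=`, Ruby tags the body as `ASCII-8BIT`, so multi-byte UTF-8 characters surface as raw bytes. A small, spider-independent illustration of what the relabeling does:

```ruby
# Bytes of "café" as they might arrive in a body with no declared charset;
# Ruby tags such strings as ASCII-8BIT (binary).
body = "caf\xC3\xA9".b
body.encoding.name #=> "ASCII-8BIT"

# force_encoding relabels the string in place without transcoding its bytes,
# which is what the agent now does before yielding JavaScript source.
body.force_encoding(Encoding::UTF_8)
body.encoding.name #=> "UTF-8"
body               #=> "café"
```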
@@ -293,15 +336,28 @@ module Ronin
       #
       # @example
       #   spider.every_javascript_string do |str|
-      #
-      #
+      #     puts str
+      #   end
       #
       # @api public
       #
       def every_javascript_string
         every_javascript do |js|
-
-
+          scanner = StringScanner.new(js)
+
+          until scanner.eos?
+            # NOTE: this is a naive JavaScript string scanner and should
+            # eventually be replaced with a real JavaScript lexer or parser.
+            case scanner.peek(1)
+            when '"', "'" # beginning of a quoted string
+              js_string = scanner.scan(Support::Text::Patterns::STRING)
+
+              yield Support::Encoding::JS.unquote(js_string)
+            else
+              scanner.skip(JAVASCRIPT_INLINE_REGEX) ||
+                scanner.skip(JAVASCRIPT_TEMPLATE_LITERAL) ||
+                scanner.getch
+            end
           end
         end
       end
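To make the scanning strategy concrete, here is a self-contained sketch of the same idea built only on Ruby's `strscan`. The string pattern and the bare quote-stripping below are simplified stand-ins for ronin-support's `Support::Text::Patterns::STRING` and `Support::Encoding::JS.unquote`, so treat it as an approximation of the technique rather than the gem's exact implementation:

```ruby
require 'strscan'

# Simplified stand-ins for the patterns the agent uses (assumptions, not the
# ronin-support definitions).
JS_STRING        = /"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'/
INLINE_REGEX     = %r{(?:[\{\[\(;:,]\s*|=\s*)/(?:\\.|\[(?:\\.|[^\]])*\]|[^/\n])+/[dgimsuvy]*}
TEMPLATE_LITERAL = /`(?:\\`|[^`])+`/m

# Yields every quoted string literal in `js`, skipping inline regexes and
# template literals so quotes inside them are not mistaken for delimiters.
def each_js_string(js)
  scanner = StringScanner.new(js)

  until scanner.eos?
    case scanner.peek(1)
    when '"', "'"
      if (literal = scanner.scan(JS_STRING))
        yield literal[1..-2] # strip the surrounding quotes (no unescaping here)
      else
        scanner.getch # unterminated string; skip the quote character
      end
    else
      scanner.skip(INLINE_REGEX) ||
        scanner.skip(TEMPLATE_LITERAL) ||
        scanner.getch
    end
  end
end

js = %{var sep = /["'=]/; var msg = "hello"; var name = 'world';}
each_js_string(js) { |str| puts str } # prints "hello" then "world"
```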
data/lib/ronin/web/spider/archive.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -31,7 +32,7 @@ module Ronin
 #
 # require 'ronin/web/spider'
 # require 'ronin/web/spider/archive'
-#
+#
 # Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
 #   Ronin::Web::Spider.every_page(host: 'example.com') do |page|
 #     archive.write(page.url,page.body)
data/lib/ronin/web/spider/git_archive.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -33,7 +34,7 @@ module Ronin
 # require 'ronin/web/spider'
 # require 'ronin/web/spider/git_archive'
 # require 'date'
-#
+#
 # Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
 #   archive.commit("Updated #{Date.today}") do
 #     Ronin::Web::Spider.every_page(host: 'example.com') do |page|
data/lib/ronin/web/spider/version.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -21,7 +22,7 @@ module Ronin
   module Web
     module Spider
       # ronin-web-spider version
-      VERSION = '0.1.0'
+      VERSION = '0.1.1'
     end
   end
 end
data/lib/ronin/web/spider.rb
CHANGED
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #
 # ronin-web-spider - A collection of common web spidering routines.
 #
@@ -30,136 +31,136 @@ module Ronin
 # ## Examples
 #
 # Spider a host:
-#
+#
 # ```ruby
 # require 'ronin/web/spider'
-#
+#
 # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Spider a host:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('solnic.eu') do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Spider a domain (and any sub-domains):
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Spider a site:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Spider multiple hosts:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Do not spider certain links:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Do not spider links on certain ports:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Do not spider links blacklisted in robots.txt:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
 #   # ...
 # end
 # ```
-#
+#
 # Print out visited URLs:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
 #   spider.every_url { |url| puts url }
 # end
 # ```
-#
+#
 # Build a URL map of a site:
-#
+#
 # ```ruby
 # url_map = Hash.new { |hash,key| hash[key] = [] }
-#
+#
 # Ronin::Web::Spider.site('http://intranet.com/') do |spider|
 #   spider.every_link do |origin,dest|
 #     url_map[dest] << origin
 #   end
 # end
 # ```
-#
+#
 # Print out the URLs that could not be requested:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/') do |spider|
 #   spider.every_failed_url { |url| puts url }
 # end
 # ```
-#
+#
 # Finds all pages which have broken links:
-#
+#
 # ```ruby
 # url_map = Hash.new { |hash,key| hash[key] = [] }
-#
+#
 # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
 #   spider.every_link do |origin,dest|
 #     url_map[dest] << origin
 #   end
 # end
-#
+#
 # spider.failures.each do |url|
 #   puts "Broken link #{url} found in:"
-#
+#
 #   url_map[url].each { |page| puts "  #{page}" }
 # end
 # ```
-#
+#
 # Search HTML and XML pages:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('http://company.com/') do |spider|
 #   spider.every_page do |page|
 #     puts ">>> #{page.url}"
-#
+#
 #     page.search('//meta').each do |meta|
 #       name = (meta.attributes['name'] || meta.attributes['http-equiv'])
 #       value = meta.attributes['content']
-#
+#
 #       puts "  #{name} = #{value}"
 #     end
 #   end
 # end
 # ```
-#
+#
 # Print out the titles from every page:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
 #   spider.every_html_page do |page|
@@ -167,9 +168,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print out every HTTP redirect:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.every_redirect_page do |page|
@@ -177,21 +178,21 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Find what kinds of web servers a host is using, by accessing the headers:
-#
+#
 # ```ruby
 # servers = Set[]
-#
+#
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.all_headers do |headers|
 #     servers << headers['server']
 #   end
 # end
 # ```
-#
+#
 # Pause the spider on a forbidden page:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.every_forbidden_page do |page|
@@ -199,9 +200,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Skip the processing of a page:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.every_missing_page do |page|
@@ -209,9 +210,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Skip the processing of links:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.host('company.com') do |spider|
 #   spider.every_url do |url|
@@ -221,9 +222,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Detect when a new host name is spidered:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_host do |host|
@@ -231,9 +232,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Detect when a new SSL/TLS certificate is encountered:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_cert do |cert|
@@ -241,9 +242,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print the MD5 checksum of every `favicon.ico` file:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_favicon do |page|
@@ -251,9 +252,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print every HTML comment:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_html_comment do |comment|
@@ -261,9 +262,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print all JavaScript source code:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_javascript do |js|
@@ -271,9 +272,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print every JavaScript string literal:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_javascript_string do |str|
@@ -281,9 +282,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print every JavaScript comment:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_javascript_comment do |comment|
@@ -291,9 +292,9 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 # Print every HTML and JavaScript comment:
-#
+#
 # ```ruby
 # Ronin::Web::Spider.domain('example.com') do |spider|
 #   spider.every_comment do |comment|
@@ -301,7 +302,7 @@ module Ronin
 #   end
 # end
 # ```
-#
+#
 module Spider
   #
   # Creates a new agent and begin spidering at the given URL.
data/ronin-web-spider.gemspec
CHANGED
@@ -1,4 +1,4 @@
-#
+# frozen_string_literal: true
 
 require 'yaml'
 
@@ -22,7 +22,7 @@ Gem::Specification.new do |gem|
   gem.homepage = gemspec['homepage']
   gem.metadata = gemspec['metadata'] if gemspec['metadata']
 
-  glob =
+  glob = ->(patterns) { gem.files & Dir[*patterns] }
 
   gem.files = `git ls-files`.split($/)
   gem.files = glob[gemspec['files']] if gemspec['files']
@@ -46,7 +46,7 @@ Gem::Specification.new do |gem|
   gem.required_rubygems_version = gemspec['required_rubygems_version']
   gem.post_install_message = gemspec['post_install_message']
 
-  split =
+  split = ->(string) { string.split(/,\s*/) }
 
   if gemspec['dependencies']
     gemspec['dependencies'].each do |name,versions|
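The two lambdas introduced here replace earlier helper definitions whose bodies are not captured in this diff. They are invoked with `[]`, which is shorthand for `#call`, as in `glob[gemspec['files']]` above. A tiny illustration of the `split` helper with a made-up version string:

```ruby
# `split` as defined in the gemspec: break a comma-separated version
# constraint string into its parts.
split = ->(string) { string.split(/,\s*/) }

split['>= 0.1.0, < 2.0'] #=> [">= 0.1.0", "< 2.0"]
```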
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ronin-web-spider
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Postmodern
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2024-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: spidr
@@ -66,6 +66,7 @@ files:
 - ".github/workflows/ruby.yml"
 - ".gitignore"
 - ".rspec"
+- ".rubocop.yml"
 - ".ruby-version"
 - ".yardopts"
 - COPYING.txt
@@ -105,8 +106,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.3.
+rubygems_version: 3.3.27
 signing_key:
 specification_version: 4
-summary: collection of common web spidering routines
+summary: A collection of common web spidering routines.
 test_files: []