ronin-web-spider 0.1.0.beta2 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fe9c4af84eeeb8d8c8c46e8f1c0544ec3e92f4f3ff789e71ec495bae7bdc01ca
4
- data.tar.gz: 23efe74dd0e37281fd701ebf86e55213c75d54c5f70ddc7e8abfeeec4608b8be
3
+ metadata.gz: dab34842325a731e13f23303b1dec66c7ab9d78b4805b982d518ba01024c9352
4
+ data.tar.gz: '0668f126b3e828c6409cc7b7adef2d74cd527f48de334b10a3d54f3767fe3afd'
5
5
  SHA512:
6
- metadata.gz: 758dace33195064f8742496b3e39408ffa64fa92324d42201f291c906b72f835245eae7259fc8aba8f8160eba3ed9041b00b91117c88144c8af72de62237875f
7
- data.tar.gz: f76c6a3d6150519fa91958183e0475a1211fa495d2d341e6a049fe6037dedcb00019e2230fdfd75194a5ab4b66d3c1b71e7226535d123b5a3476afe53fe26a0c
6
+ metadata.gz: d474705a601b7fe27be2a9c5f5e5485ed39b38dec1581db295b0e1ff524c987c9e74522afd24569829bc7e64f6930767104f9bd927a3007a17f627de003492f7
7
+ data.tar.gz: 397b84308ec62d51e1dba64cff37c75eda8ebd8c2e6a792487468158fb774aae566b8be5cd0388521774d62873e5ba6b751423bc1138ac9e3d47804bdd877a81
@@ -12,20 +12,32 @@ jobs:
12
12
  - '3.0'
13
13
  - '3.1'
14
14
  - '3.2'
15
+ - '3.3'
15
16
  - jruby
16
17
  - truffleruby
17
18
  name: Ruby ${{ matrix.ruby }}
18
19
  steps:
19
- - uses: actions/checkout@v2
20
+ - uses: actions/checkout@v4
20
21
  - name: Set up Ruby
21
22
  uses: ruby/setup-ruby@v1
22
23
  with:
23
24
  ruby-version: ${{ matrix.ruby }}
24
- - name: Install libsqlite3
25
- run: |
26
- sudo apt update -y && \
27
- sudo apt install -y --no-install-recommends --no-install-suggests libsqlite3-dev
25
+ bundler-cache: true
28
26
  - name: Install dependencies
29
27
  run: bundle install --jobs 4 --retry 3
30
28
  - name: Run tests
31
29
  run: bundle exec rake test
30
+
31
+ # rubocop linting
32
+ rubocop:
33
+ runs-on: ubuntu-latest
34
+ steps:
35
+ - uses: actions/checkout@v4
36
+ - name: Set up Ruby
37
+ uses: ruby/setup-ruby@v1
38
+ with:
39
+ ruby-version: '3.0'
40
+ - name: Install dependencies
41
+ run: bundle install --jobs 4 --retry 3
42
+ - name: Run rubocop
43
+ run: bundle exec rubocop --parallel
data/.rubocop.yml ADDED
@@ -0,0 +1,11 @@
1
+ AllCops:
2
+ NewCops: enable
3
+ SuggestExtensions: false
4
+ TargetRubyVersion: 3.1
5
+
6
+ inherit_gem:
7
+ rubocop-ronin: rubocop.yml
8
+
9
+ #
10
+ # ronin-web-spider specific exceptions
11
+ #
data/.yardopts CHANGED
@@ -1 +1 @@
1
- --markup markdown --title 'Ronin FIXME Documentation' --protected
1
+ --markup markdown --title 'Ronin::Web::Spider Documentation' --protected
data/ChangeLog.md CHANGED
@@ -1,6 +1,27 @@
1
- ### 0.1.0 / 2023-XX-XX
1
+ ### 0.1.1 / 2024-06-19
2
2
 
3
+ * Fixed {Ronin::Web::Spider::Agent#every_html_comment} and
4
+ {Ronin::Web::Spider::Agent#every_javascript} when the page's `Content-Type`
5
+ header included `text/html` but lacked a response body, causing `page.doc` to
6
+ be `nil`.
7
+ * Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript} where parsed
8
+ JavaScript source code strings containing UTF-8 characters were being
9
+ incorrectly encoded as ASCII-8bit strings, if the page's `Content-Type` header
10
+ did not include a `charset=` attribute.
11
+ * Fixed a bug in {Ronin::Web::Spider::Agent#every_javascript_string} where
12
+ inline JavaScript regexes containing the `"` or `'` characters (ex: `/["'=]/`)
13
+ would incorrectly be treated as the beginning or ends of JavaScript string
14
+ literals. Note that while this greatly improves the accuracy of
15
+ {Ronin::Web::Spider::Agent#every_javascript_string}, it still does not
16
+ support parsing JavaScript template literals that may also contain string
17
+ literals (ex: ````Hello \"World\"```` or ````Hello ${myFunc("string literal")}````).
18
+
19
+ ### 0.1.0 / 2023-02-01
20
+
21
+ * Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
22
+ * Relicensed as LGPL-3.0.
3
23
  * Initial release:
24
+ * Requires `ruby` >= 3.0.0.
4
25
  * Built on top of the battle tested and versatile [spidr] gem.
5
26
  * Provides additional callback methods:
6
27
  * `every_host` - yields every unique host name that's spidered.
@@ -17,3 +38,4 @@
17
38
  * `every_comment` - yields every HTML or JavaScript comment.
18
39
  * Supports archiving spidered pages to a directory or git repository.
19
40
 
41
+ [spidr]: https://github.com/postmodern/spidr#readme
data/Gemfile CHANGED
@@ -1,3 +1,4 @@
1
+ # frozen_string_literal: true
1
2
  source 'https://rubygems.org'
2
3
 
3
4
  gemspec
@@ -28,4 +29,6 @@ group :development do
28
29
  gem 'dead_end', require: false
29
30
  gem 'sord', require: false, platform: :mri
30
31
  gem 'stackprof', require: false, platform: :mri
32
+ gem 'rubocop', require: false, platform: :mri
33
+ gem 'rubocop-ronin', require: false, platform: :mri
31
34
  end
data/README.md CHANGED
@@ -2,13 +2,13 @@
2
2
 
3
3
  [![CI](https://github.com/ronin-rb/ronin-web-spider/actions/workflows/ruby.yml/badge.svg)](https://github.com/ronin-rb/ronin-web-spider/actions/workflows/ruby.yml)
4
4
  [![Code Climate](https://codeclimate.com/github/ronin-rb/ronin-web-spider.svg)](https://codeclimate.com/github/ronin-rb/ronin-web-spider)
5
+ [![Gem Version](https://badge.fury.io/rb/ronin-web-spider.svg)](https://badge.fury.io/rb/ronin-web-spider)
5
6
 
6
7
  * [Website](https://ronin-rb.dev/)
7
8
  * [Source](https://github.com/ronin-rb/ronin-web-spider)
8
9
  * [Issues](https://github.com/ronin-rb/ronin-web-spider/issues)
9
10
  * [Documentation](https://ronin-rb.dev/docs/ronin-web-spider/frames)
10
11
  * [Discord](https://discord.gg/6WAb3PsVX9) |
11
- [Twitter](https://twitter.com/ronin_rb) |
12
12
  [Mastodon](https://infosec.exchange/@ronin_rb)
13
13
 
14
14
  ## Description
@@ -20,22 +20,35 @@ ronin-web-spider is a collection of common web spidering routines using the
20
20
 
21
21
  * Built on top of the battle tested and versatile [spidr] gem.
22
22
  * Provides additional callback methods:
23
- * `every_host` - yields every unique host name that's spidered.
24
- * `every_cert` - yields every unique SSL/TLS certificate encountered while
25
- spidering.
26
- * `every_favicon` - yields every favicon file that's encountered while
27
- spidering.
28
- * `every_html_comment` - yields every HTML comment.
29
- * `every_javascript` - yields all JavaScript source code from either inline
30
- `<script>` or `.js` files.
31
- * `every_javascript_string` - yields every single-quoted or double-quoted
32
- String literal from all JavaScript source code.
33
- * `every_javascript_comment` - yields every JavaScript comment.
34
- * `every_comment` - yields every HTML or JavaScript comment.
23
+ * [every_host][docs-every_host] - yields every unique host name that's
24
+ spidered.
25
+ * [every_cert][docs-every_cert] - yields every unique SSL/TLS certificate
26
+ encountered while spidering.
27
+ * [every_favicon][docs-every_favicon] - yields every favicon file that's
28
+ encountered while spidering.
29
+ * [every_html_comment][docs-every_html_comment] - yields every HTML comment.
30
+ * [every_javascript][docs-every_javascript] - yields all JavaScript source
31
+ code from either inline `<script>` or `.js` files.
32
+ * [every_javascript_string][docs-every_javascript_string] - yields every
33
+ single-quoted or double-quoted String literal from all JavaScript source
34
+ code.
35
+ * [every_javascript_comment][docs-every_javascript_comment] - yields every
36
+ JavaScript comment.
37
+ * [every_comment][docs-every_comment] - yields every HTML or JavaScript
38
+ comment.
35
39
  * Supports archiving spidered pages to a directory or git repository.
36
- * Has 94% documentation coverage.
40
+ * Has 97% documentation coverage.
37
41
  * Has 94% test coverage.
38
42
 
43
+ [docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
44
+ [docs-every_cert]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_cert-instance_method
45
+ [docs-every_favicon]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_favicon-instance_method
46
+ [docs-every_html_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_html_comment-instance_method
47
+ [docs-every_javascript]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript-instance_method
48
+ [docs-every_javascript_string]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript_string-instance_method
49
+ [docs-every_javascript_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript_comment-instance_method
50
+ [docs-every_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_comment-instance_method
51
+
39
52
  ## Examples
40
53
 
41
54
  Spider a host:
@@ -43,41 +56,299 @@ Spider a host:
43
56
  ```ruby
44
57
  require 'ronin/web/spider'
45
58
 
46
- Ronin::Web::Spider.host('www.example.com') do |agent|
47
- agent.ever_url do |url|
48
- # ...
59
+ Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
60
+ # ...
61
+ end
62
+ ```
63
+
64
+ Spider a host:
65
+
66
+ ```ruby
67
+ Ronin::Web::Spider.host('solnic.eu') do |agent|
68
+ # ...
69
+ end
70
+ ```
71
+
72
+ Spider a domain (and any sub-domains):
73
+
74
+ ```ruby
75
+ Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
76
+ # ...
77
+ end
78
+ ```
79
+
80
+ Spider a site:
81
+
82
+ ```ruby
83
+ Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
84
+ # ...
85
+ end
86
+ ```
87
+
88
+ Spider multiple hosts:
89
+
90
+ ```ruby
91
+ Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
92
+ # ...
93
+ end
94
+ ```
95
+
96
+ Do not spider certain links:
97
+
98
+ ```ruby
99
+ Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
100
+ # ...
101
+ end
102
+ ```
103
+
104
+ Do not spider links on certain ports:
105
+
106
+ ```ruby
107
+ Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
108
+ # ...
109
+ end
110
+ ```
111
+
112
+ Do not spider links blacklisted in robots.txt:
113
+
114
+ ```ruby
115
+ Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
116
+ # ...
117
+ end
118
+ ```
119
+
120
+ Print out visited URLs:
121
+
122
+ ```ruby
123
+ Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
124
+ spider.every_url { |url| puts url }
125
+ end
126
+ ```
127
+
128
+ Build a URL map of a site:
129
+
130
+ ```ruby
131
+ url_map = Hash.new { |hash,key| hash[key] = [] }
132
+
133
+ Ronin::Web::Spider.site('http://intranet.com/') do |spider|
134
+ spider.every_link do |origin,dest|
135
+ url_map[dest] << origin
136
+ end
137
+ end
138
+ ```
139
+
140
+ Print out the URLs that could not be requested:
141
+
142
+ ```ruby
143
+ Ronin::Web::Spider.site('http://company.com/') do |spider|
144
+ spider.every_failed_url { |url| puts url }
145
+ end
146
+ ```
147
+
148
+ Find all pages which have broken links:
149
+
150
+ ```ruby
151
+ url_map = Hash.new { |hash,key| hash[key] = [] }
152
+
153
+ spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
154
+ spider.every_link do |origin,dest|
155
+ url_map[dest] << origin
156
+ end
157
+ end
158
+
159
+ spider.failures.each do |url|
160
+ puts "Broken link #{url} found in:"
161
+
162
+ url_map[url].each { |page| puts " #{page}" }
163
+ end
164
+ ```
165
+
166
+ Search HTML and XML pages:
167
+
168
+ ```ruby
169
+ Ronin::Web::Spider.site('http://company.com/') do |spider|
170
+ spider.every_page do |page|
171
+ puts ">>> #{page.url}"
172
+
173
+ page.search('//meta').each do |meta|
174
+ name = (meta.attributes['name'] || meta.attributes['http-equiv'])
175
+ value = meta.attributes['content']
176
+
177
+ puts " #{name} = #{value}"
178
+ end
179
+ end
180
+ end
181
+ ```
182
+
183
+ Print out the titles from every page:
184
+
185
+ ```ruby
186
+ Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
187
+ spider.every_html_page do |page|
188
+ puts page.title
189
+ end
190
+ end
191
+ ```
192
+
193
+ Print out every HTTP redirect:
194
+
195
+ ```ruby
196
+ Ronin::Web::Spider.host('company.com') do |spider|
197
+ spider.every_redirect_page do |page|
198
+ puts "#{page.url} -> #{page.headers['Location']}"
49
199
  end
200
+ end
201
+ ```
202
+
203
+ Find what kinds of web servers a host is using, by accessing the headers:
204
+
205
+ ```ruby
206
+ servers = Set[]
50
207
 
51
- agent.every_url_like(/.../) do |url|
52
- # ...
208
+ Ronin::Web::Spider.host('company.com') do |spider|
209
+ spider.all_headers do |headers|
210
+ servers << headers['server']
53
211
  end
212
+ end
213
+ ```
54
214
 
55
- agent.every_page do |page|
56
- # ...
215
+ Pause the spider on a forbidden page:
216
+
217
+ ```ruby
218
+ Ronin::Web::Spider.host('company.com') do |spider|
219
+ spider.every_forbidden_page do |page|
220
+ spider.pause!
57
221
  end
58
222
  end
59
223
  ```
60
224
 
61
- See [Spidr::Agent] documentation for more agent methods.
225
+ Skip the processing of a page:
62
226
 
63
- [Spidr::Agent]: https://rubydoc.info/gems/spidr/Spidr/Agent
227
+ ```ruby
228
+ Ronin::Web::Spider.host('company.com') do |spider|
229
+ spider.every_missing_page do |page|
230
+ spider.skip_page!
231
+ end
232
+ end
233
+ ```
64
234
 
65
- Spider a domain:
235
+ Skip the processing of links:
66
236
 
67
237
  ```ruby
68
- Ronin::Web::Spider.domain('example.com') do |agent|
69
- agent.every_page do |page|
70
- # ...
238
+ Ronin::Web::Spider.host('company.com') do |spider|
239
+ spider.every_url do |url|
240
+ if url.path.split('/').find { |dir| dir.to_i > 1000 }
241
+ spider.skip_link!
242
+ end
71
243
  end
72
244
  end
73
245
  ```
74
246
 
75
- Spider a website:
247
+ Detect when a new host name is spidered:
76
248
 
77
249
  ```ruby
78
- Ronin::Web::Spider.site('https://www.example.com/index.html') do |agent|
79
- agent.every_page do |page|
80
- # ...
250
+ Ronin::Web::Spider.domain('example.com') do |spider|
251
+ spider.every_host do |host|
252
+ puts "Spidering #{host} ..."
253
+ end
254
+ end
255
+ ```
256
+
257
+ Detect when a new SSL/TLS certificate is encountered:
258
+
259
+ ```ruby
260
+ Ronin::Web::Spider.domain('example.com') do |spider|
261
+ spider.every_cert do |cert|
262
+ puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
263
+ end
264
+ end
265
+ ```
266
+
267
+ Print the MD5 checksum of every `favicon.ico` file:
268
+
269
+ ```ruby
270
+ Ronin::Web::Spider.domain('example.com') do |spider|
271
+ spider.every_favicon do |page|
272
+ puts "#{page.url}: #{page.body.md5}"
273
+ end
274
+ end
275
+ ```
276
+
277
+ Print every HTML comment:
278
+
279
+ ```ruby
280
+ Ronin::Web::Spider.domain('example.com') do |spider|
281
+ spider.every_html_comment do |comment|
282
+ puts comment
283
+ end
284
+ end
285
+ ```
286
+
287
+ Print all JavaScript source code:
288
+
289
+ ```ruby
290
+ Ronin::Web::Spider.domain('example.com') do |spider|
291
+ spider.every_javascript do |js|
292
+ puts js
293
+ end
294
+ end
295
+ ```
296
+
297
+ Print every JavaScript string literal:
298
+
299
+ ```ruby
300
+ Ronin::Web::Spider.domain('example.com') do |spider|
301
+ spider.every_javascript_string do |str|
302
+ puts str
303
+ end
304
+ end
305
+ ```
306
+
307
+ Print every JavaScript comment:
308
+
309
+ ```ruby
310
+ Ronin::Web::Spider.domain('example.com') do |spider|
311
+ spider.every_javascript_comment do |comment|
312
+ puts comment
313
+ end
314
+ end
315
+ ```
316
+
317
+ Print every HTML and JavaScript comment:
318
+
319
+ ```ruby
320
+ Ronin::Web::Spider.domain('example.com') do |spider|
321
+ spider.every_comment do |comment|
322
+ puts comment
323
+ end
324
+ end
325
+ ```
326
+
327
+ Spider a host and archive every web page:
328
+
329
+ ```ruby
330
+ require 'ronin/web/spider'
331
+ require 'ronin/web/spider/archive'
332
+
333
+ Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
334
+ Ronin::Web::Spider.every_page(host: 'example.com') do |page|
335
+ archive.write(page.url,page.body)
336
+ end
337
+ end
338
+ ```
339
+
340
+ Spider a host and archive every web page to a Git repository:
341
+
342
+ ```ruby
343
+ require 'ronin/web/spider/git_archive'
344
+ require 'ronin/web/spider'
345
+ require 'date'
346
+
347
+ Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
348
+ archive.commit("Updated #{Date.today}") do
349
+ Ronin::Web::Spider.every_page(host: 'example.com') do |page|
350
+ archive.write(page.url,page.body)
351
+ end
81
352
  end
82
353
  end
83
354
  ```
@@ -119,7 +390,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
119
390
 
120
391
  ## License
121
392
 
122
- Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
393
+ Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
123
394
 
124
395
  ronin-web-spider is free software: you can redistribute it and/or modify
125
396
  it under the terms of the GNU Lesser General Public License as published
data/Rakefile CHANGED
@@ -1,11 +1,11 @@
1
- require 'rubygems'
1
+ # frozen_string_literal: true
2
2
 
3
3
  begin
4
4
  require 'bundler'
5
5
  rescue LoadError => e
6
6
  warn e.message
7
7
  warn "Run `gem install bundler` to install Bundler"
8
- exit -1
8
+ exit(-1)
9
9
  end
10
10
 
11
11
  begin
data/gemspec.yml CHANGED
@@ -1,5 +1,5 @@
1
1
  name: ronin-web-spider
2
- summary: collection of common web spidering routines
2
+ summary: A collection of common web spidering routines.
3
3
  description:
4
4
  ronin-web-spider is a collection of common web spidering routines using the
5
5
  spidr gem.
@@ -11,17 +11,17 @@ homepage: https://ronin-rb.dev/
11
11
  has_yard: true
12
12
 
13
13
  metadata:
14
- documentation_uri: https://rubydoc.info/gems/ronin-web-spider
14
+ documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
15
15
  source_code_uri: https://github.com/ronin-rb/ronin-web-spider
16
16
  bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
17
- changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/master/ChangeLog.md
17
+ changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
18
18
  rubygems_mfa_required: 'true'
19
19
 
20
20
  required_ruby_version: ">= 3.0.0"
21
21
 
22
22
  dependencies:
23
23
  spidr: ~> 0.7
24
- ronin-support: ~> 1.0.0.beta1
24
+ ronin-support: ~> 1.0
25
25
 
26
26
  development_dependencies:
27
27
  bundler: ~> 2.0