ronin-web-spider 0.1.0.beta2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -4
- data/.yardopts +1 -1
- data/ChangeLog.md +3 -0
- data/README.md +302 -30
- data/gemspec.yml +3 -3
- data/lib/ronin/web/spider/agent.rb +62 -2
- data/lib/ronin/web/spider/archive.rb +3 -0
- data/lib/ronin/web/spider/exceptions.rb +1 -1
- data/lib/ronin/web/spider/git_archive.rb +1 -1
- data/lib/ronin/web/spider/version.rb +2 -2
- data/lib/ronin/web/spider.rb +289 -1
- data/ronin-web-spider.gemspec +2 -1
- metadata +7 -17
- data/spec/agent_spec.rb +0 -585
- data/spec/archive_spec.rb +0 -91
- data/spec/example_app.rb +0 -27
- data/spec/git_archive_spec.rb +0 -137
- data/spec/spec_helper.rb +0 -4
- data/spec/spider_spec.rb +0 -252
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fcb3d69132ae37799758c37282083f3b876e04e76aa3ab9f500f251b7df0984d
|
4
|
+
data.tar.gz: 04b92b26f1bcd6166530ddfe225cde18a4bbaa8a1eb3b395120ae1e6b41aec4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e5cc4d39ac8e5f9d92edd240e836d5848f0b96798afbcab9c8116f8223142851d835b7bfd3e7a8d94e867951c4b995e0a66736a73b72d6a96f06fee6daf26bc9
|
7
|
+
data.tar.gz: 4f1facfbdffe1aca7fd0d10ff0c99d6f835b2633e94be49011b46127ca9cc7b76415930d5df0a961516000032b940f00e224c562923c06412c57f2896e50256f
|
data/.github/workflows/ruby.yml
CHANGED
@@ -21,10 +21,7 @@ jobs:
|
|
21
21
|
uses: ruby/setup-ruby@v1
|
22
22
|
with:
|
23
23
|
ruby-version: ${{ matrix.ruby }}
|
24
|
-
|
25
|
-
run: |
|
26
|
-
sudo apt update -y && \
|
27
|
-
sudo apt install -y --no-install-recommends --no-install-suggests libsqlite3-dev
|
24
|
+
bundler-cache: true
|
28
25
|
- name: Install dependencies
|
29
26
|
run: bundle install --jobs 4 --retry 3
|
30
27
|
- name: Run tests
|
data/.yardopts
CHANGED
@@ -1 +1 @@
|
|
1
|
-
--markup markdown --title 'Ronin
|
1
|
+
--markup markdown --title 'Ronin::Web::Spider Documentation' --protected
|
data/ChangeLog.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
### 0.1.0 / 2023-XX-XX
|
2
2
|
|
3
|
+
* Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
|
4
|
+
* Relicensed as LGPL-3.0.
|
3
5
|
* Initial release:
|
6
|
+
* Requires `ruby` >= 3.0.0.
|
4
7
|
* Built on top of the battle tested and versatile [spidr] gem.
|
5
8
|
* Provides additional callback methods:
|
6
9
|
* `every_host` - yields every unique host name that's spidered.
|
data/README.md
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
[![CI](https://github.com/ronin-rb/ronin-web-spider/actions/workflows/ruby.yml/badge.svg)](https://github.com/ronin-rb/ronin-web-spider/actions/workflows/ruby.yml)
|
4
4
|
[![Code Climate](https://codeclimate.com/github/ronin-rb/ronin-web-spider.svg)](https://codeclimate.com/github/ronin-rb/ronin-web-spider)
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/ronin-web-spider.svg)](https://badge.fury.io/rb/ronin-web-spider)
|
5
6
|
|
6
7
|
* [Website](https://ronin-rb.dev/)
|
7
8
|
* [Source](https://github.com/ronin-rb/ronin-web-spider)
|
@@ -20,22 +21,35 @@ ronin-web-spider is a collection of common web spidering routines using the
|
|
20
21
|
|
21
22
|
* Built on top of the battle tested and versatile [spidr] gem.
|
22
23
|
* Provides additional callback methods:
|
23
|
-
*
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
*
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
24
|
+
* [every_host][docs-every_host] - yields every unique host name that's
|
25
|
+
spidered.
|
26
|
+
* [every_cert][docs-every_cert] - yields every unique SSL/TLS certificate
|
27
|
+
encountered while spidering.
|
28
|
+
* [every_favicon][docs-every_favicon] - yields every favicon file that's
|
29
|
+
encountered while spidering.
|
30
|
+
* [every_html_comment][docs-every_html_comment] - yields every HTML comment.
|
31
|
+
* [every_javascript][docs-every_javascript] - yields all JavaScript source
|
32
|
+
code from either inline `<script>` or `.js` files.
|
33
|
+
* [every_javascript_string][docs-every_javascript_string] - yields every
|
34
|
+
single-quoted or double-quoted String literal from all JavaScript source
|
35
|
+
code.
|
36
|
+
* [every_javascript_comment][docs-every_javascript_comment] - yields every
|
37
|
+
JavaScript comment.
|
38
|
+
* [every_comment][docs-every_comment] - yields every HTML or JavaScript
|
39
|
+
comment.
|
35
40
|
* Supports archiving spidered pages to a directory or git repository.
|
36
41
|
* Has 94% documentation coverage.
|
37
42
|
* Has 94% test coverage.
|
38
43
|
|
44
|
+
[docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
|
45
|
+
[docs-every_cert]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_cert-instance_method
|
46
|
+
[docs-every_favicon]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_favicon-instance_method
|
47
|
+
[docs-every_html_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_html_comment-instance_method
|
48
|
+
[docs-every_javascript]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript-instance_method
|
49
|
+
[docs-every_javascript_string]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript_string-instance_method
|
50
|
+
[docs-every_javascript_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript_comment-instance_method
|
51
|
+
[docs-every_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_comment-instance_method
|
52
|
+
|
39
53
|
## Examples
|
40
54
|
|
41
55
|
Start spidering from a URL:
|
@@ -43,41 +57,299 @@ Spider a host:
|
|
43
57
|
```ruby
|
44
58
|
require 'ronin/web/spider'
|
45
59
|
|
46
|
-
Ronin::Web::Spider.
|
47
|
-
|
48
|
-
|
60
|
+
Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
|
61
|
+
# ...
|
62
|
+
end
|
63
|
+
```
|
64
|
+
|
65
|
+
Spider a host:
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
Ronin::Web::Spider.host('solnic.eu') do |agent|
|
69
|
+
# ...
|
70
|
+
end
|
71
|
+
```
|
72
|
+
|
73
|
+
Spider a domain (and any sub-domains):
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
|
77
|
+
# ...
|
78
|
+
end
|
79
|
+
```
|
80
|
+
|
81
|
+
Spider a site:
|
82
|
+
|
83
|
+
```ruby
|
84
|
+
Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
|
85
|
+
# ...
|
86
|
+
end
|
87
|
+
```
|
88
|
+
|
89
|
+
Spider multiple hosts:
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
93
|
+
# ...
|
94
|
+
end
|
95
|
+
```
|
96
|
+
|
97
|
+
Do not spider certain links:
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
|
101
|
+
# ...
|
102
|
+
end
|
103
|
+
```
|
104
|
+
|
105
|
+
Do not spider links on certain ports:
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
109
|
+
# ...
|
110
|
+
end
|
111
|
+
```
|
112
|
+
|
113
|
+
Do not spider links blacklisted in robots.txt:
|
114
|
+
|
115
|
+
```ruby
|
116
|
+
Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
|
117
|
+
# ...
|
118
|
+
end
|
119
|
+
```
|
120
|
+
|
121
|
+
Print out visited URLs:
|
122
|
+
|
123
|
+
```ruby
|
124
|
+
Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
|
125
|
+
spider.every_url { |url| puts url }
|
126
|
+
end
|
127
|
+
```
|
128
|
+
|
129
|
+
Build a URL map of a site:
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
url_map = Hash.new { |hash,key| hash[key] = [] }
|
133
|
+
|
134
|
+
Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
135
|
+
spider.every_link do |origin,dest|
|
136
|
+
url_map[dest] << origin
|
137
|
+
end
|
138
|
+
end
|
139
|
+
```
|
140
|
+
|
141
|
+
Print out the URLs that could not be requested:
|
142
|
+
|
143
|
+
```ruby
|
144
|
+
Ronin::Web::Spider.site('http://company.com/') do |spider|
|
145
|
+
spider.every_failed_url { |url| puts url }
|
146
|
+
end
|
147
|
+
```
|
148
|
+
|
149
|
+
Find all pages which have broken links:
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
url_map = Hash.new { |hash,key| hash[key] = [] }
|
153
|
+
|
154
|
+
spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
155
|
+
spider.every_link do |origin,dest|
|
156
|
+
url_map[dest] << origin
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
spider.failures.each do |url|
|
161
|
+
puts "Broken link #{url} found in:"
|
162
|
+
|
163
|
+
url_map[url].each { |page| puts " #{page}" }
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
167
|
+
Search HTML and XML pages:
|
168
|
+
|
169
|
+
```ruby
|
170
|
+
Ronin::Web::Spider.site('http://company.com/') do |spider|
|
171
|
+
spider.every_page do |page|
|
172
|
+
puts ">>> #{page.url}"
|
173
|
+
|
174
|
+
page.search('//meta').each do |meta|
|
175
|
+
name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
176
|
+
value = meta.attributes['content']
|
177
|
+
|
178
|
+
puts " #{name} = #{value}"
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
```
|
183
|
+
|
184
|
+
Print out the titles from every page:
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
|
188
|
+
spider.every_html_page do |page|
|
189
|
+
puts page.title
|
190
|
+
end
|
191
|
+
end
|
192
|
+
```
|
193
|
+
|
194
|
+
Print out every HTTP redirect:
|
195
|
+
|
196
|
+
```ruby
|
197
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
198
|
+
spider.every_redirect_page do |page|
|
199
|
+
puts "#{page.url} -> #{page.headers['Location']}"
|
49
200
|
end
|
201
|
+
end
|
202
|
+
```
|
203
|
+
|
204
|
+
Find what kinds of web servers a host is using, by accessing the headers:
|
205
|
+
|
206
|
+
```ruby
|
207
|
+
servers = Set[]
|
50
208
|
|
51
|
-
|
52
|
-
|
209
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
210
|
+
spider.all_headers do |headers|
|
211
|
+
servers << headers['server']
|
53
212
|
end
|
213
|
+
end
|
214
|
+
```
|
54
215
|
|
55
|
-
|
56
|
-
|
216
|
+
Pause the spider on a forbidden page:
|
217
|
+
|
218
|
+
```ruby
|
219
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
220
|
+
spider.every_forbidden_page do |page|
|
221
|
+
spider.pause!
|
57
222
|
end
|
58
223
|
end
|
59
224
|
```
|
60
225
|
|
61
|
-
|
226
|
+
Skip the processing of a page:
|
62
227
|
|
63
|
-
|
228
|
+
```ruby
|
229
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
230
|
+
spider.every_missing_page do |page|
|
231
|
+
spider.skip_page!
|
232
|
+
end
|
233
|
+
end
|
234
|
+
```
|
64
235
|
|
65
|
-
|
236
|
+
Skip the processing of links:
|
66
237
|
|
67
238
|
```ruby
|
68
|
-
Ronin::Web::Spider.
|
69
|
-
|
70
|
-
|
239
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
240
|
+
spider.every_url do |url|
|
241
|
+
if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
242
|
+
spider.skip_link!
|
243
|
+
end
|
71
244
|
end
|
72
245
|
end
|
73
246
|
```
|
74
247
|
|
75
|
-
|
248
|
+
Detect when a new host name is spidered:
|
76
249
|
|
77
250
|
```ruby
|
78
|
-
Ronin::Web::Spider.
|
79
|
-
|
80
|
-
# ...
|
251
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
252
|
+
spider.every_host do |host|
|
253
|
+
puts "Spidering #{host} ..."
|
254
|
+
end
|
255
|
+
end
|
256
|
+
```
|
257
|
+
|
258
|
+
Detect when a new SSL/TLS certificate is encountered:
|
259
|
+
|
260
|
+
```ruby
|
261
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
262
|
+
spider.every_cert do |cert|
|
263
|
+
puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
264
|
+
end
|
265
|
+
end
|
266
|
+
```
|
267
|
+
|
268
|
+
Print the MD5 checksum of every `favicon.ico` file:
|
269
|
+
|
270
|
+
```ruby
|
271
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
272
|
+
spider.every_favicon do |page|
|
273
|
+
puts "#{page.url}: #{page.body.md5}"
|
274
|
+
end
|
275
|
+
end
|
276
|
+
```
|
277
|
+
|
278
|
+
Print every HTML comment:
|
279
|
+
|
280
|
+
```ruby
|
281
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
282
|
+
spider.every_html_comment do |comment|
|
283
|
+
puts comment
|
284
|
+
end
|
285
|
+
end
|
286
|
+
```
|
287
|
+
|
288
|
+
Print all JavaScript source code:
|
289
|
+
|
290
|
+
```ruby
|
291
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
292
|
+
spider.every_javascript do |js|
|
293
|
+
puts js
|
294
|
+
end
|
295
|
+
end
|
296
|
+
```
|
297
|
+
|
298
|
+
Print every JavaScript string literal:
|
299
|
+
|
300
|
+
```ruby
|
301
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
302
|
+
spider.every_javascript_string do |str|
|
303
|
+
puts str
|
304
|
+
end
|
305
|
+
end
|
306
|
+
```
|
307
|
+
|
308
|
+
Print every JavaScript comment:
|
309
|
+
|
310
|
+
```ruby
|
311
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
312
|
+
spider.every_javascript_comment do |comment|
|
313
|
+
puts comment
|
314
|
+
end
|
315
|
+
end
|
316
|
+
```
|
317
|
+
|
318
|
+
Print every HTML and JavaScript comment:
|
319
|
+
|
320
|
+
```ruby
|
321
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
322
|
+
spider.every_comment do |comment|
|
323
|
+
puts comment
|
324
|
+
end
|
325
|
+
end
|
326
|
+
```
|
327
|
+
|
328
|
+
Spider a host and archive every web page:
|
329
|
+
|
330
|
+
```ruby
|
331
|
+
require 'ronin/web/spider'
|
332
|
+
require 'ronin/web/spider/archive'
|
333
|
+
|
334
|
+
Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
|
335
|
+
Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
336
|
+
archive.write(page.url,page.body)
|
337
|
+
end
|
338
|
+
end
|
339
|
+
```
|
340
|
+
|
341
|
+
Spider a host and archive every web page to a Git repository:
|
342
|
+
|
343
|
+
```ruby
|
344
|
+
require 'ronin/web/spider/git_archive'
|
345
|
+
require 'ronin/web/spider'
|
346
|
+
require 'date'
|
347
|
+
|
348
|
+
Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
|
349
|
+
archive.commit("Updated #{Date.today}") do
|
350
|
+
Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
351
|
+
archive.write(page.url,page.body)
|
352
|
+
end
|
81
353
|
end
|
82
354
|
end
|
83
355
|
```
|
@@ -119,7 +391,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
|
|
119
391
|
|
120
392
|
## License
|
121
393
|
|
122
|
-
Copyright (c) 2006-
|
394
|
+
Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
123
395
|
|
124
396
|
ronin-web-spider is free software: you can redistribute it and/or modify
|
125
397
|
it under the terms of the GNU Lesser General Public License as published
|
data/gemspec.yml
CHANGED
@@ -11,17 +11,17 @@ homepage: https://ronin-rb.dev/
|
|
11
11
|
has_yard: true
|
12
12
|
|
13
13
|
metadata:
|
14
|
-
documentation_uri: https://
|
14
|
+
documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
|
15
15
|
source_code_uri: https://github.com/ronin-rb/ronin-web-spider
|
16
16
|
bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
|
17
|
-
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/
|
17
|
+
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
|
18
18
|
rubygems_mfa_required: 'true'
|
19
19
|
|
20
20
|
required_ruby_version: ">= 3.0.0"
|
21
21
|
|
22
22
|
dependencies:
|
23
23
|
spidr: ~> 0.7
|
24
|
-
ronin-support: ~> 1.0
|
24
|
+
ronin-support: ~> 1.0
|
25
25
|
|
26
26
|
development_dependencies:
|
27
27
|
bundler: ~> 2.0
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2006-
|
4
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -122,6 +122,8 @@ module Ronin
|
|
122
122
|
# The visited host names.
|
123
123
|
#
|
124
124
|
# @return [Set<String>, nil]
|
125
|
+
#
|
126
|
+
# @api public
|
125
127
|
attr_reader :visited_hosts
|
126
128
|
|
127
129
|
#
|
@@ -132,6 +134,13 @@ module Ronin
|
|
132
134
|
#
|
133
135
|
# @yieldparam [String] host
|
134
136
|
#
|
137
|
+
# @example
|
138
|
+
# spider.every_host do |host|
|
139
|
+
# puts "Spidering #{host} ..."
|
140
|
+
# end
|
141
|
+
#
|
142
|
+
# @api public
|
143
|
+
#
|
135
144
|
def every_host
|
136
145
|
@visited_hosts ||= Set.new
|
137
146
|
|
@@ -147,6 +156,8 @@ module Ronin
|
|
147
156
|
# All certificates encountered while spidering.
|
148
157
|
#
|
149
158
|
# @return [Array<Ronin::Support::Crypto::Cert>]
|
159
|
+
#
|
160
|
+
# @api public
|
150
161
|
attr_reader :collected_certs
|
151
162
|
|
152
163
|
#
|
@@ -157,6 +168,13 @@ module Ronin
|
|
157
168
|
#
|
158
169
|
# @yieldparam [Ronin::Support::Crypto::Cert]
|
159
170
|
#
|
171
|
+
# @example
|
172
|
+
# spider.every_cert do |cert|
|
173
|
+
# puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
174
|
+
# end
|
175
|
+
#
|
176
|
+
# @api public
|
177
|
+
#
|
160
178
|
def every_cert
|
161
179
|
@collected_certs ||= []
|
162
180
|
|
@@ -185,8 +203,15 @@ module Ronin
|
|
185
203
|
# @yieldparam [Spidr::Page] favicon
|
186
204
|
# An encountered `.ico` file.
|
187
205
|
#
|
206
|
+
# @example
|
207
|
+
# spider.every_favicon do |page|
|
208
|
+
# # ...
|
209
|
+
# end
|
210
|
+
#
|
188
211
|
# @see https://rubydoc.info/gems/spidr/Spidr/Page
|
189
212
|
#
|
213
|
+
# @api public
|
214
|
+
#
|
190
215
|
def every_favicon
|
191
216
|
every_page do |page|
|
192
217
|
yield page if page.icon?
|
@@ -197,12 +222,19 @@ module Ronin
|
|
197
222
|
# Passes every non-empty HTML comment to the given block.
|
198
223
|
#
|
199
224
|
# @yield [comment]
|
200
|
-
# The given block will be
|
225
|
+
# The given block will be passed every HTML comment.
|
201
226
|
#
|
202
227
|
# @yieldparam [String] comment
|
203
228
|
# The HTML comment inner text, with leading and trailing whitespace
|
204
229
|
# stripped.
|
205
230
|
#
|
231
|
+
# @example
|
232
|
+
# spider.every_html_comment do |comment|
|
233
|
+
# puts comment
|
234
|
+
# end
|
235
|
+
#
|
236
|
+
# @api public
|
237
|
+
#
|
206
238
|
def every_html_comment
|
207
239
|
every_html_page do |page|
|
208
240
|
page.doc.xpath('//comment()').each do |comment|
|
@@ -224,6 +256,13 @@ module Ronin
|
|
224
256
|
# @yieldparam [String] js
|
225
257
|
# The JavaScript source code.
|
226
258
|
#
|
259
|
+
# @example
|
260
|
+
# spider.every_javascript do |js|
|
261
|
+
# puts js
|
262
|
+
# end
|
263
|
+
#
|
264
|
+
# @api public
|
265
|
+
#
|
227
266
|
def every_javascript
|
228
267
|
# yield inner text of every `<script type="text/javascript">` tag
|
229
268
|
# and every `.js` URL.
|
@@ -252,6 +291,13 @@ module Ronin
|
|
252
291
|
# @yieldparam [String] string
|
253
292
|
# The parsed contents of a JavaScript string.
|
254
293
|
#
|
294
|
+
# @example
|
295
|
+
# spider.every_javascript_string do |str|
|
296
|
+
# puts str
|
297
|
+
# end
|
298
|
+
#
|
299
|
+
# @api public
|
300
|
+
#
|
255
301
|
def every_javascript_string
|
256
302
|
every_javascript do |js|
|
257
303
|
js.scan(Support::Text::Patterns::STRING) do |js_string|
|
@@ -271,6 +317,13 @@ module Ronin
|
|
271
317
|
# @yieldparam [String] comment
|
272
318
|
# The contents of a JavaScript comment.
|
273
319
|
#
|
320
|
+
# @example
|
321
|
+
# spider.every_javascript_comment do |comment|
|
322
|
+
# puts comment
|
323
|
+
# end
|
324
|
+
#
|
325
|
+
# @api public
|
326
|
+
#
|
274
327
|
def every_javascript_comment(&block)
|
275
328
|
every_javascript do |js|
|
276
329
|
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
|
@@ -288,9 +341,16 @@ module Ronin
|
|
288
341
|
# @yieldparam [String] comment
|
289
342
|
# The contents of a HTML or JavaScript comment.
|
290
343
|
#
|
344
|
+
# @example
|
345
|
+
# spider.every_comment do |comment|
|
346
|
+
# puts comment
|
347
|
+
# end
|
348
|
+
#
|
291
349
|
# @see #every_html_comment
|
292
350
|
# @see #every_javascript_comment
|
293
351
|
#
|
352
|
+
# @api public
|
353
|
+
#
|
294
354
|
def every_comment(&block)
|
295
355
|
every_html_comment(&block)
|
296
356
|
every_javascript_comment(&block)
|
@@ -29,6 +29,9 @@ module Ronin
|
|
29
29
|
#
|
30
30
|
# Spider a host and archive every web page:
|
31
31
|
#
|
32
|
+
# require 'ronin/web/spider'
|
33
|
+
# require 'ronin/web/spider/archive'
|
34
|
+
#
|
32
35
|
# Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
|
33
36
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
34
37
|
# archive.write(page.url,page.body)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2006-
|
4
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -30,8 +30,8 @@ module Ronin
|
|
30
30
|
#
|
31
31
|
# Spider a host and archive every web page to a Git repository:
|
32
32
|
#
|
33
|
-
# require 'ronin/web/spider/git_archive'
|
34
33
|
# require 'ronin/web/spider'
|
34
|
+
# require 'ronin/web/spider/git_archive'
|
35
35
|
# require 'date'
|
36
36
|
#
|
37
37
|
# Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2006-
|
4
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -21,7 +21,7 @@ module Ronin
|
|
21
21
|
module Web
|
22
22
|
module Spider
|
23
23
|
# ronin-web-spider version
|
24
|
-
VERSION = '0.1.0
|
24
|
+
VERSION = '0.1.0'
|
25
25
|
end
|
26
26
|
end
|
27
27
|
end
|