ronin-web-spider 0.1.0.beta1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +1 -4
- data/.yardopts +1 -1
- data/ChangeLog.md +3 -0
- data/Gemfile +2 -2
- data/README.md +302 -30
- data/gemspec.yml +2 -2
- data/lib/ronin/web/spider/agent.rb +62 -2
- data/lib/ronin/web/spider/archive.rb +3 -0
- data/lib/ronin/web/spider/exceptions.rb +1 -1
- data/lib/ronin/web/spider/git_archive.rb +1 -1
- data/lib/ronin/web/spider/version.rb +2 -2
- data/lib/ronin/web/spider.rb +289 -1
- data/ronin-web-spider.gemspec +2 -1
- metadata +5 -15
- data/spec/agent_spec.rb +0 -585
- data/spec/archive_spec.rb +0 -91
- data/spec/example_app.rb +0 -27
- data/spec/git_archive_spec.rb +0 -137
- data/spec/spec_helper.rb +0 -4
- data/spec/spider_spec.rb +0 -252
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fcb3d69132ae37799758c37282083f3b876e04e76aa3ab9f500f251b7df0984d
|
4
|
+
data.tar.gz: 04b92b26f1bcd6166530ddfe225cde18a4bbaa8a1eb3b395120ae1e6b41aec4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e5cc4d39ac8e5f9d92edd240e836d5848f0b96798afbcab9c8116f8223142851d835b7bfd3e7a8d94e867951c4b995e0a66736a73b72d6a96f06fee6daf26bc9
|
7
|
+
data.tar.gz: 4f1facfbdffe1aca7fd0d10ff0c99d6f835b2633e94be49011b46127ca9cc7b76415930d5df0a961516000032b940f00e224c562923c06412c57f2896e50256f
|
data/.github/workflows/ruby.yml
CHANGED
@@ -21,10 +21,7 @@ jobs:
|
|
21
21
|
uses: ruby/setup-ruby@v1
|
22
22
|
with:
|
23
23
|
ruby-version: ${{ matrix.ruby }}
|
24
|
-
|
25
|
-
run: |
|
26
|
-
sudo apt update -y && \
|
27
|
-
sudo apt install -y --no-install-recommends --no-install-suggests libsqlite3-dev
|
24
|
+
bundler-cache: true
|
28
25
|
- name: Install dependencies
|
29
26
|
run: bundle install --jobs 4 --retry 3
|
30
27
|
- name: Run tests
|
data/.yardopts
CHANGED
@@ -1 +1 @@
|
|
1
|
-
--markup markdown --title 'Ronin
|
1
|
+
--markup markdown --title 'Ronin::Web::Spider Documentation' --protected
|
data/ChangeLog.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
### 0.1.0 / 2023-XX-XX
|
2
2
|
|
3
|
+
* Extracted and refactored from [ronin-web](https://github.com/ronin-rb/ronin-web/tree/v0.3.0.rc1).
|
4
|
+
* Relicensed as LGPL-3.0.
|
3
5
|
* Initial release:
|
6
|
+
* Requires `ruby` >= 3.0.0.
|
4
7
|
* Built on top of the battle tested and versatile [spidr] gem.
|
5
8
|
* Provides additional callback methods:
|
6
9
|
* `every_host` - yields every unique host name that's spidered.
|
data/Gemfile
CHANGED
@@ -8,8 +8,8 @@ end
|
|
8
8
|
|
9
9
|
# gem 'spidr', '~> 0.7', github: 'postmodern/spidr'
|
10
10
|
|
11
|
-
gem 'ronin-support', '~> 1.0', github: "ronin-rb/ronin-support",
|
12
|
-
|
11
|
+
# gem 'ronin-support', '~> 1.0', github: "ronin-rb/ronin-support",
|
12
|
+
# branch: 'main'
|
13
13
|
|
14
14
|
group :development do
|
15
15
|
gem 'rake'
|
data/README.md
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
[](https://github.com/ronin-rb/ronin-web-spider/actions/workflows/ruby.yml)
|
4
4
|
[](https://codeclimate.com/github/ronin-rb/ronin-web-spider)
|
5
|
+
[](https://badge.fury.io/rb/ronin-web-spider)
|
5
6
|
|
6
7
|
* [Website](https://ronin-rb.dev/)
|
7
8
|
* [Source](https://github.com/ronin-rb/ronin-web-spider)
|
@@ -20,22 +21,35 @@ ronin-web-spider is a collection of common web spidering routines using the
|
|
20
21
|
|
21
22
|
* Built on top of the battle tested and versatile [spidr] gem.
|
22
23
|
* Provides additional callback methods:
|
23
|
-
*
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
*
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
24
|
+
* [every_host][docs-every_host] - yields every unique host name that's
|
25
|
+
spidered.
|
26
|
+
* [every_cert][docs-every_cert] - yields every unique SSL/TLS certificate
|
27
|
+
encountered while spidering.
|
28
|
+
* [every_favicon][docs-every_favicon] - yields every favicon file that's
|
29
|
+
encountered while spidering.
|
30
|
+
* [every_html_comment][docs-every_html_comment] - yields every HTML comment.
|
31
|
+
* [every_javascript][docs-every_javascript] - yields all JavaScript source
|
32
|
+
code from either inline `<script>` or `.js` files.
|
33
|
+
* [every_javascript_string][docs-every_javascript_string] - yields every
|
34
|
+
single-quoted or double-quoted String literal from all JavaScript source
|
35
|
+
code.
|
36
|
+
* [every_javascript_comment][docs-every_javascript_comment] - yields every
|
37
|
+
JavaScript comment.
|
38
|
+
* [every_comment][docs-every_comment] - yields every HTML or JavaScript
|
39
|
+
comment.
|
35
40
|
* Supports archiving spidered pages to a directory or git repository.
|
36
41
|
* Has 94% documentation coverage.
|
37
42
|
* Has 94% test coverage.
|
38
43
|
|
44
|
+
[docs-every_host]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_host-instance_method
|
45
|
+
[docs-every_cert]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_cert-instance_method
|
46
|
+
[docs-every_favicon]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_favicon-instance_method
|
47
|
+
[docs-every_html_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_html_comment-instance_method
|
48
|
+
[docs-every_javascript]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript-instance_method
|
49
|
+
[docs-every_javascript_string]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript_string-instance_method
|
50
|
+
[docs-every_javascript_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_javascript_comment-instance_method
|
51
|
+
[docs-every_comment]: https://ronin-rb.dev/docs/ronin-web-spider/Ronin/Web/Spider/Agent.html#every_comment-instance_method
|
52
|
+
|
39
53
|
## Examples
|
40
54
|
|
41
55
|
Start spidering from a URL:
|
@@ -43,41 +57,299 @@ Spider a host:
|
|
43
57
|
```ruby
|
44
58
|
require 'ronin/web/spider'
|
45
59
|
|
46
|
-
Ronin::Web::Spider.
|
47
|
-
|
48
|
-
|
60
|
+
Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent|
|
61
|
+
# ...
|
62
|
+
end
|
63
|
+
```
|
64
|
+
|
65
|
+
Spider a host:
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
Ronin::Web::Spider.host('solnic.eu') do |agent|
|
69
|
+
# ...
|
70
|
+
end
|
71
|
+
```
|
72
|
+
|
73
|
+
Spider a domain (and any sub-domains):
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
Ronin::Web::Spider.domain('ruby-lang.org') do |agent|
|
77
|
+
# ...
|
78
|
+
end
|
79
|
+
```
|
80
|
+
|
81
|
+
Spider a site:
|
82
|
+
|
83
|
+
```ruby
|
84
|
+
Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent|
|
85
|
+
# ...
|
86
|
+
end
|
87
|
+
```
|
88
|
+
|
89
|
+
Spider multiple hosts:
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent|
|
93
|
+
# ...
|
94
|
+
end
|
95
|
+
```
|
96
|
+
|
97
|
+
Do not spider certain links:
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
Ronin::Web::Spider.site('http://company.com/', ignore_links: [%r{^/blog/}]) do |agent|
|
101
|
+
# ...
|
102
|
+
end
|
103
|
+
```
|
104
|
+
|
105
|
+
Do not spider links on certain ports:
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent|
|
109
|
+
# ...
|
110
|
+
end
|
111
|
+
```
|
112
|
+
|
113
|
+
Do not spider links blacklisted in robots.txt:
|
114
|
+
|
115
|
+
```ruby
|
116
|
+
Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent|
|
117
|
+
# ...
|
118
|
+
end
|
119
|
+
```
|
120
|
+
|
121
|
+
Print out visited URLs:
|
122
|
+
|
123
|
+
```ruby
|
124
|
+
Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider|
|
125
|
+
spider.every_url { |url| puts url }
|
126
|
+
end
|
127
|
+
```
|
128
|
+
|
129
|
+
Build a URL map of a site:
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
url_map = Hash.new { |hash,key| hash[key] = [] }
|
133
|
+
|
134
|
+
Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
135
|
+
spider.every_link do |origin,dest|
|
136
|
+
url_map[dest] << origin
|
137
|
+
end
|
138
|
+
end
|
139
|
+
```
|
140
|
+
|
141
|
+
Print out the URLs that could not be requested:
|
142
|
+
|
143
|
+
```ruby
|
144
|
+
Ronin::Web::Spider.site('http://company.com/') do |spider|
|
145
|
+
spider.every_failed_url { |url| puts url }
|
146
|
+
end
|
147
|
+
```
|
148
|
+
|
149
|
+
Finds all pages which have broken links:
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
url_map = Hash.new { |hash,key| hash[key] = [] }
|
153
|
+
|
154
|
+
spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider|
|
155
|
+
spider.every_link do |origin,dest|
|
156
|
+
url_map[dest] << origin
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
spider.failures.each do |url|
|
161
|
+
puts "Broken link #{url} found in:"
|
162
|
+
|
163
|
+
url_map[url].each { |page| puts " #{page}" }
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
167
|
+
Search HTML and XML pages:
|
168
|
+
|
169
|
+
```ruby
|
170
|
+
Ronin::Web::Spider.site('http://company.com/') do |spider|
|
171
|
+
spider.every_page do |page|
|
172
|
+
puts ">>> #{page.url}"
|
173
|
+
|
174
|
+
page.search('//meta').each do |meta|
|
175
|
+
name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
176
|
+
value = meta.attributes['content']
|
177
|
+
|
178
|
+
puts " #{name} = #{value}"
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
```
|
183
|
+
|
184
|
+
Print out the titles from every page:
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider|
|
188
|
+
spider.every_html_page do |page|
|
189
|
+
puts page.title
|
190
|
+
end
|
191
|
+
end
|
192
|
+
```
|
193
|
+
|
194
|
+
Print out every HTTP redirect:
|
195
|
+
|
196
|
+
```ruby
|
197
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
198
|
+
spider.every_redirect_page do |page|
|
199
|
+
puts "#{page.url} -> #{page.headers['Location']}"
|
49
200
|
end
|
201
|
+
end
|
202
|
+
```
|
203
|
+
|
204
|
+
Find what kinds of web servers a host is using, by accessing the headers:
|
205
|
+
|
206
|
+
```ruby
|
207
|
+
servers = Set[]
|
50
208
|
|
51
|
-
|
52
|
-
|
209
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
210
|
+
spider.all_headers do |headers|
|
211
|
+
servers << headers['server']
|
53
212
|
end
|
213
|
+
end
|
214
|
+
```
|
54
215
|
|
55
|
-
|
56
|
-
|
216
|
+
Pause the spider on a forbidden page:
|
217
|
+
|
218
|
+
```ruby
|
219
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
220
|
+
spider.every_forbidden_page do |page|
|
221
|
+
spider.pause!
|
57
222
|
end
|
58
223
|
end
|
59
224
|
```
|
60
225
|
|
61
|
-
|
226
|
+
Skip the processing of a page:
|
62
227
|
|
63
|
-
|
228
|
+
```ruby
|
229
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
230
|
+
spider.every_missing_page do |page|
|
231
|
+
spider.skip_page!
|
232
|
+
end
|
233
|
+
end
|
234
|
+
```
|
64
235
|
|
65
|
-
|
236
|
+
Skip the processing of links:
|
66
237
|
|
67
238
|
```ruby
|
68
|
-
Ronin::Web::Spider.
|
69
|
-
|
70
|
-
|
239
|
+
Ronin::Web::Spider.host('company.com') do |spider|
|
240
|
+
spider.every_url do |url|
|
241
|
+
if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
242
|
+
spider.skip_link!
|
243
|
+
end
|
71
244
|
end
|
72
245
|
end
|
73
246
|
```
|
74
247
|
|
75
|
-
|
248
|
+
Detect when a new host name is spidered:
|
76
249
|
|
77
250
|
```ruby
|
78
|
-
Ronin::Web::Spider.
|
79
|
-
|
80
|
-
# ...
|
251
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
252
|
+
spider.every_host do |host|
|
253
|
+
puts "Spidering #{host} ..."
|
254
|
+
end
|
255
|
+
end
|
256
|
+
```
|
257
|
+
|
258
|
+
Detect when a new SSL/TLS certificate is encountered:
|
259
|
+
|
260
|
+
```ruby
|
261
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
262
|
+
spider.every_cert do |cert|
|
263
|
+
puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
264
|
+
end
|
265
|
+
end
|
266
|
+
```
|
267
|
+
|
268
|
+
Print the MD5 checksum of every `favicon.ico` file:
|
269
|
+
|
270
|
+
```ruby
|
271
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
272
|
+
spider.every_favicon do |page|
|
273
|
+
puts "#{page.url}: #{page.body.md5}"
|
274
|
+
end
|
275
|
+
end
|
276
|
+
```
|
277
|
+
|
278
|
+
Print every HTML comment:
|
279
|
+
|
280
|
+
```ruby
|
281
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
282
|
+
spider.every_html_comment do |comment|
|
283
|
+
puts comment
|
284
|
+
end
|
285
|
+
end
|
286
|
+
```
|
287
|
+
|
288
|
+
Print all JavaScript source code:
|
289
|
+
|
290
|
+
```ruby
|
291
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
292
|
+
spider.every_javascript do |js|
|
293
|
+
puts js
|
294
|
+
end
|
295
|
+
end
|
296
|
+
```
|
297
|
+
|
298
|
+
Print every JavaScript string literal:
|
299
|
+
|
300
|
+
```ruby
|
301
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
302
|
+
spider.every_javascript_string do |str|
|
303
|
+
puts str
|
304
|
+
end
|
305
|
+
end
|
306
|
+
```
|
307
|
+
|
308
|
+
Print every JavaScript comment:
|
309
|
+
|
310
|
+
```ruby
|
311
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
312
|
+
spider.every_javascript_comment do |comment|
|
313
|
+
puts comment
|
314
|
+
end
|
315
|
+
end
|
316
|
+
```
|
317
|
+
|
318
|
+
Print every HTML and JavaScript comment:
|
319
|
+
|
320
|
+
```ruby
|
321
|
+
Ronin::Web::Spider.domain('example.com') do |spider|
|
322
|
+
spider.every_comment do |comment|
|
323
|
+
puts comment
|
324
|
+
end
|
325
|
+
end
|
326
|
+
```
|
327
|
+
|
328
|
+
Spider a host and archive every web page:
|
329
|
+
|
330
|
+
```ruby
|
331
|
+
require 'ronin/web/spider'
|
332
|
+
require 'ronin/web/spider/archive'
|
333
|
+
|
334
|
+
Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
|
335
|
+
Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
336
|
+
archive.write(page.url,page.body)
|
337
|
+
end
|
338
|
+
end
|
339
|
+
```
|
340
|
+
|
341
|
+
Spider a host and archive every web page to a Git repository:
|
342
|
+
|
343
|
+
```ruby
|
344
|
+
require 'ronin/web/spider/git_archive'
|
345
|
+
require 'ronin/web/spider'
|
346
|
+
require 'date'
|
347
|
+
|
348
|
+
Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
|
349
|
+
archive.commit("Updated #{Date.today}") do
|
350
|
+
Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
351
|
+
archive.write(page.url,page.body)
|
352
|
+
end
|
81
353
|
end
|
82
354
|
end
|
83
355
|
```
|
@@ -119,7 +391,7 @@ gem.add_dependency 'ronin-web-spider', '~> 0.1'
|
|
119
391
|
|
120
392
|
## License
|
121
393
|
|
122
|
-
Copyright (c) 2006-
|
394
|
+
Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
123
395
|
|
124
396
|
ronin-web-spider is free software: you can redistribute it and/or modify
|
125
397
|
it under the terms of the GNU Lesser General Public License as published
|
data/gemspec.yml
CHANGED
@@ -11,10 +11,10 @@ homepage: https://ronin-rb.dev/
|
|
11
11
|
has_yard: true
|
12
12
|
|
13
13
|
metadata:
|
14
|
-
documentation_uri: https://
|
14
|
+
documentation_uri: https://ronin-rb.dev/docs/ronin-web-spider
|
15
15
|
source_code_uri: https://github.com/ronin-rb/ronin-web-spider
|
16
16
|
bug_tracker_uri: https://github.com/ronin-rb/ronin-web-spider/issues
|
17
|
-
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/
|
17
|
+
changelog_uri: https://github.com/ronin-rb/ronin-web-spider/blob/main/ChangeLog.md
|
18
18
|
rubygems_mfa_required: 'true'
|
19
19
|
|
20
20
|
required_ruby_version: ">= 3.0.0"
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2006-
|
4
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -122,6 +122,8 @@ module Ronin
|
|
122
122
|
# The visited host names.
|
123
123
|
#
|
124
124
|
# @return [Set<String>, nil]
|
125
|
+
#
|
126
|
+
# @api public
|
125
127
|
attr_reader :visited_hosts
|
126
128
|
|
127
129
|
#
|
@@ -132,6 +134,13 @@ module Ronin
|
|
132
134
|
#
|
133
135
|
# @yieldparam [String] host
|
134
136
|
#
|
137
|
+
# @example
|
138
|
+
# spider.every_host do |host|
|
139
|
+
# puts "Spidering #{host} ..."
|
140
|
+
# end
|
141
|
+
#
|
142
|
+
# @api public
|
143
|
+
#
|
135
144
|
def every_host
|
136
145
|
@visited_hosts ||= Set.new
|
137
146
|
|
@@ -147,6 +156,8 @@ module Ronin
|
|
147
156
|
# All certificates encountered while spidering.
|
148
157
|
#
|
149
158
|
# @return [Array<Ronin::Support::Crypto::Cert>]
|
159
|
+
#
|
160
|
+
# @api public
|
150
161
|
attr_reader :collected_certs
|
151
162
|
|
152
163
|
#
|
@@ -157,6 +168,13 @@ module Ronin
|
|
157
168
|
#
|
158
169
|
# @yieldparam [Ronin::Support::Crypto::Cert]
|
159
170
|
#
|
171
|
+
# @example
|
172
|
+
# spider.every_cert do |cert|
|
173
|
+
# puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
|
174
|
+
# end
|
175
|
+
#
|
176
|
+
# @api public
|
177
|
+
#
|
160
178
|
def every_cert
|
161
179
|
@collected_certs ||= []
|
162
180
|
|
@@ -185,8 +203,15 @@ module Ronin
|
|
185
203
|
# @yieldparam [Spidr::Page] favicon
|
186
204
|
# An encountered `.ico` file.
|
187
205
|
#
|
206
|
+
# @example
|
207
|
+
# spider.every_favicon do |page|
|
208
|
+
# # ...
|
209
|
+
# end
|
210
|
+
#
|
188
211
|
# @see https://rubydoc.info/gems/spidr/Spidr/Page
|
189
212
|
#
|
213
|
+
# @api public
|
214
|
+
#
|
190
215
|
def every_favicon
|
191
216
|
every_page do |page|
|
192
217
|
yield page if page.icon?
|
@@ -197,12 +222,19 @@ module Ronin
|
|
197
222
|
# Passes every non-empty HTML comment to the given block.
|
198
223
|
#
|
199
224
|
# @yield [comment]
|
200
|
-
# The given block will be
|
225
|
+
# The given block will be passed every HTML comment.
|
201
226
|
#
|
202
227
|
# @yieldparam [String] comment
|
203
228
|
# The HTML comment inner text, with leading and trailing whitespace
|
204
229
|
# stripped.
|
205
230
|
#
|
231
|
+
# @example
|
232
|
+
# spider.every_html_comment do |comment|
|
233
|
+
# puts comment
|
234
|
+
# end
|
235
|
+
#
|
236
|
+
# @api public
|
237
|
+
#
|
206
238
|
def every_html_comment
|
207
239
|
every_html_page do |page|
|
208
240
|
page.doc.xpath('//comment()').each do |comment|
|
@@ -224,6 +256,13 @@ module Ronin
|
|
224
256
|
# @yieldparam [String] js
|
225
257
|
# The JavaScript source code.
|
226
258
|
#
|
259
|
+
# @example
|
260
|
+
# spider.every_javascript do |js|
|
261
|
+
# puts js
|
262
|
+
# end
|
263
|
+
#
|
264
|
+
# @api public
|
265
|
+
#
|
227
266
|
def every_javascript
|
228
267
|
# yield inner text of every `<script type="text/javascript">` tag
|
229
268
|
# and every `.js` URL.
|
@@ -252,6 +291,13 @@ module Ronin
|
|
252
291
|
# @yieldparam [String] string
|
253
292
|
# The parsed contents of a JavaScript string.
|
254
293
|
#
|
294
|
+
# @example
|
295
|
+
# spider.every_javascript_string do |str|
|
296
|
+
# puts str
|
297
|
+
# end
|
298
|
+
#
|
299
|
+
# @api public
|
300
|
+
#
|
255
301
|
def every_javascript_string
|
256
302
|
every_javascript do |js|
|
257
303
|
js.scan(Support::Text::Patterns::STRING) do |js_string|
|
@@ -271,6 +317,13 @@ module Ronin
|
|
271
317
|
# @yieldparam [String] comment
|
272
318
|
# The contents of a JavaScript comment.
|
273
319
|
#
|
320
|
+
# @example
|
321
|
+
# spider.every_javascript_comment do |comment|
|
322
|
+
# puts comment
|
323
|
+
# end
|
324
|
+
#
|
325
|
+
# @api public
|
326
|
+
#
|
274
327
|
def every_javascript_comment(&block)
|
275
328
|
every_javascript do |js|
|
276
329
|
js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
|
@@ -288,9 +341,16 @@ module Ronin
|
|
288
341
|
# @yieldparam [String] comment
|
289
342
|
# The contents of a HTML or JavaScript comment.
|
290
343
|
#
|
344
|
+
# @example
|
345
|
+
# spider.every_comment do |comment|
|
346
|
+
# puts comment
|
347
|
+
# end
|
348
|
+
#
|
291
349
|
# @see #every_html_comment
|
292
350
|
# @see #every_javascript_comment
|
293
351
|
#
|
352
|
+
# @api public
|
353
|
+
#
|
294
354
|
def every_comment(&block)
|
295
355
|
every_html_comment(&block)
|
296
356
|
every_javascript_comment(&block)
|
@@ -29,6 +29,9 @@ module Ronin
|
|
29
29
|
#
|
30
30
|
# Spider a host and archive every web page:
|
31
31
|
#
|
32
|
+
# require 'ronin/web/spider'
|
33
|
+
# require 'ronin/web/spider/archive'
|
34
|
+
#
|
32
35
|
# Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
|
33
36
|
# Ronin::Web::Spider.every_page(host: 'example.com') do |page|
|
34
37
|
# archive.write(page.url,page.body)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2006-
|
4
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -30,8 +30,8 @@ module Ronin
|
|
30
30
|
#
|
31
31
|
# Spider a host and archive every web page to a Git repository:
|
32
32
|
#
|
33
|
-
# require 'ronin/web/spider/git_archive'
|
34
33
|
# require 'ronin/web/spider'
|
34
|
+
# require 'ronin/web/spider/git_archive'
|
35
35
|
# require 'date'
|
36
36
|
#
|
37
37
|
# Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# ronin-web-spider - A collection of common web spidering routines.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2006-
|
4
|
+
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU Lesser General Public License as published
|
@@ -21,7 +21,7 @@ module Ronin
|
|
21
21
|
module Web
|
22
22
|
module Spider
|
23
23
|
# ronin-web-spider version
|
24
|
-
VERSION = '0.1.0
|
24
|
+
VERSION = '0.1.0'
|
25
25
|
end
|
26
26
|
end
|
27
27
|
end
|