ronin-web-spider 0.1.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,302 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
+ require 'spidr/agent'
21
+
22
+ require 'ronin/support/network/http'
23
+ require 'ronin/support/crypto/cert'
24
+ require 'ronin/support/text/patterns/source_code'
25
+ require 'ronin/support/encoding/js'
26
+
27
+ module Ronin
28
+ module Web
29
+ module Spider
30
+ #
31
+ # Extends [Spidr::Agent](https://rubydoc.info/gems/spidr/Agent).
32
+ #
33
class Agent < Spidr::Agent

  #
  # Creates a new Spider object.
  #
  # @param [Spidr::Proxy, Addressable::URI, URI::HTTP, Hash, String, nil] proxy
  #   The proxy to use while spidering. An `Addressable::URI` is converted
  #   into a `Spidr::Proxy`; every other value is passed through as-is.
  #
  # @param [String, Symbol, nil] user_agent
  #   The User-Agent string to send. A Symbol is looked up in
  #   `Support::Network::HTTP::UserAgents`.
  #
  # @param [Hash{Symbol => Object}] kwargs
  #   Additional keyword arguments for `Spidr::Agent#initialize`.
  #
  # @option kwargs [String, nil] :referer
  #   The referer URL to send.
  #
  # @option kwargs [Integer] :delay (0)
  #   Duration in seconds to pause between spidering each link.
  #
  # @option kwargs [Array] :schemes (['http', 'https'])
  #   The list of acceptable URI schemes to visit.
  #   The `https` scheme will be ignored if `net/https` cannot be
  #   loaded.
  #
  # @option kwargs [String, nil] :host
  #   The host-name to visit.
  #
  # @option kwargs [Array<String, Regexp, Proc>] :hosts
  #   The patterns which match the host-names to visit.
  #
  # @option kwargs [Array<String, Regexp, Proc>] :ignore_hosts
  #   The patterns which match the host-names to not visit.
  #
  # @option kwargs [Array<Integer, Regexp, Proc>] :ports
  #   The patterns which match the ports to visit.
  #
  # @option kwargs [Array<Integer, Regexp, Proc>] :ignore_ports
  #   The patterns which match the ports to not visit.
  #
  # @option kwargs [Array<String, Regexp, Proc>] :links
  #   The patterns which match the links to visit.
  #
  # @option kwargs [Array<String, Regexp, Proc>] :ignore_links
  #   The patterns which match the links to not visit.
  #
  # @option kwargs [Array<String, Regexp, Proc>] :exts
  #   The patterns which match the URI path extensions to visit.
  #
  # @option kwargs [Array<String, Regexp, Proc>] :ignore_exts
  #   The patterns which match the URI path extensions to not visit.
  #
  # @yield [agent]
  #   If a block is given, it will be passed the newly created web spider
  #   agent.
  #
  # @yieldparam [Agent] agent
  #   The newly created web spider agent.
  #
  # @see https://rubydoc.info/gems/spidr/Spidr/Agent#initialize-instance_method
  #
  # @api public
  #
  def initialize(proxy:      Support::Network::HTTP.proxy,
                 user_agent: Support::Network::HTTP.user_agent,
                 **kwargs,
                 &block)
    proxy = case proxy
            when Addressable::URI
              Spidr::Proxy.new(
                host:     proxy.host,
                port:     proxy.port,
                user:     proxy.user,
                password: proxy.password
              )
            else
              proxy
            end

    user_agent = case user_agent
                 when Symbol
                   Support::Network::HTTP::UserAgents[user_agent]
                 else
                   user_agent
                 end

    super(proxy: proxy, user_agent: user_agent, **kwargs, &block)
  end

  # The visited host names.
  #
  # @return [Set<String>, nil]
  #   `nil` until {#every_host} has been called.
  attr_reader :visited_hosts

  #
  # Passes every unique host name that the agent visits to the given
  # block and populates {#visited_hosts}.
  #
  # @yield [host]
  #   If a block is given, it will be passed each newly seen host name.
  #   The block may be omitted to only populate {#visited_hosts}.
  #
  # @yieldparam [String] host
  #
  def every_host
    @visited_hosts ||= Set.new

    every_page do |page|
      host = page.url.host

      # Set#add? returns nil for an already recorded host. It is evaluated
      # first so the host is recorded even when no block was given.
      yield host if @visited_hosts.add?(host) && block_given?
    end
  end

  # All certificates encountered while spidering.
  #
  # @return [Array<Ronin::Support::Crypto::Cert>, nil]
  #   `nil` until {#every_cert} has been called.
  attr_reader :collected_certs

  #
  # Passes every unique TLS certificate to the given block and populates
  # {#collected_certs}.
  #
  # @yield [cert]
  #   If a block is given, it will be passed each newly seen certificate.
  #   The block may be omitted to only populate {#collected_certs}.
  #
  # @yieldparam [Ronin::Support::Crypto::Cert] cert
  #
  def every_cert
    @collected_certs ||= []

    # Certificate serial numbers are only unique per issuing CA, so
    # uniqueness is tracked on the full DER encoding instead.
    seen_certs = Set.new

    every_page do |page|
      next unless page.url.scheme == 'https'

      cert = sessions[page.url].peer_cert

      # the session may have no peer certificate (ex: no open socket)
      next unless cert
      next unless seen_certs.add?(cert.to_der)

      cert = Support::Crypto::Cert(cert)

      @collected_certs << cert
      yield cert if block_given?
    end
  end

  #
  # Pass every favicon from every page to the given block.
  #
  # @yield [favicon]
  #   The given block will be passed every encountered `.ico` file.
  #
  # @yieldparam [Spidr::Page] favicon
  #   An encountered `.ico` file.
  #
  # @see https://rubydoc.info/gems/spidr/Spidr/Page
  #
  def every_favicon
    every_page do |page|
      yield page if page.icon?
    end
  end

  #
  # Passes every non-empty HTML comment to the given block.
  #
  # @yield [comment]
  #   The given block will be passed every HTML comment.
  #
  # @yieldparam [String] comment
  #   The HTML comment inner text, with leading and trailing whitespace
  #   stripped.
  #
  def every_html_comment
    every_html_page do |page|
      page.doc.xpath('//comment()').each do |comment|
        comment_text = comment.inner_text.strip

        yield comment_text unless comment_text.empty?
      end
    end
  end

  #
  # Passes every piece of JavaScript to the given block.
  #
  # Inline JavaScript is taken from `<script>` tags that either have no
  # `type` attribute (JavaScript is the HTML default) or whose `type` is
  # `text/javascript`, `application/javascript`, or `module`. Other
  # `<script>` types (ex: `application/json`) are skipped. The body of
  # every spidered JavaScript page is also passed to the block.
  #
  # @yield [js]
  #   The given block will be passed every piece of JavaScript source.
  #
  # @yieldparam [String] js
  #   The JavaScript source code.
  #
  def every_javascript
    javascript_tags = '//script[not(@type) or ' \
                      '@type="text/javascript" or ' \
                      '@type="application/javascript" or ' \
                      '@type="module"]'

    every_html_page do |page|
      page.doc.xpath(javascript_tags).each do |script|
        source = script.inner_text

        # `<script src="...">` tags have no inline source
        yield source unless source.empty?
      end
    end

    every_javascript_page do |page|
      yield page.body
    end
  end

  alias every_js every_javascript

  #
  # Passes every JavaScript string value to the given block.
  #
  # @yield [string]
  #   The given block will be passed each JavaScript string with the quote
  #   marks removed.
  #
  # @yieldparam [String] string
  #   The parsed contents of a JavaScript string.
  #
  def every_javascript_string
    every_javascript do |js|
      js.scan(Support::Text::Patterns::STRING) do |js_string|
        yield Support::Encoding::JS.unquote(js_string)
      end
    end
  end

  alias every_js_string every_javascript_string

  #
  # Passes every JavaScript comment to the given block.
  #
  # @yield [comment]
  #   The given block will be passed each JavaScript comment.
  #
  # @yieldparam [String] comment
  #   The contents of a JavaScript comment.
  #
  def every_javascript_comment(&block)
    every_javascript do |js|
      js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT, &block)
    end
  end

  alias every_js_comment every_javascript_comment

  #
  # Passes every HTML and JavaScript comment to the given block.
  #
  # @yield [comment]
  #   The given block will be passed each HTML or JavaScript comment.
  #
  # @yieldparam [String] comment
  #   The contents of a HTML or JavaScript comment.
  #
  # @see #every_html_comment
  # @see #every_javascript_comment
  #
  def every_comment(&block)
    every_html_comment(&block)
    every_javascript_comment(&block)
  end

end
300
+ end
301
+ end
302
+ end
@@ -0,0 +1,116 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
+ require 'fileutils'
21
+
22
+ module Ronin
23
+ module Web
24
+ module Spider
25
+ #
26
+ # Represents a web archive directory.
27
+ #
28
+ # ## Example
29
+ #
30
+ # Spider a host and archive every web page:
31
+ #
32
+ # Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
33
+ # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
34
+ # archive.write(page.url,page.body)
35
+ # end
36
+ # end
37
+ #
38
class Archive

  # The path to the archive root directory.
  #
  # @return [String]
  attr_reader :root

  #
  # Initializes the archive.
  #
  # @param [String] root
  #   The path to the root directory. It is expanded into an absolute
  #   path.
  #
  def initialize(root)
    @root = File.expand_path(root)
  end

  #
  # Creates the archive and the archive's directory, if it does not
  # already exist.
  #
  # @param [String] root
  #   The path to the new archive.
  #
  # @yield [archive]
  #   If a block is given, it will be passed the newly created archive.
  #
  # @yieldparam [Archive] archive
  #   The newly created archive.
  #
  # @return [Archive]
  #   The newly created archive.
  #
  def self.open(root)
    archive = new(root)

    FileUtils.mkdir_p(archive.root)

    yield archive if block_given?
    return archive
  end

  #
  # Archives a webpage.
  #
  # The file path is derived from the URL's request URI (path and query
  # string). A request URI ending in `/` is saved as `index.html` within
  # that directory.
  #
  # @param [URI::HTTP] url
  #   The URL of the response.
  #
  # @param [String] body
  #   The response body to save.
  #
  # @return [String]
  #   The full, normalized path to the archived page.
  #
  # @raise [ArgumentError]
  #   The URL's request URI resolves to a path outside of the archive's
  #   root directory (ex: it contains `/../` sequences).
  #
  def write(url,body)
    joined_path = File.join(@root,url.request_uri[1..])
    joined_path << 'index.html' if joined_path.end_with?('/')

    # The URL comes from spidered (untrusted) content. Collapse any `..`
    # segments and make sure the result still lives under the root.
    # NOTE: the already joined path is expanded, instead of passing the
    # relative path to File.expand_path, so that a leading `~` in the URL
    # path (ex: `/~user/`) is never treated as a home directory.
    absolute_path = File.expand_path(joined_path)

    unless absolute_path.start_with?(File.join(@root,''))
      raise(ArgumentError,"URL resolves to a path outside of the archive: #{url}")
    end

    FileUtils.mkdir_p(File.dirname(absolute_path))
    File.write(absolute_path,body)
    return absolute_path
  end

  #
  # Converts the archive to a String.
  #
  # @return [String]
  #   The path of the archive directory.
  #
  def to_s
    @root
  end

end
114
+ end
115
+ end
116
+ end
@@ -0,0 +1,36 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
+ module Ronin
21
+ module Web
22
+ module Spider
23
+ #
24
+ # An exception class for when a `git` command fails.
25
+ #
26
class GitError < RuntimeError; end

#
# Raised when the `git` command is not installed. Since it inherits from
# GitError, rescuing GitError also rescues this exception.
#
class GitNotInstalled < GitError; end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,194 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
+ require 'ronin/web/spider/archive'
21
+ require 'ronin/web/spider/exceptions'
22
+
23
+ module Ronin
24
+ module Web
25
+ module Spider
26
+ #
27
+ # Represents a web archive directory that is backed by Git.
28
+ #
29
+ # ## Example
30
+ #
31
+ # Spider a host and archive every web page to a Git repository:
32
+ #
33
+ # require 'ronin/web/spider/git_archive'
34
+ # require 'ronin/web/spider'
35
+ # require 'date'
36
+ #
37
+ # Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
38
+ # archive.commit("Updated #{Date.today}") do
39
+ # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
40
+ # archive.write(page.url,page.body)
41
+ # end
42
+ # end
43
+ # end
44
+ #
45
class GitArchive < Archive

  #
  # Opens the Git archive, creating the directory and initializing the
  # Git repository when they do not exist yet.
  #
  # @param [String] root
  #   The path to the Git archive.
  #
  # @yield [archive]
  #   If a block is given, it will be passed the opened Git archive.
  #
  # @yieldparam [GitArchive] archive
  #   The opened Git archive.
  #
  # @return [GitArchive]
  #   The opened Git archive.
  #
  def self.open(root)
    super(root) do |git_archive|
      git_archive.git? || git_archive.init

      yield git_archive if block_given?
    end
  end

  #
  # Checks whether the archive already contains a `.git` directory.
  #
  # @return [Boolean]
  #
  def git?
    git_dir = File.join(@root,'.git')

    File.directory?(git_dir)
  end

  #
  # Runs `git init` in the archive root directory.
  #
  # @return [true]
  #   Indicates the Git repository was successfully initialized.
  #
  # @raise [GitError]
  #   Indicates that the `git` command exited with an error.
  #
  # @raise [GitNotInstalled]
  #   Indicates that `git` was not installed or could not be found in the
  #   `$PATH` environment variable.
  #
  def init
    git('init')
  end

  #
  # Writes a webpage into the archive and stages it with `git add`.
  #
  # @param [URI::HTTP] url
  #   The URL of the response.
  #
  # @param [String] body
  #   The response body to save.
  #
  # @return [String]
  #   The full path to the archived page.
  #
  # @raise [GitError]
  #   Indicates that the `git` command exited with an error.
  #
  # @raise [GitNotInstalled]
  #   Indicates that `git` was not installed or could not be found in the
  #   `$PATH` environment variable.
  #
  def write(url,body)
    super(url,body).tap do |saved_path|
      git('add',saved_path)
    end
  end

  #
  # Commits the staged changes to the Git archive.
  #
  # @param [#to_s] message
  #   The commit message.
  #
  # @yield [self]
  #   If a block is given it will be called before committing any changes.
  #
  # @return [true]
  #   Indicates that the changes were successfully committed.
  #
  # @raise [GitError]
  #   Indicates the `git` command exited with an error.
  #
  # @raise [GitNotInstalled]
  #   Indicates that `git` was not installed or could not be found in the
  #   `$PATH` environment variable.
  #
  # @example
  #   archive.write(url,response.body)
  #   archive.commit "Updated #{Date.today}"
  #
  # @example with a block:
  #   archive.commit("Updated #{Date.today}") do
  #     Ronin::Web::Spider.every_page(host: 'example.com') do |page|
  #       archive.write(page.url,page.body)
  #     end
  #   end
  #
  def commit(message)
    yield(self) if block_given?

    commit_message = message.to_s
    git('commit','-m',commit_message)
  end

  private

  #
  # Executes a `git` command in the archive root directory.
  #
  # @param [Array<String>] args
  #   Additional arguments for the `git` command.
  #
  # @return [true]
  #   Indicates that the `git` command executed successfully.
  #
  # @raise [GitError]
  #   Indicates that the `git` command exited with an error.
  #
  # @raise [GitNotInstalled]
  #   Indicates that `git` was not installed or could not be found in the
  #   `$PATH` environment variable.
  #
  def git(*args)
    # `-C` makes git run as if it was started in the archive root
    command = ['git', '-C', @root, *args]
    status  = system(*command)

    # Kernel#system returns nil when the command could not be executed
    # and false when it exited with a non-zero status.
    if status.nil?
      raise(GitNotInstalled,"the git command was not found")
    elsif status == false
      raise(GitError,"git command failed: #{command.join(' ')}")
    end

    true
  end

end
192
+ end
193
+ end
194
+ end
@@ -0,0 +1,27 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
module Ronin
  module Web
    module Spider
      # ronin-web-spider version (frozen so the constant cannot be mutated
      # in place).
      VERSION = '0.1.0.beta1'.freeze
    end
  end
end