ronin-web-spider 0.1.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,302 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
+ require 'spidr/agent'
21
+
22
+ require 'ronin/support/network/http'
23
+ require 'ronin/support/crypto/cert'
24
+ require 'ronin/support/text/patterns/source_code'
25
+ require 'ronin/support/encoding/js'
26
+
27
+ module Ronin
28
+ module Web
29
+ module Spider
30
+ #
31
+ # Extends [Spidr::Agent](https://rubydoc.info/gems/spidr/Agent).
32
+ #
33
+ class Agent < Spidr::Agent
34
+
35
+ #
36
+ # Creates a new Spider object.
37
+ #
38
+ # @param [Spidr::Proxy, Addressable::URI, URI::HTTP, Hash, String, nil] proxy
39
+ # The proxy to use while spidering.
40
+ #
41
+ # @param [String, nil] user_agent
42
+ # The User-Agent string to send.
43
+ #
44
+ # @param [Hash{Symbol => Object}] kwargs
45
+ # Additional keyword arguments for `Spidr::Agent#initialize`.
46
+ #
47
+ # @option kwargs [String, nil] :referer
48
+ # The referer URL to send.
49
+ #
50
+ # @option kwargs [Integer] :delay (0)
51
+ # Duration in seconds to pause between spidering each link.
52
+ #
53
+ # @option kwargs [Array] :schemes (['http', 'https'])
54
+ # The list of acceptable URI schemes to visit.
55
+ # The `https` scheme will be ignored if `net/https` cannot be
56
+ # loaded.
57
+ #
58
+ # @option kwargs [String, nil] :host
59
+ # The host-name to visit.
60
+ #
61
+ # @option kwargs [Array<String, Regexp, Proc>] :hosts
62
+ # The patterns which match the host-names to visit.
63
+ #
64
+ # @option kwargs [Array<String, Regexp, Proc>] :ignore_hosts
65
+ # The patterns which match the host-names to not visit.
66
+ #
67
+ # @option kwargs [Array<Integer, Regexp, Proc>] :ports
68
+ # The patterns which match the ports to visit.
69
+ #
70
+ # @option kwargs [Array<Integer, Regexp, Proc>] :ignore_ports
71
+ # The patterns which match the ports to not visit.
72
+ #
73
+ # @option kwargs [Array<String, Regexp, Proc>] :links
74
+ # The patterns which match the links to visit.
75
+ #
76
+ # @option kwargs [Array<String, Regexp, Proc>] :ignore_links
77
+ # The patterns which match the links to not visit.
78
+ #
79
+ # @option kwargs [Array<String, Regexp, Proc>] :exts
80
+ # The patterns which match the URI path extensions to visit.
81
+ #
82
+ # @option kwargs [Array<String, Regexp, Proc>] :ignore_exts
83
+ # The patterns which match the URI path extensions to not visit.
84
+ #
85
+ # @yield [agent]
86
+ # If a block is given, it will be passed the newly created web spider
87
+ # agent.
88
+ #
89
+ # @yieldparam [Agent] agent
90
+ # The newly created web spider agent.
91
+ #
92
+ # @see https://rubydoc.info/gems/spidr/Spidr/Agent#initialize-instance_method
93
+ #
94
+ # @api public
95
+ #
96
def initialize(proxy: Support::Network::HTTP.proxy,
               user_agent: Support::Network::HTTP.user_agent,
               **kwargs,
               &block)
  # An Addressable::URI proxy is repackaged as a Spidr::Proxy built from its
  # host, port, user, and password; any other value is passed through as-is.
  if proxy.is_a?(Addressable::URI)
    proxy = Spidr::Proxy.new(
      host:     proxy.host,
      port:     proxy.port,
      user:     proxy.user,
      password: proxy.password
    )
  end

  # A Symbol is looked up in Support::Network::HTTP::UserAgents; any other
  # value is passed through as-is.
  if user_agent.is_a?(Symbol)
    user_agent = Support::Network::HTTP::UserAgents[user_agent]
  end

  super(proxy: proxy, user_agent: user_agent, **kwargs, &block)
end
121
+
122
+ # The visited host names.
123
+ #
124
+ # @return [Set<String>, nil]
125
+ attr_reader :visited_hosts
126
+
127
+ #
128
+ # Passes every unique host name that the agent visits to the given
129
+ # block and populates {#visited_hosts}.
130
+ #
131
+ # @yield [host]
132
+ #
133
+ # @yieldparam [String] host
134
+ #
135
def every_host
  @visited_hosts ||= Set.new

  every_page do |page|
    host_name = page.url.host

    # Set#add? returns nil when the host was already recorded, so each host
    # name is yielded at most once.
    yield host_name if @visited_hosts.add?(host_name)
  end
end
146
+
147
+ # All certificates encountered while spidering.
148
+ #
149
+ # @return [Array<Ronin::Support::Crypto::Cert>]
150
+ attr_reader :collected_certs
151
+
152
+ #
153
+ # Passes every unique TLS certificate to the given block and populates
154
+ # {#collected_certs}.
155
+ #
156
+ # @yield [cert]
157
+ #
158
+ # @yieldparam [Ronin::Support::Crypto::Cert] cert
159
+ #
160
def every_cert
  @collected_certs ||= []

  # Serial numbers already seen during this call; used to skip duplicates.
  seen_serials = Set.new

  every_page do |page|
    # only pages fetched over https have a peer certificate to inspect
    next unless page.url.scheme == 'https'

    raw_cert = sessions[page.url].peer_cert
    next unless seen_serials.add?(raw_cert.serial)

    wrapped_cert = Support::Crypto::Cert(raw_cert)

    @collected_certs << wrapped_cert
    yield wrapped_cert
  end
end
178
+
179
+ #
180
+ # Pass every favicon from every page to the given block.
181
+ #
182
+ # @yield [favicon]
183
+ # The given block will be passed every encountered `.ico` file.
184
+ #
185
+ # @yieldparam [Spidr::Page] favicon
186
+ # An encountered `.ico` file.
187
+ #
188
+ # @see https://rubydoc.info/gems/spidr/Spidr/Page
189
+ #
190
def every_favicon
  every_page do |page|
    # skip every page that is not an icon
    next unless page.icon?

    yield page
  end
end
195
+
196
+ #
197
+ # Passes every non-empty HTML comment to the given block.
198
+ #
199
+ # @yield [comment]
200
+ # The given block will be passed every HTML comment.
201
+ #
202
+ # @yieldparam [String] comment
203
+ # The HTML comment inner text, with leading and trailing whitespace
204
+ # stripped.
205
+ #
206
def every_html_comment
  every_html_page do |page|
    page.doc.xpath('//comment()').each do |node|
      text = node.inner_text.strip

      # whitespace-only comments are not reported
      next if text.empty?

      yield text
    end
  end
end
217
+
218
+ #
219
+ # Passes every piece of JavaScript to the given block.
220
+ #
221
+ # @yield [js]
222
+ # The given block will be passed every piece of JavaScript source.
223
+ #
224
+ # @yieldparam [String] js
225
+ # The JavaScript source code.
226
+ #
227
def every_javascript
  # Inline scripts. A `<script>` tag with no `type` attribute is JavaScript
  # by default in HTML, so match those in addition to the explicit
  # `type="text/javascript"` form. Other types (ex: JSON data blocks) are
  # still skipped.
  every_html_page do |page|
    page.doc.xpath('//script[not(@type) or @type="text/javascript"]').each do |script|
      source = script.inner_text

      # tags with no inline source (ex: only a `src` attribute) are skipped
      yield source unless source.empty?
    end
  end

  # The body of every page passed to every_javascript_page (every `.js` URL).
  every_javascript_page do |page|
    yield page.body
  end
end
242
+
243
+ alias every_js every_javascript
244
+
245
+ #
246
+ # Passes every JavaScript string value to the given block.
247
+ #
248
+ # @yield [string]
249
+ # The given block will be passed each JavaScript string with the quote
250
+ # marks removed.
251
+ #
252
+ # @yieldparam [String] string
253
+ # The parsed contents of a JavaScript string.
254
+ #
255
def every_javascript_string
  every_javascript do |source|
    # collect every string literal match, then yield each one unquoted
    source.scan(Support::Text::Patterns::STRING).each do |literal|
      yield Support::Encoding::JS.unquote(literal)
    end
  end
end
262
+
263
+ alias every_js_string every_javascript_string
264
+
265
+ #
266
+ # Passes every JavaScript comment to the given block.
267
+ #
268
+ # @yield [comment]
269
+ # The given block will be passed each JavaScript comment.
270
+ #
271
+ # @yieldparam [String] comment
272
+ # The contents of a JavaScript comment.
273
+ #
274
def every_javascript_comment(&block)
  every_javascript do |source|
    # String#scan passes each match of the comment pattern to the block
    source.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT, &block)
  end
end
279
+
280
+ alias every_js_comment every_javascript_comment
281
+
282
+ #
283
+ # Passes every HTML and JavaScript comment to the given block.
284
+ #
285
+ # @yield [comment]
286
+ # The given block will be passed each HTML or JavaScript comment.
287
+ #
288
+ # @yieldparam [String] comment
289
+ # The contents of a HTML or JavaScript comment.
290
+ #
291
+ # @see #every_html_comment
292
+ # @see #every_javascript_comment
293
+ #
294
def every_comment(&block)
  # The same block is handed to both methods: HTML comments first, then
  # JavaScript comments.
  every_html_comment(&block)
  every_javascript_comment(&block)
end
298
+
299
+ end
300
+ end
301
+ end
302
+ end
@@ -0,0 +1,116 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
+ require 'fileutils'
21
+
22
module Ronin
  module Web
    module Spider
      #
      # Represents a web archive directory.
      #
      # ## Example
      #
      # Spider a host and archive every web page:
      #
      #     Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
      #       Ronin::Web::Spider.every_page(host: 'example.com') do |page|
      #         archive.write(page.url,page.body)
      #       end
      #     end
      #
      class Archive

        # The path to the archive root directory.
        #
        # @return [String]
        attr_reader :root

        #
        # Initializes the archive.
        #
        # @param [String] root
        #   The path to the root directory. It is expanded to an absolute
        #   path.
        #
        def initialize(root)
          @root = File.expand_path(root)
        end

        #
        # Creates the archive and the archive's directory, if it does not
        # already exist.
        #
        # @param [String] root
        #   The path to the new archive.
        #
        # @yield [archive]
        #   If a block is given, it will be passed the newly created archive.
        #
        # @yieldparam [Archive] archive
        #   The newly created archive.
        #
        # @return [Archive]
        #   The newly created archive.
        #
        def self.open(root)
          archive = new(root)

          FileUtils.mkdir_p(archive.root)

          yield archive if block_given?
          archive
        end

        #
        # Archives a webpage.
        #
        # The file path is derived from the URL's request URI (path plus any
        # query string). A request URI that is empty or ends with `/` is
        # saved as `index.html` within that directory.
        #
        # @param [URI::HTTP] url
        #   The URL of the response.
        #
        # @param [String] body
        #   The response body to save.
        #
        # @return [String]
        #   The full path to the archived page.
        #
        # @raise [ArgumentError]
        #   The URL's path does not resolve to a file inside of the archive
        #   root directory (ex: it contains `..` segments that climb out of
        #   the root).
        #
        def write(url,body)
          # drop the leading '/' so the request URI is relative to the root
          relative_path = url.request_uri[1..]

          if relative_path.empty? || relative_path.end_with?('/')
            relative_path += 'index.html'
          end

          # Join before expanding so that a leading '~' in the URL path is
          # never treated as a home directory; expanding resolves any '.'
          # and '..' segments so the containment check below is reliable.
          absolute_path = File.expand_path(File.join(@root,relative_path))

          # URLs come from spidered (untrusted) content; refuse any path
          # that would be written outside of the archive root.
          unless absolute_path.start_with?(File.join(@root,''))
            raise(ArgumentError,"URL does not map to a path inside the archive root: #{url}")
          end

          FileUtils.mkdir_p(File.dirname(absolute_path))
          File.write(absolute_path,body)
          absolute_path
        end

        #
        # Converts the archive to a String.
        #
        # @return [String]
        #   The path of the archive directory.
        #
        def to_s
          @root
        end

      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
module Ronin
  module Web
    module Spider
      #
      # Raised when a `git` command fails.
      #
      class GitError < RuntimeError; end

      #
      # Raised when `git` is not installed. It is a kind of {GitError}, so
      # rescuing {GitError} also catches it.
      #
      class GitNotInstalled < GitError; end
    end
  end
end
@@ -0,0 +1,194 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
+ require 'ronin/web/spider/archive'
21
+ require 'ronin/web/spider/exceptions'
22
+
23
+ module Ronin
24
+ module Web
25
+ module Spider
26
+ #
27
+ # Represents a web archive directory that is backed by Git.
28
+ #
29
+ # ## Example
30
+ #
31
+ # Spider a host and archive every web page to a Git repository:
32
+ #
33
+ # require 'ronin/web/spider/git_archive'
34
+ # require 'ronin/web/spider'
35
+ # require 'date'
36
+ #
37
+ # Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
38
+ # archive.commit("Updated #{Date.today}") do
39
+ # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
40
+ # archive.write(page.url,page.body)
41
+ # end
42
+ # end
43
+ # end
44
+ #
45
+ class GitArchive < Archive
46
+
47
+ #
48
+ # Creates the Git archive, if it does not already exist.
49
+ #
50
+ # @param [String] root
51
+ # The path to the new Git archive.
52
+ #
53
+ # @yield [archive]
54
+ # If a block is given, it will be passed the newly created Git
55
+ # archive.
56
+ #
57
+ # @yieldparam [GitArchive] archive
58
+ # The newly created Git archive.
59
+ #
60
+ # @return [GitArchive]
61
+ # The newly created Git archive.
62
+ #
63
def self.open(root, &block)
  super(root) do |archive|
    # only run `git init` when the root has no `.git` directory yet
    archive.init unless archive.git?

    block&.call(archive)
  end
end
70
+
71
+ #
72
+ # Determines if the git repository has been initialized.
73
+ #
74
+ # @return [Boolean]
75
+ #
76
def git?
  # the repository counts as initialized once `<root>/.git` is a directory
  git_dir = File.join(@root,'.git')

  File.directory?(git_dir)
end
79
+
80
+ #
81
+ # Initializes the Git repository.
82
+ #
83
+ # @return [true]
84
+ # Indicates the Git repository was successfully initialized.
85
+ #
86
+ # @raise [GitError]
87
+ # Indicates that the `git` command exited with an error.
88
+ #
89
+ # @raise [GitNotInstalled]
90
+ # Indicates that `git` was not installed or could not be found in the
91
+ # `$PATH` environment variable.
92
+ #
93
def init
  # runs `git init` through the private #git helper
  git 'init'
end
96
+
97
+ #
98
+ # Saves a webpage to the Git archive.
99
+ #
100
+ # @param [URI::HTTP] url
101
+ # The URL of the response.
102
+ #
103
+ # @param [String] body
104
+ # The response body to save.
105
+ #
106
+ # @return [String]
107
+ # The full path to the archived page.
108
+ #
109
+ # @raise [GitError]
110
+ # Indicates that the `git` command exited with an error.
111
+ #
112
+ # @raise [GitNotInstalled]
113
+ # Indicates that `git` was not installed or could not be found in the
114
+ # `$PATH` environment variable.
115
+ #
116
def write(url,body)
  # Archive#write saves the file and returns its path; stage that path with
  # `git add` and hand the same path back to the caller.
  super(url,body).tap do |saved_path|
    git('add',saved_path)
  end
end
122
+
123
+ #
124
+ # Commits changes to the Git archive.
125
+ #
126
+ # @param [String] message
127
+ # The commit message.
128
+ #
129
+ # @yield [self]
130
+ # If a block is given it will be called before committing any changes.
131
+ #
132
+ # @return [true]
133
+ # Indicates whether the changes were successfully committed.
134
+ #
135
+ # @raise [GitError]
136
+ # Indicates the `git` command exited with an error.
137
+ #
138
+ # @raise [GitNotInstalled]
139
+ # Indicates that `git` was not installed or could not be found in the
140
+ # `$PATH` environment variable.
141
+ #
142
+ # @example
143
+ # archive.write(url,response.body)
144
+ # archive.commit "Updated #{Date.today}"
145
+ #
146
+ # @example with a block:
147
+ # archive.commit("Updated #{Date.today}") do
148
+ # Ronin::Web::Spider.every_page(host: 'example.com') do |page|
149
+ # archive.write(page.url,page.body)
150
+ # end
151
+ # end
152
+ #
153
def commit(message)
  # the optional block runs first, so anything it writes is part of the commit
  yield(self) if block_given?

  commit_args = ['commit', '-m', message.to_s]
  git(*commit_args)
end
158
+
159
+ private
160
+
161
+ #
162
+ # Executes a `git` command in the archive root directory.
163
+ #
164
+ # @param [Array<String>] args
165
+ # Additional arguments for the `git` command.
166
+ #
167
+ # @return [true]
168
+ # Indicates that the `git` command executed successfully.
169
+ #
170
+ # @raise [GitError]
171
+ # Indicates that the `git` command exited with an error.
172
+ #
173
+ # @raise [GitNotInstalled]
174
+ # Indicates that `git` was not installed or could not be found in the
175
+ # `$PATH` environment variable.
176
+ #
177
def git(*args)
  # `-C <root>` makes git run as if it had been started in the archive root
  command = ['git', '-C', @root, *args]
  status  = system(*command)

  return true if status

  # Kernel#system returns nil when the command could not be executed at all
  # and false when it ran but exited with a non-zero status.
  raise(GitNotInstalled,"the git command was not found") if status.nil?

  raise(GitError,"git command failed: #{command.join(' ')}")
end
190
+
191
+ end
192
+ end
193
+ end
194
+ end
@@ -0,0 +1,27 @@
1
+ #
2
+ # ronin-web-spider - A collection of common web spidering routines.
3
+ #
4
+ # Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
5
+ #
6
+ # ronin-web-spider is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published
8
+ # by the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # ronin-web-spider is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public License
17
+ # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
18
+ #
19
+
20
+ module Ronin
21
+ module Web
22
+ module Spider
23
+ # The version string of the ronin-web-spider library.
24
+ VERSION = '0.1.0.beta1'
25
+ end
26
+ end
27
+ end