ronin-web-spider 0.1.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.github/workflows/ruby.yml +31 -0
- data/.gitignore +13 -0
- data/.rspec +1 -0
- data/.ruby-version +1 -0
- data/.yardopts +1 -0
- data/COPYING.txt +165 -0
- data/ChangeLog.md +19 -0
- data/Gemfile +31 -0
- data/README.md +139 -0
- data/Rakefile +31 -0
- data/gemspec.yml +27 -0
- data/lib/ronin/web/spider/agent.rb +302 -0
- data/lib/ronin/web/spider/archive.rb +116 -0
- data/lib/ronin/web/spider/exceptions.rb +36 -0
- data/lib/ronin/web/spider/git_archive.rb +194 -0
- data/lib/ronin/web/spider/version.rb +27 -0
- data/lib/ronin/web/spider.rb +115 -0
- data/ronin-web-spider.gemspec +61 -0
- data/spec/agent_spec.rb +585 -0
- data/spec/archive_spec.rb +91 -0
- data/spec/example_app.rb +27 -0
- data/spec/git_archive_spec.rb +137 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spider_spec.rb +252 -0
- metadata +122 -0
@@ -0,0 +1,302 @@
|
|
1
|
+
#
|
2
|
+
# ronin-web-spider - A collection of common web spidering routines.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as published
|
8
|
+
# by the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# ronin-web-spider is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public License
|
17
|
+
# along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
|
18
|
+
#
|
19
|
+
|
20
|
+
require 'spidr/agent'
|
21
|
+
|
22
|
+
require 'ronin/support/network/http'
|
23
|
+
require 'ronin/support/crypto/cert'
|
24
|
+
require 'ronin/support/text/patterns/source_code'
|
25
|
+
require 'ronin/support/encoding/js'
|
26
|
+
|
27
|
+
module Ronin
  module Web
    module Spider
      #
      # Extends [Spidr::Agent](https://rubydoc.info/gems/spidr/Spidr/Agent)
      # with additional `every_*` methods for hosts, TLS certificates,
      # favicons, HTML comments, and JavaScript source, strings and comments.
      #
      class Agent < Spidr::Agent

        #
        # Creates a new Spider object.
        #
        # @param [Spidr::Proxy, Addressable::URI, URI::HTTP, Hash, String, nil] proxy
        #   The proxy to use while spidering. An `Addressable::URI` is
        #   converted into a `Spidr::Proxy`; any other value is passed to
        #   `Spidr::Agent#initialize` unchanged.
        #
        # @param [String, Symbol, nil] user_agent
        #   The User-Agent string to send. A Symbol is looked up in
        #   `Ronin::Support::Network::HTTP::UserAgents`.
        #
        # @param [Hash{Symbol => Object}] kwargs
        #   Additional keyword arguments for `Spidr::Agent#initialize`.
        #
        # @option kwargs [String, nil] :referer
        #   The referer URL to send.
        #
        # @option kwargs [Integer] :delay (0)
        #   Duration in seconds to pause between spidering each link.
        #
        # @option kwargs [Array] :schemes (['http', 'https'])
        #   The list of acceptable URI schemes to visit.
        #   The `https` scheme will be ignored if `net/https` cannot be
        #   loaded.
        #
        # @option kwargs [String, nil] :host
        #   The host-name to visit.
        #
        # @option kwargs [Array<String, Regexp, Proc>] :hosts
        #   The patterns which match the host-names to visit.
        #
        # @option kwargs [Array<String, Regexp, Proc>] :ignore_hosts
        #   The patterns which match the host-names to not visit.
        #
        # @option kwargs [Array<Integer, Regexp, Proc>] :ports
        #   The patterns which match the ports to visit.
        #
        # @option kwargs [Array<Integer, Regexp, Proc>] :ignore_ports
        #   The patterns which match the ports to not visit.
        #
        # @option kwargs [Array<String, Regexp, Proc>] :links
        #   The patterns which match the links to visit.
        #
        # @option kwargs [Array<String, Regexp, Proc>] :ignore_links
        #   The patterns which match the links to not visit.
        #
        # @option kwargs [Array<String, Regexp, Proc>] :exts
        #   The patterns which match the URI path extensions to visit.
        #
        # @option kwargs [Array<String, Regexp, Proc>] :ignore_exts
        #   The patterns which match the URI path extensions to not visit.
        #
        # @yield [agent]
        #   If a block is given, it will be passed the newly created web spider
        #   agent.
        #
        # @yieldparam [Agent] agent
        #   The newly created web spider agent.
        #
        # @see https://rubydoc.info/gems/spidr/Spidr/Agent#initialize-instance_method
        #
        # @api public
        #
        def initialize(proxy:      Support::Network::HTTP.proxy,
                       user_agent: Support::Network::HTTP.user_agent,
                       **kwargs,
                       &block)
          proxy = case proxy
                  when Addressable::URI
                    Spidr::Proxy.new(
                      host:     proxy.host,
                      port:     proxy.port,
                      user:     proxy.user,
                      password: proxy.password
                    )
                  else
                    proxy
                  end

          user_agent = case user_agent
                       when Symbol
                         Support::Network::HTTP::UserAgents[user_agent]
                       else
                         user_agent
                       end

          super(proxy: proxy, user_agent: user_agent, **kwargs,&block)
        end

        # The visited host names.
        #
        # @return [Set<String>, nil]
        #   `nil` until {#every_host} has been called.
        attr_reader :visited_hosts

        #
        # Passes every unique host name that the agent visits to the given
        # block and populates {#visited_hosts}.
        #
        # @yield [host]
        #   The given block will be passed each host name the first time a
        #   page from that host is visited.
        #
        # @yieldparam [String] host
        #   A newly visited host name.
        #
        def every_host
          @visited_hosts ||= Set.new

          every_page do |page|
            host = page.url.host

            # Set#add? returns nil when the host was already recorded
            if @visited_hosts.add?(host)
              yield host
            end
          end
        end

        # All certificates encountered while spidering.
        #
        # @return [Array<Ronin::Support::Crypto::Cert>, nil]
        #   `nil` until {#every_cert} has been called.
        attr_reader :collected_certs

        #
        # Passes every unique TLS certificate to the given block and populates
        # {#collected_certs}.
        #
        # Only pages with an `https` URL are considered, and certificates are
        # de-duplicated by their serial number.
        #
        # @yield [cert]
        #   The given block will be passed each newly seen certificate.
        #
        # @yieldparam [Ronin::Support::Crypto::Cert] cert
        #   A TLS certificate presented by a visited `https` host.
        #
        def every_cert
          @collected_certs ||= []

          serials = Set.new

          every_page do |page|
            next unless page.url.scheme == 'https'

            # Net::HTTP#peer_cert returns nil when the session has no open
            # TLS socket; skip the page instead of failing on `nil.serial`.
            cert = sessions[page.url].peer_cert
            next unless cert && serials.add?(cert.serial)

            cert = Support::Crypto::Cert(cert)

            @collected_certs << cert
            yield cert
          end
        end

        #
        # Pass every favicon from every page to the given block.
        #
        # @yield [favicon]
        #   The given block will be passed every encountered `.ico` file.
        #
        # @yieldparam [Spidr::Page] favicon
        #   An encountered `.ico` file.
        #
        # @see https://rubydoc.info/gems/spidr/Spidr/Page
        #
        def every_favicon
          every_page do |page|
            yield page if page.icon?
          end
        end

        #
        # Passes every non-empty HTML comment to the given block.
        #
        # @yield [comment]
        #   The given block will be passed every HTML comment.
        #
        # @yieldparam [String] comment
        #   The HTML comment inner text, with leading and trailing whitespace
        #   stripped.
        #
        def every_html_comment
          every_html_page do |page|
            page.doc.xpath('//comment()').each do |comment|
              comment_text = comment.inner_text.strip

              unless comment_text.empty?
                yield comment_text
              end
            end
          end
        end

        #
        # Passes every piece of JavaScript to the given block.
        #
        # This covers the non-empty inner text of inline `<script>` tags
        # whose `type` attribute is missing, empty, or `text/javascript`,
        # and the body of every page passed by `every_javascript_page`.
        #
        # @yield [js]
        #   The given block will be passed every piece of JavaScript source.
        #
        # @yieldparam [String] js
        #   The JavaScript source code.
        #
        def every_javascript
          every_html_page do |page|
            # A `<script>` tag with no `type` attribute (or an empty one) is
            # JavaScript per the HTML specification, so match it as well as
            # the explicit `text/javascript` type.
            page.doc.xpath('//script[not(@type) or @type="" or @type="text/javascript"]').each do |script|
              source = script.inner_text

              # tags that only reference an external `src` have no inner text
              yield source unless source.empty?
            end
          end

          every_javascript_page do |page|
            yield page.body
          end
        end

        alias every_js every_javascript

        #
        # Passes every JavaScript string value to the given block.
        #
        # @yield [string]
        #   The given block will be passed each JavaScript string with the quote
        #   marks removed.
        #
        # @yieldparam [String] string
        #   The parsed contents of a JavaScript string.
        #
        def every_javascript_string
          every_javascript do |js|
            js.scan(Support::Text::Patterns::STRING) do |js_string|
              yield Support::Encoding::JS.unquote(js_string)
            end
          end
        end

        alias every_js_string every_javascript_string

        #
        # Passes every JavaScript comment to the given block.
        #
        # @yield [comment]
        #   The given block will be passed each JavaScript comment.
        #
        # @yieldparam [String] comment
        #   The contents of a JavaScript comment.
        #
        def every_javascript_comment(&block)
          every_javascript do |js|
            js.scan(Support::Text::Patterns::JAVASCRIPT_COMMENT,&block)
          end
        end

        alias every_js_comment every_javascript_comment

        #
        # Passes every HTML and JavaScript comment to the given block.
        #
        # @yield [comment]
        #   The given block will be passed each HTML or JavaScript comment.
        #
        # @yieldparam [String] comment
        #   The contents of a HTML or JavaScript comment.
        #
        # @see #every_html_comment
        # @see #every_javascript_comment
        #
        def every_comment(&block)
          every_html_comment(&block)
          every_javascript_comment(&block)
        end

      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
#
|
2
|
+
# ronin-web-spider - A collection of common web spidering routines.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2022 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as published
|
8
|
+
# by the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# ronin-web-spider is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public License
|
17
|
+
# along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
|
18
|
+
#
|
19
|
+
|
20
|
+
require 'fileutils'
|
21
|
+
|
22
|
+
module Ronin
  module Web
    module Spider
      #
      # Represents a web archive directory.
      #
      # ## Example
      #
      # Spider a host and archive every web page:
      #
      #     Ronin::Web::Spider::Archive.open('path/to/root') do |archive|
      #       Ronin::Web::Spider.every_page(host: 'example.com') do |page|
      #         archive.write(page.url,page.body)
      #       end
      #     end
      #
      class Archive

        # The path to the archive root directory.
        #
        # @return [String]
        attr_reader :root

        #
        # Initializes the archive.
        #
        # @param [String] root
        #   The path to the root directory. It is expanded to an absolute
        #   path.
        #
        def initialize(root)
          @root = File.expand_path(root)
        end

        #
        # Creates the archive and the archive's directory, if it does not
        # already exist.
        #
        # @param [String] root
        #   The path to the new archive.
        #
        # @yield [archive]
        #   If a block is given, it will be passed the newly created archive.
        #
        # @yieldparam [Archive] archive
        #   The newly created archive.
        #
        # @return [Archive]
        #   The newly created archive.
        #
        def self.open(root)
          archive = new(root)

          FileUtils.mkdir_p(archive.root)

          yield archive if block_given?
          return archive
        end

        #
        # Archives a webpage.
        #
        # The file path is derived from the URL's request URI (path plus
        # query string) relative to the archive root. A request URI ending
        # in `/` is saved as `index.html` within that directory.
        #
        # @param [URI::HTTP] url
        #   The URL of the response.
        #
        # @param [String] body
        #   The response body to save. It is written byte-for-byte.
        #
        # @return [String]
        #   The full path to the archived page.
        #
        # @raise [ArgumentError]
        #   The URL's path contains `..` segments that resolve to a location
        #   outside of the archive root directory.
        #
        def write(url,body)
          absolute_path = File.join(@root,url.request_uri[1..])
          absolute_path << 'index.html' if absolute_path.end_with?('/')

          # URLs come from remote servers; never let `..` segments escape the
          # archive root. File.join(@root,'') is the root with a trailing
          # separator, so sibling directories sharing a prefix do not match.
          unless File.expand_path(absolute_path).start_with?(File.join(@root,''))
            raise(ArgumentError,"URL path escapes the archive root: #{url}")
          end

          # mkdir_p is a no-op when the directory already exists
          FileUtils.mkdir_p(File.dirname(absolute_path))

          # binary mode so that images and other non-text bodies are stored
          # unmodified on every platform
          File.binwrite(absolute_path,body)
          return absolute_path
        end

        #
        # Converts the archive to a String.
        #
        # @return [String]
        #   The path of the archive directory.
        #
        def to_s
          @root
        end

      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#
|
2
|
+
# ronin-web-spider - A collection of common web spidering routines.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as published
|
8
|
+
# by the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# ronin-web-spider is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public License
|
17
|
+
# along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
|
18
|
+
#
|
19
|
+
|
20
|
+
module Ronin
  module Web
    module Spider
      #
      # Raised when a `git` command fails.
      #
      class GitError < RuntimeError; end

      #
      # Raised when the `git` command is not installed.
      #
      class GitNotInstalled < GitError; end
    end
  end
end
|
@@ -0,0 +1,194 @@
|
|
1
|
+
#
|
2
|
+
# ronin-web-spider - A collection of common web spidering routines.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2022 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as published
|
8
|
+
# by the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# ronin-web-spider is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public License
|
17
|
+
# along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
|
18
|
+
#
|
19
|
+
|
20
|
+
require 'ronin/web/spider/archive'
|
21
|
+
require 'ronin/web/spider/exceptions'
|
22
|
+
|
23
|
+
module Ronin
  module Web
    module Spider
      #
      # A web archive directory whose contents are tracked in a Git
      # repository.
      #
      # ## Example
      #
      # Spider a host and archive every web page to a Git repository:
      #
      #     require 'ronin/web/spider/git_archive'
      #     require 'ronin/web/spider'
      #     require 'date'
      #
      #     Ronin::Web::Spider::GitArchive.open('path/to/root') do |archive|
      #       archive.commit("Updated #{Date.today}") do
      #         Ronin::Web::Spider.every_page(host: 'example.com') do |page|
      #           archive.write(page.url,page.body)
      #         end
      #       end
      #     end
      #
      class GitArchive < Archive

        #
        # Opens the Git archive, creating the directory and initializing the
        # Git repository if they do not already exist.
        #
        # @param [String] root
        #   The path to the new Git archive.
        #
        # @yield [archive]
        #   If a block is given, it will be passed the newly created Git
        #   archive.
        #
        # @yieldparam [GitArchive] archive
        #   The newly created Git archive.
        #
        # @return [GitArchive]
        #   The newly created Git archive.
        #
        def self.open(root)
          super(root) do |git_archive|
            git_archive.git? || git_archive.init

            yield git_archive if block_given?
          end
        end

        #
        # Determines if the git repository has been initialized, by checking
        # for a `.git` directory inside the archive root.
        #
        # @return [Boolean]
        #
        def git?
          dot_git = File.join(@root,'.git')

          File.directory?(dot_git)
        end

        #
        # Initializes the Git repository by running `git init`.
        #
        # @return [true]
        #   Indicates the Git repository was successfully initialized.
        #
        # @raise [GitError]
        #   Indicates that the `git` command exited with an error.
        #
        # @raise [GitNotInstalled]
        #   Indicates that `git` was not installed or could not be found in the
        #   `$PATH` environment variable.
        #
        def init
          git('init')
        end

        #
        # Saves a webpage to the Git archive and stages it with `git add`.
        #
        # @param [URI::HTTP] url
        #   The URL of the response.
        #
        # @param [String] body
        #   The response body to save.
        #
        # @return [String]
        #   The full path to the archived page.
        #
        # @raise [GitError]
        #   Indicates that the `git` command exited with an error.
        #
        # @raise [GitNotInstalled]
        #   Indicates that `git` was not installed or could not be found in the
        #   `$PATH` environment variable.
        #
        def write(url,body)
          archived_path = super(url,body)
          git('add',archived_path)

          archived_path
        end

        #
        # Commits changes to the Git archive.
        #
        # @param [String] message
        #   The commit message.
        #
        # @yield [self]
        #   If a block is given it will be called before committing any changes.
        #
        # @return [true]
        #   Indicates whether the changes were successfully committed.
        #
        # @raise [GitError]
        #   Indicates the `git` command exited with an error.
        #
        # @raise [GitNotInstalled]
        #   Indicates that `git` was not installed or could not be found in the
        #   `$PATH` environment variable.
        #
        # @example
        #   archive.write(url,response.body)
        #   archive.commit "Updated #{Date.today}"
        #
        # @example with a block:
        #   archive.commit("Updated #{Date.today}") do
        #     Ronin::Web::Spider.every_page(host: 'example.com') do |page|
        #       archive.write(page.url,page.body)
        #     end
        #   end
        #
        def commit(message)
          if block_given?
            yield self
          end

          git('commit','-m',message.to_s)
        end

        private

        #
        # Executes a `git` command in the archive root directory.
        #
        # @param [Array<String>] args
        #   Additional arguments for the `git` command.
        #
        # @return [true]
        #   Indicates that the `git` command executed successfully.
        #
        # @raise [GitError]
        #   Indicates that the `git` command exited with an error.
        #
        # @raise [GitNotInstalled]
        #   Indicates that `git` was not installed or could not be found in the
        #   `$PATH` environment variable.
        #
        def git(*args)
          command = ['git', '-C', @root, *args]
          status  = system(*command)

          # Kernel#system returns nil when the command could not be executed
          # and false when it exited with a non-zero status.
          if status.nil?
            raise(GitNotInstalled,"the git command was not found")
          elsif status == false
            raise(GitError,"git command failed: #{command.join(' ')}")
          end

          true
        end

      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#
|
2
|
+
# ronin-web-spider - A collection of common web spidering routines.
|
3
|
+
#
|
4
|
+
# Copyright (c) 2006-2022 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
|
+
#
|
6
|
+
# ronin-web-spider is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Lesser General Public License as published
|
8
|
+
# by the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# ronin-web-spider is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public License
|
17
|
+
# along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>.
|
18
|
+
#
|
19
|
+
|
20
|
+
module Ronin
  module Web
    module Spider
      # The version of the ronin-web-spider library.
      VERSION = '0.1.0.beta1'
    end
  end
end
|