recluse 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rubocop.yml +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +235 -0
- data/Rakefile +48 -0
- data/exe/recluse +5 -0
- data/lib/recluse.rb +7 -0
- data/lib/recluse/cli/blacklist.rb +59 -0
- data/lib/recluse/cli/main.rb +287 -0
- data/lib/recluse/cli/profile.rb +117 -0
- data/lib/recluse/cli/roots.rb +59 -0
- data/lib/recluse/cli/whitelist.rb +59 -0
- data/lib/recluse/hashtree.rb +172 -0
- data/lib/recluse/info.rb +9 -0
- data/lib/recluse/link.rb +89 -0
- data/lib/recluse/profile.rb +292 -0
- data/lib/recluse/result.rb +42 -0
- data/lib/recluse/statuscode.rb +91 -0
- data/recluse.gemspec +34 -0
- metadata +233 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9bbde60ecb2e16552e859d1bb4ccf9bf75f7c52f
|
4
|
+
data.tar.gz: 7783561aaa9cdb24f40be6c5a328102b44b76e48
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: eb9b94b6e6b15abf67b23b8ac27ea0206ad6dfb053377f68d441236e2dfc5bcece512e3b3ed21ff51ef3ea637f35f510801279f6714462fa1a5331d3dd5bf930
|
7
|
+
data.tar.gz: b2566d865fa51d7bd365fa92ce25807954fc8cd9330b8b75cbb5c46b242b583d57f6f78714a00687ce43dd8349be3c808439a150a07c8f8b8f0a19db626e3ab0
|
data/.gitignore
ADDED
data/.rubocop.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 James Anthony Bruno
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,235 @@
|
|
1
|
+
# Recluse
|
2
|
+
|
3
|
+
**Recluse** is a web crawler meant to ease quality assurance. Currently, it has three crawling tests:
|
4
|
+
|
5
|
+
- **Status**—checks the [HTTP status codes](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes) of links on the site. Good for detecting broken links.
|
6
|
+
- **Find**—finds pages with links matching the pattern. Good for ensuring that references to a page are removed or renamed.
|
7
|
+
- **Assert**—checks pages for the existence of HTML elements. Good for asserting that things are consistent across pages.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
gem 'recluse'
|
15
|
+
```
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install recluse
|
24
|
+
|
25
|
+
## Profiles
|
26
|
+
|
27
|
+
Recluse depends on creating profiles for your sites. This way, the configuration can be reusable for frequent quality assurance checks. Profiles are saved as YAML files (.yaml) in `~/.recluse/` and have the following format:
|
28
|
+
|
29
|
+
```yaml
|
30
|
+
---
|
31
|
+
name: profile_name
|
32
|
+
roots:
|
33
|
+
- http://example.com/
|
34
|
+
- http://anotherroot.biz/subdir
|
35
|
+
email: email@domain.com
|
36
|
+
blacklist:
|
37
|
+
- http://example.com/dontgohere/*
|
38
|
+
whitelist:
|
39
|
+
- http://example.com/dontgohere/unlessitshere/*
|
40
|
+
internal_only: false
|
41
|
+
scheme_squash: false
|
42
|
+
redirect: false
|
43
|
+
```
|
44
|
+
|
45
|
+
### Profile options
|
46
|
+
|
47
|
+
| Name | Required | Type | Default | Description |
|
48
|
+
|------|----------|------|---------|-------------|
|
49
|
+
| name | Yes | String | | The name of your profile for identification. Should also match the filename (i.e., `site` has filename `site.yaml`). |
|
50
|
+
| roots | Yes | Array of URLs | | The roots to start from for spidering. Will spider all subdirectories and files. |
|
51
|
+
| email | Yes | String | | Your email. This is for identification of who is crawling a web page in case a system administrator has issues with it. |
|
52
|
+
| blacklist | No | Array of globs | Empty array | [Glob patterns](https://en.wikipedia.org/wiki/Glob_(programming)) of sites not to spider. Useful to keep Recluse focused only on the important stuff. |
|
53
|
+
| whitelist | No | Array of globs | Empty array | [Glob patterns](https://en.wikipedia.org/wiki/Glob_(programming)) of sites to spider, even if they are blacklisted. |
|
54
|
+
| internal_only | No | Boolean | `false` | If true, Recluse will not follow external links. If false, external links are followed, but only in **status** mode. |
|
55
|
+
| scheme_squash | No | Boolean | `false` | Treats "http" URLs the same as "https". This way, Recluse will not redundantly spider secure and nonsecure duplicates of the same page. |
|
56
|
+
| redirect | No | Boolean | `false` | Follow the redirect to the resulting page if true. |
|
57
|
+
|
58
|
+
## Use
|
59
|
+
|
60
|
+
After installation, the `recluse` executable should be available for your command line.
|
61
|
+
|
62
|
+
### Tests
|
63
|
+
|
64
|
+
#### Status
|
65
|
+
|
66
|
+
Spiders through the profile and reports the [HTTP status codes](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes) of the links. If the profile is not internal only, external links will also have their statuses checked.
|
67
|
+
|
68
|
+
$ recluse status csv_path profile1 [profile2] ... [options]
|
69
|
+
|
70
|
+
| Argument | Alias | Required | Type | Default | Description |
|
71
|
+
|----------|-------|----------|------|---------|-------------|
|
72
|
+
| csv_path | | Yes | String | | The path of where to save results. Results are saved as CSV (comma-separated values). |
|
73
|
+
| profiles | | Yes | Array of profile names | | List of profiles to check. More than one profile can be checked in one run. |
|
74
|
+
| group_by | `--group-by`<br/>`-g` | No | One of `none` or `url` | `none` | What to group by in the result output. If `none`, there will be a row for each pair of checked URL and the page it was found on. If `url`, there will be one row for each URL, and the page cell will have a list of every page the URL was found on. |
|
75
|
+
| include | `--include`<br/>`-i` | No | Array of status codes | Include all | Include these status codes in the results. Can be a specific number (ex: `200`) or a wildcard (ex: `2xx`). You can also include `idk` for pages that result in errors that prevent status code detection. |
|
76
|
+
| exclude | `--exclude`<br/>`-x` | No | Array of status codes | Exclude none | Exclude these status codes from the results. Same format as including. |
|
77
|
+
|
78
|
+
##### Output format
|
79
|
+
|
80
|
+
```csv
|
81
|
+
Status code,URL,On page,With error
|
82
|
+
```
|
83
|
+
|
84
|
+
#### Find
|
85
|
+
|
86
|
+
Spiders through the profiles and checks if a link matching one of the provided patterns is found. Will only go over internal pages.
|
87
|
+
|
88
|
+
$ recluse find csv_path profile1 [profile2] ... --globs pattern1 [pattern2] ... [options]
|
89
|
+
|
90
|
+
| Argument | Alias | Required | Type | Default | Description |
|
91
|
+
|----------|-------|----------|------|---------|-------------|
|
92
|
+
| csv_path | | Yes | String | | The path of where to save results. Results are saved as CSV (comma-separated values). |
|
93
|
+
| profiles | | Yes | Array of profile names | | List of profiles to check. More than one profile can be checked in one run. |
|
94
|
+
| globs | `--globs`<br/>`-G` | Yes | Array of globs | | [Glob patterns](https://en.wikipedia.org/wiki/Glob_(programming)) to find as URLs of links on the page. |
|
95
|
+
| group_by | `--group-by`<br/>`-g` | No | One of `none`, `url`, or `page` | `none` | What to group by in the result output. If `none`, there will be a row for each pair of checked URL and the page it was found on. If `url`, there will be one row for each URL, and the page cell will have a list of every page the URL was found on. If `page`, there will be one row for each page, and the URL cell will list every matching URL found on the page. |
|
96
|
+
|
97
|
+
##### Output format
|
98
|
+
|
99
|
+
###### Group by `none` or `url`
|
100
|
+
|
101
|
+
```csv
|
102
|
+
Matching URLs,Pages
|
103
|
+
```
|
104
|
+
|
105
|
+
###### Group by `page`
|
106
|
+
|
107
|
+
```csv
|
108
|
+
Page,Matching URLs
|
109
|
+
```
|
110
|
+
|
111
|
+
#### Assert
|
112
|
+
|
113
|
+
Asserts the existence of an HTML element using [CSS-style selectors](https://developer.mozilla.org/en-US/docs/Web/Guide/CSS/Getting_started/Selectors). Will only check internal pages.
|
114
|
+
|
115
|
+
$ recluse assert csv_path profile1 [profile2] ... --exists selector1 [selector2] ...
|
116
|
+
|
117
|
+
| Argument | Alias | Required | Type | Default | Description |
|
118
|
+
|----------|-------|----------|------|---------|-------------|
|
119
|
+
| csv_path | | Yes | String | | The path of where to save results. Results are saved as CSV (comma-separated values). |
|
120
|
+
| profiles | | Yes | Array of profile names | | List of profiles to check. More than one profile can be checked in one run. |
|
121
|
+
| true | `--true`<br/>`--report-true-only` | No | Boolean | `false` | Report only true assertions. Reports both true and false assertions by default. |
|
122
|
+
| false | `--false`<br/>`--report-false-only` | No | Boolean | `false` | Report only false assertions. Reports both true and false assertions by default. |
|
123
|
+
| exists | `--exists`<br/>`-e` | Yes | Array of CSS selectors | | CSS selectors to assert the existence of on each spidered page. |
|
124
|
+
|
125
|
+
##### Output format
|
126
|
+
|
127
|
+
```csv
|
128
|
+
Selector,Exists,On page
|
129
|
+
```
|
130
|
+
|
131
|
+
### Profile management
|
132
|
+
|
133
|
+
#### Where
|
134
|
+
|
135
|
+
Path where the profiles are stored for manual edits.
|
136
|
+
|
137
|
+
$ recluse where
|
138
|
+
|
139
|
+
#### Creation
|
140
|
+
|
141
|
+
Create a profile.
|
142
|
+
|
143
|
+
$ recluse profile create [options] name email root1 [root2] ...
|
144
|
+
|
145
|
+
For further description of the arguments, check the **Profile options** section.
|
146
|
+
|
147
|
+
| Argument | Alias | Required | Type | Default |
|
148
|
+
|----------|-------|----------|------|---------|
|
149
|
+
| name | | Yes | String | |
|
150
|
+
| email | | Yes | String | |
|
151
|
+
| roots | | Yes | Array of strings | |
|
152
|
+
| blacklist | `--blacklist` | No | Array of globs | Empty array |
|
153
|
+
| whitelist | `--whitelist` | No | Array of globs | Empty array |
|
154
|
+
| internal_only | `--internal-only`<br/>`--no-internal-only` | No | Boolean | `false` |
|
155
|
+
| scheme_squash | `--scheme-squash`<br/>`--no-scheme-squash` | No | Boolean | `false` |
|
156
|
+
| redirect | `--redirect`<br/>`--no-redirect` | No | Boolean | `false` |
|
157
|
+
|
158
|
+
#### Edit
|
159
|
+
|
160
|
+
Edit profile options. Any option not provided will stay as it was.
|
161
|
+
|
162
|
+
$ recluse profile edit name [options]
|
163
|
+
|
164
|
+
| Argument | Alias | Required | Type |
|
165
|
+
|----------|-------|----------|------|
|
166
|
+
| name | | Yes | String |
|
167
|
+
| email | `--email` | No | String |
|
168
|
+
| roots | `--roots` | No | Array of strings |
|
169
|
+
| blacklist | `--blacklist` | No | Array of globs |
|
170
|
+
| whitelist | `--whitelist` | No | Array of globs |
|
171
|
+
| internal_only | `--internal-only`<br/>`--no-internal-only` | No | Boolean |
|
172
|
+
| scheme_squash | `--scheme-squash`<br/>`--no-scheme-squash` | No | Boolean |
|
173
|
+
| redirect | `--redirect`<br/>`--no-redirect` | No | Boolean |
|
174
|
+
|
175
|
+
##### Blacklist, whitelist, and roots
|
176
|
+
|
177
|
+
More powerful editing of the blacklist, whitelist, and roots lists. The examples below use `blacklist`, but `whitelist` and `roots` accept the same subcommands. Note, however, that a profile with no roots will not run.
|
178
|
+
|
179
|
+
###### Add
|
180
|
+
|
181
|
+
Add patterns/roots to the profile's list.
|
182
|
+
|
183
|
+
$ recluse profile blacklist add name new_thing1 [new_thing2] ...
|
184
|
+
|
185
|
+
###### Remove
|
186
|
+
|
187
|
+
Remove patterns/roots from the profile's list.
|
188
|
+
|
189
|
+
$ recluse profile blacklist remove name thing1 [thing2] ...
|
190
|
+
|
191
|
+
###### Clear
|
192
|
+
|
193
|
+
Remove all patterns/roots from the profile's list.
|
194
|
+
|
195
|
+
$ recluse profile blacklist clear name
|
196
|
+
|
197
|
+
###### List
|
198
|
+
|
199
|
+
List the patterns/roots in the profile's list.
|
200
|
+
|
201
|
+
$ recluse profile blacklist list name
|
202
|
+
|
203
|
+
#### Remove
|
204
|
+
|
205
|
+
Delete a profile.
|
206
|
+
|
207
|
+
$ recluse profile remove name
|
208
|
+
|
209
|
+
#### Rename
|
210
|
+
|
211
|
+
Rename a profile.
|
212
|
+
|
213
|
+
$ recluse profile rename old_name new_name
|
214
|
+
|
215
|
+
#### List
|
216
|
+
|
217
|
+
List all profiles.
|
218
|
+
|
219
|
+
$ recluse profile list
|
220
|
+
|
221
|
+
#### Info
|
222
|
+
|
223
|
+
List the YAML info of the profile.
|
224
|
+
|
225
|
+
$ recluse profile info name
|
226
|
+
|
227
|
+
## Contributing
|
228
|
+
|
229
|
+
Bug reports and pull requests are welcome on GitHub.
|
230
|
+
|
231
|
+
|
232
|
+
## License
|
233
|
+
|
234
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
235
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rake/testtask'
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
# Running plain `rake` runs the test suite. The previous prerequisite, :spec,
# is not defined anywhere in this Rakefile, so the default task could not run.
task default: :test
|
6
|
+
|
7
|
+
# Runs RuboCop with the -aSE flags. Passing a block to `sh` hands the outcome
# to the block instead of failing the task; the block deliberately ignores it.
task :rubocop do
  sh 'rubocop -aSE' do |_ok, _status|
    # result intentionally ignored
  end
end
|
10
|
+
|
11
|
+
namespace :test do
  # Starts a local HTTP server over ./test/site/ on port 9533 and remembers
  # its PID in @server so that test:after can stop it.
  desc 'Setup test environment'
  task :before do
    $stderr.puts 'Starting server'
    out_log = './test/site/logs/httpd-out.log'
    err_log = './test/site/logs/httpd-err.log'
    # Both logs live in the same directory; create it so touch/spawn cannot
    # fail on a fresh checkout.
    FileUtils.mkdir_p(File.dirname(out_log))
    FileUtils.touch(out_log) unless File.exist?(out_log)
    FileUtils.touch(err_log) unless File.exist?(err_log)

    @server = Process.spawn(
      'ruby -run -e httpd ./test/site/ -p 9533',
      in: :close,
      out: out_log,
      err: err_log
    )
    sleep 1 # give the server a moment to come up before tests hit it
  end

  # Runs every test file directly under test/.
  Rake::TestTask.new(:run) do |t|
    t.pattern = 'test/*.rb'
    t.verbose = false
  end

  # Stops the server started by test:before, if there is one.
  desc 'Teardown test environment'
  task :after do
    $stderr.puts 'Stopping server'
    # test:before may have failed before spawning; then there is nothing to
    # stop, and raising here would mask the original failure.
    next if @server.nil?
    begin
      Process.kill 'TERM', @server
      Process.wait @server # reap the child so it does not linger
    rescue Errno::ESRCH, Errno::ECHILD
      # the server has already exited
    ensure
      @server = nil
    end
  end
end
|
40
|
+
|
41
|
+
# Full test cycle: start the server, run the tests, and always stop the
# server afterwards, even when the test run fails.
task :test do
  setup, suite, teardown = %w(before run after).map { |step| Rake::Task["test:#{step}"] }
  setup.invoke
  begin
    suite.invoke
  ensure
    teardown.invoke
  end
end
|
data/exe/recluse
ADDED
data/lib/recluse.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'user_config'
|
3
|
+
|
4
|
+
module Recluse
  module CLI
    ##
    # Blacklist related commands.
    class Blacklist < Thor #:nodoc: all
      no_commands do
        ##
        # Fetch the stored profile +name+ from the user's +.recluse+ config.
        # Prints a message and exits with status -1 when the profile file
        # does not exist.
        def load_profile(name)
          uconf = UserConfig.new '.recluse'
          unless uconf.exist?("#{name}.yaml")
            puts "Profile #{name} doesn't exist"
            exit(-1)
          end
          uconf["#{name}.yaml"]
        end

        ##
        # Current blacklist of +profile+ as an array. A missing key or a nil
        # value (e.g. a blank +blacklist:+ entry in the YAML file) both
        # yield an empty array.
        def current_blacklist(profile)
          profile.key?('blacklist') ? Array(profile['blacklist']) : []
        end
      end

      desc 'add profile pattern1 [pattern2] ...', 'add glob patterns to blacklist'
      def add(name, *patterns)
        profile = load_profile(name)
        profile['blacklist'] = current_blacklist(profile) + patterns
        profile.save
      end

      desc 'remove profile pattern1 [pattern2] ...', 'remove glob patterns from blacklist'
      def remove(name, *patterns)
        profile = load_profile(name)
        # Nothing to remove (and nothing to save) when there is no blacklist.
        return unless profile.key?('blacklist')
        profile['blacklist'] = current_blacklist(profile) - patterns
        profile.save
      end

      desc 'clear profile', 'remove all patterns in the blacklist'
      def clear(name)
        profile = load_profile(name)
        profile['blacklist'] = []
        profile.save
      end

      desc 'list profile', 'list patterns in blacklist'
      def list(name)
        profile = load_profile(name)
        current_blacklist(profile).each { |pattern| puts pattern }
      end
    end
  end
end
|
@@ -0,0 +1,287 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'recluse/profile'
|
3
|
+
require 'recluse/statuscode'
|
4
|
+
require 'recluse/cli/profile'
|
5
|
+
require 'csv'
|
6
|
+
require 'user_config'
|
7
|
+
require 'set'
|
8
|
+
|
9
|
+
module Recluse
|
10
|
+
##
|
11
|
+
# Command-line interface segments.
|
12
|
+
module CLI
|
13
|
+
##
|
14
|
+
# Main commands.
|
15
|
+
class Main < Thor #:nodoc: all
|
16
|
+
no_commands do
|
17
|
+
##
# Percentage of +num+ out of +den+, rounded to two decimal places.
# Returns 0.0 when +den+ is zero, so summaries of empty reports print
# "0.0%" rather than "NaN%" or "Infinity%".
def perc(num, den)
  return 0.0 if den.zero?
  (num * 100.0 / den).round(2)
end
|
20
|
+
|
21
|
+
##
# Write the results of a find run to +csv_path+ and print a summary.
#
# +group_by+ selects the layout:
# [:page] one row per page that has matches, matching URLs joined by newlines
# [:none] one row per (matching URL, page) pair
# [:url]  one row per matching URL, pages joined by newlines
def find_save(profile, csv_path, group_by: :none)
  puts "\nSaving report..."
  matched = 0
  pages_hit = 0
  case group_by
  when :page
    by_page = profile.results.parents
    CSV.open(csv_path, 'w+') do |csv|
      csv << ['Page', 'Matching URLs']
      by_page.each do |page, links|
        next if links.empty?
        csv << [page, links.join("\n")]
        matched += links.length
        pages_hit += 1
      end
    end
  when :none
    by_page = profile.results.parents
    CSV.open(csv_path, 'w+') do |csv|
      csv << ['Matching URL', 'Page']
      by_page.each do |page, links|
        links.each { |link| csv << [link, page] }
        matched += links.length
        pages_hit += 1 unless links.empty?
      end
    end
  when :url
    by_link = profile.results.children
    CSV.open(csv_path, 'w+') do |csv|
      csv << ['Matching URL', 'Pages']
      seen_pages = Set.new
      by_link.each do |link, info|
        # Every URL counts towards the total, even with no recorded pages.
        matched += 1
        next if info[:parents].empty?
        csv << [link, info[:parents].join("\n")]
        seen_pages.merge(info[:parents])
      end
      pages_hit = seen_pages.length
    end
  end
  page_total = profile.results.parents.keys.length
  puts "Total pages:\t#{page_total}"
  puts "Matched URLs:\t#{matched}"
  puts "Pages with matches:\t#{pages_hit}\t#{perc pages_hit, page_total}%"
end
|
69
|
+
|
70
|
+
##
# Write the results of a status run to +csv_path+ and print per-status totals.
#
# +group_by+ is +:url+ (one row per URL, pages joined by newlines) or +:none+
# (one row per URL/page pair). A row is written only when its status matches
# one of +includes+ and none of +excludes+ (compared with +equal?+). Every
# status is still counted in the printed summary, marked Reported/Unreported.
# URLs without a recorded value are given the status 'idk'.
def status_save(profile, csv_path, group_by: :none, includes: [], excludes: [])
  puts 'Saving report...'
  tally = {}
  case group_by
  when :url
    page_label = 'On pages'
    write_rows = lambda do |csv, status, url, pages, error|
      csv << [status, url, pages.join("\n"), error || '']
    end
  when :none
    page_label = 'On page'
    write_rows = lambda do |csv, status, url, pages, error|
      pages.each { |page| csv << [status, url, page, error || ''] }
    end
  end
  reportable = lambda do |code|
    includes.any? { |wanted| wanted.equal?(code) } && !excludes.any? { |unwanted| unwanted.equal?(code) }
  end
  by_url = profile.results.children
  CSV.open(csv_path, 'w+') do |csv|
    csv << ['Status code', 'URL', page_label, 'With error']
    by_url.each do |url, info|
      outcome = info[:value]
      status, error = outcome.nil? ? %w(idk Incomplete) : [outcome.code, outcome.error]
      write_rows.call(csv, status, url, info[:parents], error) if reportable.call(status)
      tally[status] = tally.fetch(status, 0.0) + 1.0
    end
  end
  puts "Total:\t#{by_url.length}"
  tally.each do |code, count|
    label = reportable.call(code) ? 'Reported' : 'Unreported'
    puts "#{code}:\t#{count.to_i}\t#{perc count, by_url.length}%\t#{label}"
  end
end
|
118
|
+
|
119
|
+
##
# Write the results of an assert run to +csv_path+ and print, per selector,
# how many pages had it, lacked it, or have no recorded result.
#
# +report_vals+ lists the outcomes (+true+ and/or +false+) that get a CSV
# row; the printed counts always cover both outcomes.
def assert_save(profile, csv_path, report_vals)
  puts 'Saving report...'
  by_page = profile.results.children
  tally = {}
  CSV.open(csv_path, 'w+') do |csv|
    csv << ['Selector', 'Exists', 'On page']
    by_page.each do |page, info|
      checks = info[:value]
      next if checks.nil?
      checks.each do |selector, exists|
        outcome = exists.to_s
        tally[selector] ||= { 'true' => 0, 'false' => 0 }
        tally[selector][outcome] += 1
        csv << [selector, outcome, page] if report_vals.include?(exists)
      end
    end
  end
  page_total = by_page.keys.length
  puts "Total pages:\t#{page_total}"
  tally.each do |selector, hits|
    present = hits['true']
    absent = hits['false']
    unknown = page_total - absent - present
    puts "#{selector}:"
    puts "- True: #{present}\t#{perc present, page_total}%"
    puts "- False: #{absent}\t#{perc absent, page_total}%"
    puts "- Unknown: #{unknown}\t#{perc unknown, page_total}%"
  end
end
|
144
|
+
end
|
145
|
+
|
146
|
+
# Runs the status check on each named profile in order, handing the results
# of one profile on to the next, then writes the report to csv_path. An INT
# or TERM signal during the run writes the report for the profile currently
# in progress and exits.
method_option :group_by, type: :string, aliases: '-g', default: 'none', enum: %w(none url), desc: 'Group by key'
method_option :include, type: :array, aliases: '-i', default: ['xxx'], desc: "Include these status code results. Can be numbers or wildcards (4xx). 'idk' is a Recluse status code for when the status cannot be determined for the page."
method_option :exclude, type: :array, aliases: '-x', default: [], desc: "Exclude these status code results. Can be numbers or wildcards (4xx). 'idk' is a Recluse status code for when the status cannot be determined for the page."
desc 'status csv_path profile1 [profile2] ...', 'runs report on link statuses'
def status(csv_path, *profiles)
  # Print a message and leave with a failure status.
  bail = lambda do |message|
    puts message
    exit(-1)
  end

  bail.call('No profile provided') if profiles.empty?
  profile_queue =
    begin
      profiles.map { |profile_name| Recluse::Profile.load profile_name }
    rescue ProfileError => e
      bail.call(e)
    end
  profile = profile_queue.first
  bail.call('Page grouping only available with --find.') if options['group_by'] == 'page'

  begin
    includes = options[:include].map { |code| Recluse::StatusCode.new code }
    bail.call('No status codes') if includes.empty?
    excludes = options[:exclude].map { |code| Recluse::StatusCode.new code }
  rescue StatusCodeError => e
    bail.call(e)
  end

  # Kept as a proc (not a lambda): Signal.trap passes it the signal number.
  ending = proc do
    status_save profile, csv_path, group_by: options['group_by'].to_sym, includes: includes, excludes: excludes
    exit
  end
  %w(INT TERM).each { |sig| Signal.trap sig, &ending }

  last_index = profile_queue.length - 1
  profile_queue.each_index do |i|
    profile.results = profile_queue[i - 1].results unless i.zero?
    profile.status
    profile = profile_queue[i + 1] unless i == last_index
  end

  %w(INT TERM).each { |sig| Signal.trap sig, 'DEFAULT' }
  ending.call
end
|
194
|
+
# Runs the find check with the given globs on each named profile in order,
# handing the results of one profile on to the next, then writes the report
# to csv_path. An INT or TERM signal during the run writes the report for the
# profile currently in progress and exits.
method_option :globs, type: :array, aliases: '-G', required: true, banner: 'GLOB', desc: 'Find links matching any of the globs'
method_option :group_by, type: :string, aliases: '-g', default: 'none', enum: %w(none url page), desc: 'Group by key'
desc 'find csv_path profile1 [profile2] ... --globs glob1 [glob2] ...', 'find matching links'
def find(csv_path, *profiles)
  # Print a message and leave with a failure status.
  bail = lambda do |message|
    puts message
    exit(-1)
  end

  bail.call('No profile provided') if profiles.empty?
  profile_queue =
    begin
      profiles.map { |profile_name| Recluse::Profile.load profile_name }
    rescue ProfileError => e
      bail.call(e)
    end
  profile = profile_queue.first

  # At least one glob must be something other than whitespace.
  all_blank = options['globs'].all? { |glob| glob.strip.empty? }
  bail.call('No glob patterns provided for --globs option') if all_blank

  # Kept as a proc (not a lambda): Signal.trap passes it the signal number.
  ending = proc do
    find_save profile, csv_path, group_by: options['group_by'].to_sym
    exit
  end
  %w(INT TERM).each { |sig| Signal.trap sig, &ending }

  last_index = profile_queue.length - 1
  profile_queue.each_index do |i|
    profile.results = profile_queue[i - 1].results unless i.zero?
    profile.find options['globs']
    profile = profile_queue[i + 1] unless i == last_index
  end

  %w(INT TERM).each { |sig| Signal.trap sig, 'DEFAULT' }
  ending.call
end
|
231
|
+
# Runs the assert check with the given selectors on each named profile in
# order, handing the results of one profile on to the next, then writes the
# report to csv_path. An INT or TERM signal during the run writes the report
# for the profile currently in progress and exits.
method_option :exists, type: :array, aliases: '-e', required: true, banner: 'SELECTOR', desc: 'Assert existence of HTML elements matching CSS selector'
method_option :report_true_only, type: :boolean, aliases: '--true', default: false, desc: 'Report only true assertions. Default is to report both true and false.'
method_option :report_false_only, type: :boolean, aliases: '--false', default: false, desc: 'Report only false assertions. Default is to report both true and false.'
desc 'assert csv_path profile1 [profile2] ... [options] --exists selector1 [selector2] ...', 'assert HTML element existence'
def assert(csv_path, *profiles)
  if profiles.empty?
    puts 'No profile provided'
    exit(-1)
  end
  begin
    profile_queue = profiles.map { |profile_name| Recluse::Profile.load profile_name }
  rescue ProfileError => e
    puts e
    exit(-1)
  end
  profile = profile_queue[0]
  has_selectors = options['exists'].any? { |selector| !selector.strip.empty? }
  unless has_selectors
    puts 'No selector patterns provided for --exists option'
    exit(-1)
  end

  # Which outcomes get a CSV row. When both flags agree (both set or both
  # unset) everything is reported. The last branch previously read
  # `option[...]`, an undefined name, so passing only --false crashed.
  report = []
  if options[:report_false_only] == options[:report_true_only]
    report = [true, false]
  elsif options[:report_true_only]
    report = [true]
  elsif options[:report_false_only]
    report = [false]
  end

  # Must remain a proc (not a lambda): Signal.trap passes it the signal number.
  ending = proc do
    assert_save profile, csv_path, report
    exit
  end

  %w(INT TERM).each do |sig|
    Signal.trap sig, &ending
  end
  (0...profile_queue.length).each do |i|
    profile.results = profile_queue[i - 1].results unless i.zero?
    profile.assert options['exists']
    profile = profile_queue[i + 1] if i + 1 < profile_queue.length
  end
  %w(INT TERM).each do |sig|
    Signal.trap sig, 'DEFAULT'
  end
  ending.call
end
|
278
|
+
# Prints the directory of the '.recluse' user configuration.
desc 'where', 'location of profiles'
def where
  puts UserConfig.new('.recluse').directory
end

# Mounts the Profile command class as the `profile` subcommand.
desc 'profile [subcommand] [options]', 'profile editor'
subcommand 'profile', Profile
|
285
|
+
end
|
286
|
+
end
|
287
|
+
end
|