recluse 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rubocop.yml +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +235 -0
- data/Rakefile +48 -0
- data/exe/recluse +5 -0
- data/lib/recluse.rb +7 -0
- data/lib/recluse/cli/blacklist.rb +59 -0
- data/lib/recluse/cli/main.rb +287 -0
- data/lib/recluse/cli/profile.rb +117 -0
- data/lib/recluse/cli/roots.rb +59 -0
- data/lib/recluse/cli/whitelist.rb +59 -0
- data/lib/recluse/hashtree.rb +172 -0
- data/lib/recluse/info.rb +9 -0
- data/lib/recluse/link.rb +89 -0
- data/lib/recluse/profile.rb +292 -0
- data/lib/recluse/result.rb +42 -0
- data/lib/recluse/statuscode.rb +91 -0
- data/recluse.gemspec +34 -0
- metadata +233 -0
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'recluse/profile'
|
3
|
+
require 'recluse/cli/whitelist'
|
4
|
+
require 'recluse/cli/blacklist'
|
5
|
+
require 'recluse/cli/roots'
|
6
|
+
require 'user_config'
|
7
|
+
|
8
|
+
module Recluse
  module CLI
    ##
    # Commands to edit/create/delete profiles.
    #
    # Profiles are stored as "<name>.yaml" entries of the '.recluse' UserConfig store.
    # Every command that cannot proceed prints the reason and exits with status -1.
    class Profile < Thor #:nodoc: all
      # create: build a new profile from the arguments and options, then save it.
      # Refuses to overwrite an existing profile.
      method_option :blacklist, type: :array, desc: 'Glob patterns for URLs to ignore', default: []
      method_option :whitelist, type: :array, desc: 'Glob pattern exceptions to blacklist', default: []
      method_option :internal_only, type: :boolean, desc: 'Only check internal URLs', default: false
      method_option :scheme_squash, type: :boolean, desc: 'HTTP and HTTPS URLs are treated as equals', default: false
      method_option :redirect, type: :boolean, desc: 'Follow redirects and report final status code', default: false
      desc 'create [options] name email root1 [root2] ...', 'create profile'
      def create(name, email, *roots)
        uconf = UserConfig.new '.recluse'
        if uconf.exist?("#{name}.yaml")
          puts "Profile #{name} already exists"
          exit(-1)
        end
        begin
          profile = Recluse::Profile.new(
            name,
            roots,
            email,
            blacklist: options['blacklist'],
            whitelist: options['whitelist'],
            internal_only: options['internal_only'],
            scheme_squash: options['scheme_squash'],
            redirect: options['redirect']
          )
          profile.save
        rescue ProfileError => e
          # Invalid profile data is reported to the user instead of surfacing a backtrace.
          puts e
          exit(-1)
        end
      end

      # remove: delete the stored profile.
      # The usage string starts with the command name, like every other command here,
      # and a missing profile is reported the same way +info+ reports it.
      desc 'remove name', 'remove profile'
      def remove(name)
        uconf = UserConfig.new '.recluse'
        unless uconf.exist?("#{name}.yaml")
          puts "Profile #{name} doesn't exist"
          exit(-1)
        end
        uconf.delete "#{name}.yaml"
      end

      # edit: load the profile, overwrite only the settings passed on the command line, save.
      method_option :blacklist, type: :array, desc: 'Glob patterns for URLs to ignore'
      method_option :whitelist, type: :array, desc: 'Glob pattern exceptions to blacklist'
      method_option :internal_only, type: :boolean, desc: 'Only check internal URLs'
      method_option :scheme_squash, type: :boolean, desc: 'HTTP and HTTPS URLs are treated as equals'
      method_option :roots, type: :array, desc: 'Roots to start the spidering at'
      method_option :email, type: :string, desc: 'Email to identify spider for system admins'
      method_option :redirect, type: :boolean, desc: 'Follow redirects and report final status code'
      desc 'edit name [options]', 'edit profile'
      def edit(name)
        begin
          profile = Recluse::Profile.load name
        rescue ProfileError => e
          puts e
          exit(-1)
        end
        # These options have no defaults, so a key is only present when the user supplied it.
        profile.roots = options['roots'] if options.key? 'roots'
        profile.blacklist = options['blacklist'] if options.key? 'blacklist'
        profile.whitelist = options['whitelist'] if options.key? 'whitelist'
        profile.internal_only = options['internal_only'] if options.key? 'internal_only'
        profile.scheme_squash = options['scheme_squash'] if options.key? 'scheme_squash'
        profile.redirect = options['redirect'] if options.key? 'redirect'
        profile.email = options['email'] if options.key? 'email'
        profile.save
      end

      # rename: copy every key of the old profile into a new file, then delete the old file.
      # Fails when the target name is taken or the source profile is missing.
      desc 'rename old_name new_name', 'rename profile'
      def rename(old_name, new_name)
        uconf = UserConfig.new '.recluse'
        if uconf.exist?("#{new_name}.yaml")
          puts "Profile #{new_name} already exists"
          exit(-1)
        end
        unless uconf.exist?("#{old_name}.yaml")
          puts "Profile #{old_name} doesn't exist"
          exit(-1)
        end
        old_profile = uconf["#{old_name}.yaml"]
        # Update the stored name first so the copy below carries the new name.
        old_profile['name'] = new_name
        new_profile = uconf["#{new_name}.yaml"]
        old_profile.each do |key, value|
          new_profile[key] = value
        end
        new_profile.save
        uconf.delete "#{old_name}.yaml"
      end

      # list: print each entry of the store with its trailing ".yaml" removed.
      desc 'list', 'list profiles'
      def list
        uconf = UserConfig.new '.recluse'
        files = uconf.list_in_directory '.'
        files.each do |file|
          # \z anchors at the very end of the name; only one suffix can ever match.
          puts file.sub(/\.yaml\z/, '')
        end
      end

      # info: dump the stored profile as YAML.
      desc 'info name', 'profile information'
      def info(name)
        uconf = UserConfig.new '.recluse'
        unless uconf.exist?("#{name}.yaml")
          puts "Profile #{name} doesn't exist"
          exit(-1)
        end
        puts uconf["#{name}.yaml"].to_yaml
      end

      desc 'blacklist [subcommand] [options]', 'edit blacklist'
      subcommand 'blacklist', Blacklist
      desc 'roots [subcommand] [options]', 'edit roots'
      subcommand 'roots', Roots
      desc 'whitelist [subcommand] [options]', 'edit whitelist'
      subcommand 'whitelist', Whitelist
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'user_config'
|
3
|
+
|
4
|
+
module Recluse
  module CLI
    ##
    # Roots related commands.
    #
    # Each command operates on the 'roots' list of a stored profile and exits with
    # status -1 (after printing a message) when the profile does not exist.
    class Roots < Thor #:nodoc: all
      desc 'add profile pattern1 [pattern2] ...', 'add to roots'
      # Append the given roots to the profile, creating the list when it is absent.
      def add(name, *roots)
        stored = fetch_profile(name)
        stored['roots'] = stored.key?('roots') ? stored['roots'] + roots : roots
        stored.save
      end

      desc 'remove profile pattern1 [pattern2] ...', 'remove from roots'
      # Remove the given roots; a profile without a roots list is left untouched.
      def remove(name, *roots)
        stored = fetch_profile(name)
        if stored.key?('roots')
          stored['roots'] = stored['roots'] - roots
          stored.save
        end
      end

      desc 'clear profile', 'remove all roots'
      # Replace the roots list with an empty one.
      def clear(name)
        stored = fetch_profile(name)
        stored['roots'] = []
        stored.save
      end

      desc 'list profile', 'list roots'
      # Print one root per line; prints nothing when the profile has no roots list.
      def list(name)
        stored = fetch_profile(name)
        return unless stored.key?('roots')
        stored['roots'].each { |entry| puts entry }
      end

      private

      # Fetch the stored profile called +name+ from the '.recluse' store.
      # Prints an error and exits with status -1 when it does not exist.
      # Private methods are not registered as Thor commands.
      def fetch_profile(name)
        store = UserConfig.new '.recluse'
        return store["#{name}.yaml"] if store.exist?("#{name}.yaml")
        puts "Profile #{name} doesn't exist"
        exit(-1)
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'user_config'
|
3
|
+
|
4
|
+
module Recluse
  module CLI
    ##
    # Whitelist related commands.
    #
    # Each command operates on the 'whitelist' list of a stored profile and exits with
    # status -1 (after printing a message) when the profile does not exist.
    class Whitelist < Thor #:nodoc: all
      desc 'add profile pattern1 [pattern2] ...', 'add glob patterns to whitelist'
      # Append the given patterns to the whitelist, creating the list when it is absent.
      def add(name, *patterns)
        stored = fetch_profile(name)
        stored['whitelist'] = stored.key?('whitelist') ? stored['whitelist'] + patterns : patterns
        stored.save
      end

      desc 'remove profile pattern1 [pattern2] ...', 'remove patterns from whitelist'
      # Remove the given patterns; a profile without a whitelist is left untouched.
      def remove(name, *patterns)
        stored = fetch_profile(name)
        if stored.key?('whitelist')
          stored['whitelist'] = stored['whitelist'] - patterns
          stored.save
        end
      end

      desc 'clear profile', 'remove all patterns in the whitelist'
      # Replace the whitelist with an empty list.
      def clear(name)
        stored = fetch_profile(name)
        stored['whitelist'] = []
        stored.save
      end

      desc 'list profile', 'list patterns in whitelist'
      # Print one pattern per line; prints nothing when the profile has no whitelist.
      def list(name)
        stored = fetch_profile(name)
        return unless stored.key?('whitelist')
        stored['whitelist'].each { |entry| puts entry }
      end

      private

      # Fetch the stored profile called +name+ from the '.recluse' store.
      # Prints an error and exits with status -1 when it does not exist.
      # Private methods are not registered as Thor commands.
      def fetch_profile(name)
        store = UserConfig.new '.recluse'
        return store["#{name}.yaml"] if store.exist?("#{name}.yaml")
        puts "Profile #{name} doesn't exist"
        exit(-1)
      end
    end
  end
end
|
@@ -0,0 +1,172 @@
|
|
1
|
+
module Recluse
  ##
  # Sorta like a node tree but using two hashes for easy searching for parents and/or children.
  # This way, it should have similar performance whether you're iterating over parents or children.
  # Additionally, not every child will need a parent or they might not need a parent at initialization.
  #
  # Elements are matched with the equivalence block given at construction, so two different
  # objects may refer to the same stored key. The first object registered becomes the
  # canonical key, and all internal cross references use canonical keys only.
  class HashTree
    ##
    # Create a hash tree.
    #
    # The optional block receives two elements and returns whether they should be treated
    # as the same key. Without a block, elements are compared with +==+.
    def initialize(&block)
      @parent_keys = {}
      @child_keys = {}
      @equivalence = block || proc { |a, b| a == b }
    end

    ##
    # Add child associated with parent(s).
    #
    # +parents+ may be a single parent or an array of parents. Unknown children and parents
    # are registered on the fly. Returns the canonical keys of the given parents.
    def add(child, parents)
      @child_keys[child] = { value: nil, parents: [] } unless child?(child)
      child_key = get_child_key(child)
      # Resolve every parent to its canonical key so that later deletions can find the
      # matching entry in @parent_keys even when a custom equivalence is in use.
      canonical = [*parents].map do |parent|
        @parent_keys[parent] = [] unless parent?(parent)
        get_parent_key(parent)
      end
      @child_keys[child_key][:parents] += canonical
      canonical.each { |parent_key| @parent_keys[parent_key] << child_key }
    end

    ##
    # Add parent with no children.
    def add_parent(parents)
      [*parents].each do |parent|
        @parent_keys[parent] = [] unless parent?(parent)
      end
    end

    ##
    # Add child with no value and no parents.
    def add_child(children)
      [*children].each do |child|
        next if child?(child)
        @child_keys[child] = {
          value: nil,
          parents: []
        }
      end
    end

    ##
    # Set value of child. Raises NoMethodError if the child is unknown.
    def set_child_value(child, value)
      @child_keys[get_child_key(child)][:value] = value
    end

    ##
    # Get value of child. Raises NoMethodError if the child is unknown.
    def get_child_value(child)
      @child_keys[get_child_key(child)][:value]
    end

    ##
    # Get child's parents. Raises NoMethodError if the child is unknown.
    def get_parents(child)
      @child_keys[get_child_key(child)][:parents]
    end

    ##
    # Get parent's children. Returns +nil+ if the parent is unknown.
    def get_children(parent)
      @parent_keys[get_parent_key(parent)]
    end

    ##
    # Collect values of children for parent. Returned as a hash of child key to value.
    # Raises NoMethodError if the parent is unknown.
    def get_values(parent)
      vals = {}
      @parent_keys[get_parent_key(parent)].each do |child|
        vals[child] = @child_keys[child][:value]
      end
      vals
    end

    ##
    # Get parents hash. The copy is shallow: the child lists are shared with the tree.
    def parents
      @parent_keys.dup
    end

    ##
    # Get children hash. The copy is shallow: the info hashes are shared with the tree.
    def children
      @child_keys.dup
    end

    ##
    # Does element exist as a child and/or parent key?
    def has?(element)
      child?(element) || parent?(element)
    end

    ##
    # Is element a child?
    def child?(element)
      @child_keys.each_key.any? { |key| @equivalence.call(key, element) }
    end

    ##
    # Is element a parent?
    def parent?(element)
      @parent_keys.each_key.any? { |key| @equivalence.call(key, element) }
    end

    ##
    # Delete child. Removes references to child in associated parents.
    # Returns whether a child was deleted.
    def delete_child(element)
      return false unless child?(element)
      c_key = get_child_key(element)
      @child_keys[c_key][:parents].each do |parent|
        p_key = get_parent_key(parent)
        # Skip references whose parent entry is already gone instead of failing on nil.
        @parent_keys[p_key] -= [c_key] if @parent_keys.key?(p_key)
      end
      @child_keys.delete c_key
      true
    end

    ##
    # Delete parent. Removes references to parent in associated children.
    # Returns whether a parent was deleted.
    def delete_parent(element)
      return false unless parent?(element)
      p_key = get_parent_key(element)
      @parent_keys[p_key].each do |child|
        next unless @child_keys.key?(child)
        @child_keys[child][:parents] -= [p_key]
      end
      @parent_keys.delete p_key
      true
    end

    ##
    # Delete from parents and children. Essentially removes all known references.
    # Returns whether anything was deleted.
    def delete(element)
      # Both deletions must run, so do not short-circuit them with ||.
      child_deleted = delete_child(element)
      parent_deleted = delete_parent(element)
      child_deleted || parent_deleted
    end

    ##
    # Finds children without parents. Returned as hash.
    def orphans
      @child_keys.select { |_key, info| info[:parents].empty? }
    end

    ##
    # Finds parents without children. Returned as hash.
    def childless
      @parent_keys.select { |_key, children| children.empty? }
    end

    private

    ##
    # Get the child key (in case of alternative equivalence testing)
    def get_child_key(child)
      @child_keys.each_key.find { |key| @equivalence.call(key, child) }
    end

    ##
    # Get the parent key (in case of alternative equivalence testing)
    def get_parent_key(parent)
      @parent_keys.each_key.find { |key| @equivalence.call(key, parent) }
    end
  end
end
|
data/lib/recluse/info.rb
ADDED
data/lib/recluse/link.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
|
3
|
+
module Recluse
  ##
  # Errors related to links.
  class LinkError < RuntimeError
  end

  ##
  # A simple link container for a profile's queue.
  class Link
    ##
    # URL of link. Can be relative.
    attr_reader :url

    ##
    # Parent of link (i.e. the referrer). Can be +:root+ if no parent.
    attr_reader :parent

    ##
    # The absolute URL of the link.
    attr_reader :absolute

    ##
    # The +Addressable::URI+ representation of the link.
    attr_reader :address

    ##
    # Create a link.
    #
    # +parent+ must be +:root+ or a string (String subclasses included); anything else
    # raises LinkError. For a +:root+ link the URL is parsed as is, otherwise it is joined
    # onto the parent. The fragment is dropped from the resulting address.
    def initialize(url, parent)
      raise LinkError, 'Incorrect parent URL. Expects :root or a string.' unless parent == :root || parent.is_a?(String)
      @url = url
      @parent = parent
      @address = @parent == :root ? Addressable::URI.parse(@url) : Addressable::URI.join(@parent, @url)
      @address.fragment = nil
      @absolute = @address.to_s
    end

    ##
    # Output as string.
    def to_s
      @absolute
    end

    ##
    # Is the link internal compared to +Addressable::URI+ roots?
    #
    # Root links are always internal. With +scheme_squash+ the address is also tested with
    # its scheme swapped (+https+ becomes +http+, anything else becomes +https+).
    def internal?(addrroots, scheme_squash: false)
      return true if @parent == :root
      candidates = [@address]
      if scheme_squash
        swapped = @address.dup
        swapped.scheme = swapped.scheme == 'https' ? 'http' : 'https'
        candidates << swapped
      end
      addrroots.any? do |root|
        candidates.any? { |candidate| Link.internal_to?(root, candidate) }
      end
    end

    ##
    # Is the link runnable compared to the black- and whitelists, and the link scheme?
    #
    # Only +http+ and +https+ links can run; a blacklisted link runs only if it is also whitelisted.
    def run?(blacklist, whitelist)
      %w[http https].include?(@address.scheme) && (!match?(blacklist) || match?(whitelist))
    end

    ##
    # Does the link match any of the globs?
    #
    # +globs+ may be a single pattern or an array; each is matched against the absolute URL.
    def match?(globs)
      [*globs].any? { |glob| File.fnmatch(glob, @absolute) }
    end

    ##
    # Check if +to+ is internal compared to +root+. Building block of +internal?+. Both +root+ and +to+ must be of type +Addressable::URI+.
    #
    # A link is internal compared to the root if it matches the following conditions:
    #
    # - Same scheme, subdomain, and domain. In other words, a relative URL can be built out of the link.
    # - If +root+ is a directory and doesn't contain a filename (e.g. +http://example.com/test/+):
    #   - Internal if link is below the root's path or is the same (e.g. +http://example.com/test/index.php+).
    # - Otherwise if +root+ contains a filename (e.g. +http://example.com/test/index.php+):
    #   - Internal if link is below parent directory of root (e.g. +http://example.com/test/about.php+).
    def self.internal_to?(root, to)
      route = root.route_to(to)
      return false if route == to # can't be represented as relative url
      route_internal = !route.to_s.start_with?('../')
      return route_internal if root.path.end_with?('/') || !root.extname.empty?
      # The root names a directory without a trailing slash (no extension): re-test with
      # the slash added so the route is computed relative to the directory itself.
      slashed_root = root.dup
      slashed_root.path = "#{root.path}/"
      !slashed_root.route_to(to).to_s.start_with?('../')
    end
  end
end
|