recluse 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,117 @@
1
+ require 'thor'
2
+ require 'recluse/profile'
3
+ require 'recluse/cli/whitelist'
4
+ require 'recluse/cli/blacklist'
5
+ require 'recluse/cli/roots'
6
+ require 'user_config'
7
+
module Recluse
  module CLI
    ##
    # Commands to edit/create/delete profiles.
    #
    # Each profile is addressed as <tt>"#{name}.yaml"</tt> inside the
    # configuration store opened with <tt>UserConfig.new '.recluse'</tt>.
    # Failures are reported on standard output and end the process with
    # <tt>exit(-1)</tt>.
    class Profile < Thor #:nodoc: all
      method_option :blacklist, type: :array, desc: 'Glob patterns for URLs to ignore', default: []
      method_option :whitelist, type: :array, desc: 'Glob pattern exceptions to blacklist', default: []
      method_option :internal_only, type: :boolean, desc: 'Only check internal URLs', default: false
      method_option :scheme_squash, type: :boolean, desc: 'HTTP and HTTPS URLs are treated as equals', default: false
      method_option :redirect, type: :boolean, desc: 'Follow redirects and report final status code', default: false
      desc 'create [options] name email root1 [root2] ...', 'create profile'
      ##
      # Create and save a new profile. Refuses to overwrite an existing
      # profile of the same name. A ProfileError raised while building or
      # saving the profile is printed and ends the process.
      def create(name, email, *roots)
        fail_with "Profile #{name} already exists" if user_config.exist?("#{name}.yaml")
        begin
          profile = Recluse::Profile.new(
            name,
            roots,
            email,
            blacklist: options['blacklist'],
            whitelist: options['whitelist'],
            internal_only: options['internal_only'],
            scheme_squash: options['scheme_squash'],
            redirect: options['redirect']
          )
          profile.save
        rescue ProfileError => e
          fail_with e
        end
      end

      desc 'remove name', 'remove profile'
      ##
      # Delete the named profile. Reports the problem (instead of exiting
      # silently) when the profile does not exist.
      def remove(name)
        uconf = user_config
        fail_with "Profile #{name} doesn't exist" unless uconf.exist?("#{name}.yaml")
        uconf.delete "#{name}.yaml"
      end

      method_option :blacklist, type: :array, desc: 'Glob patterns for URLs to ignore'
      method_option :whitelist, type: :array, desc: 'Glob pattern exceptions to blacklist'
      method_option :internal_only, type: :boolean, desc: 'Only check internal URLs'
      method_option :scheme_squash, type: :boolean, desc: 'HTTP and HTTPS URLs are treated as equals'
      method_option :roots, type: :array, desc: 'Roots to start the spidering at'
      method_option :email, type: :string, desc: 'Email to identify spider for system admins'
      method_option :redirect, type: :boolean, desc: 'Follow redirects and report final status code'
      desc 'edit name [options]', 'edit profile'
      ##
      # Load the named profile, overwrite only the attributes whose options
      # were supplied on the command line, and save it again.
      def edit(name)
        begin
          profile = Recluse::Profile.load name
        rescue ProfileError => e
          fail_with e
        end
        # These options have no defaults, so a key is only present in
        # +options+ when the user actually passed it.
        profile.roots = options['roots'] if options.key? 'roots'
        profile.blacklist = options['blacklist'] if options.key? 'blacklist'
        profile.whitelist = options['whitelist'] if options.key? 'whitelist'
        profile.internal_only = options['internal_only'] if options.key? 'internal_only'
        profile.scheme_squash = options['scheme_squash'] if options.key? 'scheme_squash'
        profile.redirect = options['redirect'] if options.key? 'redirect'
        profile.email = options['email'] if options.key? 'email'
        profile.save
      end

      desc 'rename old_name new_name', 'rename profile'
      ##
      # Copy the profile stored under +old_name+ to +new_name+ (updating its
      # 'name' entry) and delete the old file. Fails when the target name is
      # taken or when the source profile does not exist.
      def rename(old_name, new_name)
        uconf = user_config
        fail_with "Profile #{new_name} already exists" if uconf.exist?("#{new_name}.yaml")
        fail_with "Profile #{old_name} doesn't exist" unless uconf.exist?("#{old_name}.yaml")
        old_profile = uconf["#{old_name}.yaml"]
        old_profile['name'] = new_name
        new_profile = uconf["#{new_name}.yaml"]
        old_profile.each do |key, value|
          new_profile[key] = value
        end
        new_profile.save
        uconf.delete "#{old_name}.yaml"
      end

      desc 'list', 'list profiles'
      ##
      # Print every entry listed in the configuration directory, with a
      # trailing ".yaml" removed.
      def list
        user_config.list_in_directory('.').each do |file|
          puts file.sub(/\.yaml\z/, '')
        end
      end

      desc 'info name', 'profile information'
      ##
      # Print the stored contents of the named profile as YAML.
      def info(name)
        uconf = user_config
        fail_with "Profile #{name} doesn't exist" unless uconf.exist?("#{name}.yaml")
        puts uconf["#{name}.yaml"].to_yaml
      end

      desc 'blacklist [subcommand] [options]', 'edit blacklist'
      subcommand 'blacklist', Blacklist
      desc 'roots [subcommand] [options]', 'edit roots'
      subcommand 'roots', Roots
      desc 'whitelist [subcommand] [options]', 'edit whitelist'
      subcommand 'whitelist', Whitelist

      private

      ##
      # Open the configuration store shared by all profile commands.
      def user_config
        UserConfig.new '.recluse'
      end

      ##
      # Print +message+ on standard output and terminate with status -1.
      def fail_with(message)
        puts message
        exit(-1)
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'thor'
2
+ require 'user_config'
3
+
module Recluse
  module CLI
    ##
    # Roots related commands. Every command operates on the 'roots' entry of
    # the profile stored as <tt>"#{name}.yaml"</tt>.
    class Roots < Thor #:nodoc: all
      desc 'add profile pattern1 [pattern2] ...', 'add to roots'
      ##
      # Append the given roots to the profile, creating the list if needed.
      def add(name, *roots)
        profile = fetch_profile(name)
        profile['roots'] = profile.key?('roots') ? profile['roots'] + roots : roots
        profile.save
      end

      desc 'remove profile pattern1 [pattern2] ...', 'remove from roots'
      ##
      # Remove the given roots from the profile. Nothing is saved when the
      # profile has no 'roots' entry.
      def remove(name, *roots)
        profile = fetch_profile(name)
        return unless profile.key?('roots')
        profile['roots'] = profile['roots'] - roots
        profile.save
      end

      desc 'clear profile', 'remove all roots'
      ##
      # Replace the profile's roots with an empty list.
      def clear(name)
        profile = fetch_profile(name)
        profile['roots'] = []
        profile.save
      end

      desc 'list profile', 'list roots'
      ##
      # Print each root of the profile on its own line.
      def list(name)
        profile = fetch_profile(name)
        return unless profile.key?('roots')
        profile['roots'].each { |root| puts root }
      end

      private

      ##
      # Return the stored profile called +name+, or print a message and exit
      # with status -1 when no such profile file exists.
      def fetch_profile(name)
        store = UserConfig.new '.recluse'
        filename = "#{name}.yaml"
        return store[filename] if store.exist?(filename)
        puts "Profile #{name} doesn't exist"
        exit(-1)
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'thor'
2
+ require 'user_config'
3
+
module Recluse
  module CLI
    ##
    # Whitelist related commands. Every command operates on the 'whitelist'
    # entry of the profile stored as <tt>"#{name}.yaml"</tt>.
    class Whitelist < Thor #:nodoc: all
      desc 'add profile pattern1 [pattern2] ...', 'add glob patterns to whitelist'
      ##
      # Append the given patterns to the whitelist, creating it if needed.
      def add(name, *patterns)
        profile = fetch_profile(name)
        profile['whitelist'] = profile.key?('whitelist') ? profile['whitelist'] + patterns : patterns
        profile.save
      end

      desc 'remove profile pattern1 [pattern2] ...', 'remove patterns from whitelist'
      ##
      # Remove the given patterns from the whitelist. Nothing is saved when
      # the profile has no 'whitelist' entry.
      def remove(name, *patterns)
        profile = fetch_profile(name)
        return unless profile.key?('whitelist')
        profile['whitelist'] = profile['whitelist'] - patterns
        profile.save
      end

      desc 'clear profile', 'remove all patterns in the whitelist'
      ##
      # Replace the profile's whitelist with an empty list.
      def clear(name)
        profile = fetch_profile(name)
        profile['whitelist'] = []
        profile.save
      end

      desc 'list profile', 'list patterns in whitelist'
      ##
      # Print each whitelist pattern of the profile on its own line.
      def list(name)
        profile = fetch_profile(name)
        return unless profile.key?('whitelist')
        profile['whitelist'].each { |pattern| puts pattern }
      end

      private

      ##
      # Return the stored profile called +name+, or print a message and exit
      # with status -1 when no such profile file exists.
      def fetch_profile(name)
        store = UserConfig.new '.recluse'
        filename = "#{name}.yaml"
        return store[filename] if store.exist?(filename)
        puts "Profile #{name} doesn't exist"
        exit(-1)
      end
    end
  end
end
@@ -0,0 +1,172 @@
module Recluse
  ##
  # Sorta like a node tree but using two hashes for easy searching for parents and/or children.
  # This way, it should have similar performance whether you're iterating over parents or children.
  # Additionally, not every child will need a parent or they might not need a parent at initialization.
  #
  # Elements are matched with the equivalence block given at construction
  # rather than with plain hash lookup. The first element inserted for an
  # equivalence class becomes its canonical key, and all internal
  # cross-references use canonical keys.
  class HashTree
    ##
    # Create a hash tree.
    #
    # The optional block receives two elements and returns whether they
    # should be treated as the same key. Defaults to <tt>a == b</tt>.
    def initialize(&block)
      @parent_keys = {}
      @child_keys = {}
      @equivalence = block || proc { |a, b| a == b }
    end

    ##
    # Add child associated with parent(s).
    #
    # +parents+ may be a single element or an array. Unknown children and
    # parents are created. The child records the canonical key of each
    # parent (not the object passed in), so that later deletions can find
    # the parent entry even under a custom equivalence block.
    def add(child, parents)
      @child_keys[child] = { value: nil, parents: [] } unless child?(child)
      child_key = get_child_key(child)
      [*parents].each do |parent|
        @parent_keys[parent] = [] unless parent?(parent)
        parent_key = get_parent_key(parent)
        @child_keys[child_key][:parents] += [parent_key]
        @parent_keys[parent_key] << child_key
      end
    end

    ##
    # Add parent with no children. Accepts one element or an array; parents
    # that already exist are left untouched.
    def add_parent(parents)
      [*parents].each do |parent|
        @parent_keys[parent] = [] unless parent?(parent)
      end
    end

    ##
    # Add child with no value and no parents. Accepts one element or an
    # array; children that already exist are left untouched.
    def add_child(children)
      [*children].each do |child|
        next if child?(child)
        @child_keys[child] = { value: nil, parents: [] }
      end
    end

    ##
    # Set value of child. The child must already exist; otherwise the lookup
    # fails with NoMethodError.
    def set_child_value(child, value)
      @child_keys[get_child_key(child)][:value] = value
    end

    ##
    # Get value of child. The child must already exist.
    def get_child_value(child)
      @child_keys[get_child_key(child)][:value]
    end

    ##
    # Get child's parents (canonical parent keys). The child must already exist.
    def get_parents(child)
      @child_keys[get_child_key(child)][:parents]
    end

    ##
    # Get parent's children (canonical child keys).
    def get_children(parent)
      @parent_keys[get_parent_key(parent)]
    end

    ##
    # Collect values of children for parent. Returns a hash of child key to
    # child value. The parent must already exist.
    def get_values(parent)
      get_children(parent).each_with_object({}) do |child, vals|
        vals[child] = @child_keys[child][:value]
      end
    end

    ##
    # Get parents hash (shallow copy: the child arrays are shared).
    def parents
      @parent_keys.dup
    end

    ##
    # Get children hash (shallow copy: the info hashes are shared).
    def children
      @child_keys.dup
    end

    ##
    # Does element exist as a child and/or parent key?
    def has?(element)
      child?(element) || parent?(element)
    end

    ##
    # Is element a child?
    def child?(element)
      @child_keys.each_key.any? { |key| @equivalence.call(key, element) }
    end

    ##
    # Is element a parent?
    def parent?(element)
      @parent_keys.each_key.any? { |key| @equivalence.call(key, element) }
    end

    ##
    # Delete child. Removes references to child in associated parents.
    # Returns +true+ if the child existed, +false+ otherwise.
    def delete_child(element)
      return false unless child?(element)
      c_key = get_child_key(element)
      @child_keys[c_key][:parents].each do |parent|
        @parent_keys[parent] -= [c_key]
      end
      @child_keys.delete c_key
      true
    end

    ##
    # Delete parent. Removes references to parent in associated children.
    # Returns +true+ if the parent existed, +false+ otherwise.
    def delete_parent(element)
      return false unless parent?(element)
      p_key = get_parent_key(element)
      @parent_keys[p_key].each do |child|
        @child_keys[child][:parents] -= [p_key]
      end
      @parent_keys.delete p_key
      true
    end

    ##
    # Delete from parents and children. Essentially removes all known references.
    # Returns +true+ if the element was removed from either side.
    def delete(element)
      # Both deletions must run, so their results are captured before combining.
      removed_child = delete_child(element)
      removed_parent = delete_parent(element)
      removed_child || removed_parent
    end

    ##
    # Finds children without parents. Returned as hash.
    def orphans
      @child_keys.select { |_key, info| info[:parents].empty? }
    end

    ##
    # Finds parents without children. Returned as hash.
    def childless
      @parent_keys.select { |_key, children| children.empty? }
    end

    private

    ##
    # Get the child key (in case of alternative equivalence testing)
    def get_child_key(child)
      @child_keys.each_key.find { |key| @equivalence.call(key, child) }
    end

    ##
    # Get the parent key (in case of alternative equivalence testing)
    def get_parent_key(parent)
      @parent_keys.each_key.find { |key| @equivalence.call(key, parent) }
    end
  end
end
@@ -0,0 +1,9 @@
module Recluse
  ##
  # Released version string of the gem.
  VERSION = '1.0.0'.freeze

  ##
  # Project homepage; also used in the user-agent.
  URL = 'https://github.com/czycha/recluse'.freeze
end
@@ -0,0 +1,89 @@
1
+ require 'addressable/uri'
2
+
module Recluse
  ##
  # Errors related to links.
  class LinkError < RuntimeError
  end

  ##
  # A simple link container for a profile's queue.
  class Link
    ##
    # Schemes a link must use to be runnable.
    RUNNABLE_SCHEMES = %w[http https].freeze

    ##
    # URL of link. Can be relative.
    attr_reader :url

    ##
    # Parent of link (i.e. the referrer). Can be +:root+ if no parent.
    attr_reader :parent

    ##
    # The absolute URL of the link.
    attr_reader :absolute

    ##
    # The +Addressable::URI+ representation of the link.
    attr_reader :address

    ##
    # Create a link.
    #
    # +url+ is parsed on its own when +parent+ is +:root+; otherwise it is
    # joined onto +parent+. The fragment is dropped from the resulting
    # address. Raises LinkError unless +parent+ is +:root+ or a String
    # (String subclasses are accepted).
    def initialize(url, parent)
      raise LinkError, 'Incorrect parent URL. Expects :root or a string.' unless parent == :root || parent.is_a?(String)
      @url = url
      @parent = parent
      @address = @parent == :root ? Addressable::URI.parse(@url) : Addressable::URI.join(@parent, @url)
      @address.fragment = nil
      @absolute = @address.to_s
    end

    ##
    # Output as string.
    def to_s
      @absolute
    end

    ##
    # Is the link internal compared to +Addressable::URI+ roots?
    #
    # Root links are always internal. With +scheme_squash+, the link is also
    # tested with its scheme swapped ('https' becomes 'http', anything else
    # becomes 'https').
    def internal?(addrroots, scheme_squash: false)
      return true if @parent == :root
      candidates = [@address]
      if scheme_squash
        swapped = @address.dup
        swapped.scheme = swapped.scheme == 'https' ? 'http' : 'https'
        candidates << swapped
      end
      addrroots.any? do |root|
        candidates.any? { |candidate| Link.internal_to?(root, candidate) }
      end
    end

    ##
    # Is the link runnable compared to the black- and whitelists, and the link scheme?
    #
    # Runnable means: the scheme is http or https, and the link is either not
    # blacklisted or explicitly whitelisted.
    def run?(blacklist, whitelist)
      RUNNABLE_SCHEMES.include?(@address.scheme) && (!match?(blacklist) || match?(whitelist))
    end

    ##
    # Does the link match any of the globs?
    #
    # +globs+ may be a single pattern or an array; each is tested against the
    # absolute URL with +File.fnmatch+.
    def match?(globs)
      [*globs].any? { |glob| File.fnmatch(glob, @absolute) }
    end

    ##
    # Check if +to+ is internal compared to +root+. Building block of +internal?+. Both +root+ and +to+ must be of type +Addressable::URI+.
    #
    # A link is internal compared to the root if it matches the following conditions:
    #
    # - Same scheme, subdomain, and domain. In other words, a relative URL can be built out of the link.
    # - If +root+ is a directory and doesn't contain a filename (e.g. +http://example.com/test/+):
    #   - Internal if link is below the root's path or is the same (e.g. +http://example.com/test/index.php+).
    # - Otherwise if +root+ contains a filename (e.g. +http://example.com/test/index.php+):
    #   - Internal if link is below parent directory of root (e.g. +http://example.com/test/about.php+).
    def self.internal_to?(root, to)
      route = root.route_to(to)
      return false if route == to # can't be represented as relative url
      route_internal = !route.to_s.start_with?('../')
      return route_internal if root.path.end_with?('/') || !root.extname.empty?
      # Root has neither a trailing slash nor an extension: treat it as a
      # directory by appending the slash before computing the route again.
      slashed_root = root.dup
      slashed_root.path = "#{root.path}/"
      !slashed_root.route_to(to).to_s.start_with?('../')
    end
  end
end