wordlist 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +3 -0
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/.yardopts +1 -0
- data/{History.txt → ChangeLog.md} +5 -1
- data/LICENSE.txt +22 -0
- data/README.md +96 -0
- data/Rakefile +30 -17
- data/bin/wordlist +10 -0
- data/gemspec.yml +22 -0
- data/lib/wordlist/builder.rb +144 -25
- data/lib/wordlist/builders/website.rb +184 -12
- data/lib/wordlist/flat_file.rb +15 -4
- data/lib/wordlist/list.rb +63 -32
- data/lib/wordlist/mutator.rb +38 -9
- data/lib/wordlist/parsers.rb +24 -19
- data/lib/wordlist/runners.rb +2 -0
- data/lib/wordlist/runners/list.rb +116 -0
- data/lib/wordlist/runners/runner.rb +67 -0
- data/lib/wordlist/unique_filter.rb +47 -8
- data/lib/wordlist/version.rb +1 -1
- data/scripts/benchmark +43 -2
- data/spec/builder_examples.rb +46 -0
- data/spec/builder_spec.rb +97 -6
- data/spec/classes/parser_class.rb +2 -0
- data/spec/helpers/text.rb +6 -0
- data/spec/helpers/wordlist.rb +23 -0
- data/spec/spec_helper.rb +2 -4
- data/wordlist.gemspec +60 -0
- metadata +106 -62
- data/Manifest.txt +0 -30
- data/README.txt +0 -103
- data/tasks/spec.rb +0 -9
@@ -6,33 +6,205 @@ module Wordlist
|
|
6
6
|
module Builders
|
7
7
|
class Website < Builder
|
8
8
|
|
9
|
+
# Proxy to use
|
10
|
+
attr_accessor :proxy
|
11
|
+
|
12
|
+
# User-Agent to use
|
13
|
+
attr_accessor :user_agent
|
14
|
+
|
15
|
+
# Referer URL to use
|
16
|
+
attr_accessor :referer
|
17
|
+
|
9
18
|
# Host to spider
|
10
19
|
attr_accessor :host
|
11
20
|
|
21
|
+
# HTTP Host Header to use in all requests.
|
22
|
+
attr_accessor :host_header
|
23
|
+
|
24
|
+
# Additional hosts that can be spidered
|
25
|
+
attr_reader :hosts
|
26
|
+
|
27
|
+
# Links to ignore while spidering
|
28
|
+
attr_reader :ignore_links
|
29
|
+
|
30
|
+
# Specifies whether the `content` attribute of `meta` tags will be
|
31
|
+
# parsed
|
32
|
+
attr_accessor :parse_meta
|
33
|
+
|
34
|
+
# Specifies whether `title` tags will be parsed
|
35
|
+
attr_accessor :parse_title
|
36
|
+
|
37
|
+
# Specifies whether `h1` tags will be parsed
|
38
|
+
attr_accessor :parse_h1
|
39
|
+
|
40
|
+
# Specifies whether `h2` tags will be parsed
|
41
|
+
attr_accessor :parse_h2
|
42
|
+
|
43
|
+
# Specifies whether `h3` tags will be parsed
|
44
|
+
attr_accessor :parse_h3
|
45
|
+
|
46
|
+
# Specifies whether `h4` tags will be parsed
|
47
|
+
attr_accessor :parse_h4
|
48
|
+
|
49
|
+
# Specifies whether `h5` tags will be parsed
|
50
|
+
attr_accessor :parse_h5
|
51
|
+
|
52
|
+
# Specifies whether `p` tags will be parsed
|
53
|
+
attr_accessor :parse_p
|
54
|
+
|
55
|
+
# Specifies whether `span` tags will be parsed
|
56
|
+
attr_accessor :parse_span
|
57
|
+
|
58
|
+
# Specifies whether the `alt` attributes of `img` tags will be parsed
|
59
|
+
attr_accessor :parse_alt
|
60
|
+
|
61
|
+
# Specifies whether HTML comment tags will be parsed
|
62
|
+
attr_accessor :parse_comments
|
63
|
+
|
64
|
+
# Additional XPath expressions to use to parse spidered pages
|
65
|
+
attr_reader :xpaths
|
66
|
+
|
67
|
+
#
|
68
|
+
# Creates a new Website builder object.
|
69
|
+
#
|
70
|
+
# @param [String] path
|
71
|
+
# The path to the word-list to build.
|
72
|
+
#
|
73
|
+
# @param [Hash] options
|
74
|
+
# Additional options.
|
75
|
+
#
|
76
|
+
# @option options [Hash] :proxy
|
77
|
+
# The Hash of proxy information to use.
|
78
|
+
#
|
79
|
+
# @option options [String] :user_agent
|
80
|
+
# The User-Agent string to send with each request.
|
81
|
+
#
|
82
|
+
# @option options [String] :referer
|
83
|
+
# The Referer URL to send with each request.
|
12
84
|
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
# Website builder object.
|
85
|
+
# @option options [String] :host_header
|
86
|
+
# The HTTP Host header to use in all requests.
|
16
87
|
#
|
17
|
-
|
18
|
-
|
88
|
+
# @option options [Array<String, Regexp, Proc>] :ignore_links
|
89
|
+
# Links to ignore while spidering.
|
90
|
+
#
|
91
|
+
# @option options [Boolean] :parse_meta (true)
|
92
|
+
# Specifies whether the `content` attribute of `meta` tags will be
|
93
|
+
# parsed.
|
94
|
+
#
|
95
|
+
# @option options [Boolean] :parse_title (true)
|
96
|
+
# Specifies whether `title` tags will be parsed.
|
97
|
+
#
|
98
|
+
# @option options [Boolean] :parse_h1 (true)
|
99
|
+
# Specifies whether `h1` tags will be parsed.
|
100
|
+
#
|
101
|
+
# @option options [Boolean] :parse_h2 (true)
|
102
|
+
# Specifies whether `h2` tags will be parsed.
|
103
|
+
#
|
104
|
+
# @option options [Boolean] :parse_h3 (true)
|
105
|
+
# Specifies whether `h3` tags will be parsed.
|
106
|
+
#
|
107
|
+
# @option options [Boolean] :parse_h4 (true)
|
108
|
+
# Specifies whether `h4` tags will be parsed.
|
109
|
+
#
|
110
|
+
# @option options [Boolean] :parse_h5 (true)
|
111
|
+
# Specifies whether `h5` tags will be parsed.
|
112
|
+
#
|
113
|
+
# @option options [Boolean] :parse_p (true)
|
114
|
+
# Specifies whether `p` tags will be parsed.
|
115
|
+
#
|
116
|
+
# @option options [Boolean] :parse_span (true)
|
117
|
+
# Specifies whether `span` tags will be parsed.
|
118
|
+
#
|
119
|
+
# @option options [Boolean] :parse_alt (true)
|
120
|
+
# Specifies whether the `alt` attributes of `img` tags will be
|
121
|
+
# parsed.
|
122
|
+
#
|
123
|
+
# @option options [Boolean] :parse_comments (false)
|
124
|
+
# Specifies whether HTML comment tags will be parsed.
|
125
|
+
#
|
126
|
+
# @option options [Array<String>] :xpaths
|
127
|
+
# Additional list of XPath expressions, to use when parsing
|
128
|
+
# spidered pages.
|
129
|
+
#
|
130
|
+
def initialize(path,options={},&block)
|
131
|
+
@proxy = options.fetch(:proxy,Spidr.proxy)
|
132
|
+
@user_agent = options[:user_agent]
|
133
|
+
@referer = options[:referer]
|
134
|
+
|
135
|
+
@host = options[:host]
|
136
|
+
@host_header = options[:host_header]
|
137
|
+
@hosts = Array(options[:hosts])
|
138
|
+
|
139
|
+
@ignore_links = Array(options[:ignore_links])
|
19
140
|
|
20
|
-
|
141
|
+
@parse_meta = options.fetch(:parse_meta,true)
|
142
|
+
@parse_title = options.fetch(:parse_title,true)
|
143
|
+
@parse_h1 = options.fetch(:parse_h1,true)
|
144
|
+
@parse_h2 = options.fetch(:parse_h2,true)
|
145
|
+
@parse_h3 = options.fetch(:parse_h3,true)
|
146
|
+
@parse_h4 = options.fetch(:parse_h4,true)
|
147
|
+
@parse_h5 = options.fetch(:parse_h5,true)
|
148
|
+
@parse_p = options.fetch(:parse_p,true)
|
149
|
+
@parse_span = options.fetch(:parse_span,true)
|
150
|
+
@parse_alt = options.fetch(:parse_alt,true)
|
151
|
+
@parse_comments = options.fetch(:parse_comments,false)
|
152
|
+
|
153
|
+
@xpaths = Array(options[:xpaths])
|
154
|
+
|
155
|
+
super(path,options,&block)
|
21
156
|
end
|
22
157
|
|
23
158
|
#
|
24
|
-
# Builds the word-list file by spidering the
|
25
|
-
# inner-text from all HTML pages.
|
26
|
-
#
|
159
|
+
# Builds the word-list file by spidering the `host` and parsing the
|
160
|
+
# inner-text from all HTML pages.
|
161
|
+
#
|
162
|
+
# @yield [builder]
|
163
|
+
# If a block is given, it will be called before all HTML pages on
|
164
|
+
# the `host` have been parsed.
|
165
|
+
#
|
166
|
+
# @yieldparam [Website] builder
|
167
|
+
# The website word-list builder.
|
27
168
|
#
|
28
169
|
def build!(&block)
|
29
170
|
super(&block)
|
30
171
|
|
31
|
-
|
172
|
+
options = {
|
173
|
+
:proxy => @proxy,
|
174
|
+
:user_agent => @user_agent,
|
175
|
+
:referer => @referer,
|
176
|
+
:hosts => @hosts,
|
177
|
+
:ignore_links => @ignore_links
|
178
|
+
}
|
179
|
+
|
180
|
+
xpaths = []
|
181
|
+
xpaths << '//meta/@content' if @parse_meta
|
182
|
+
xpaths << '//title' if @parse_title
|
183
|
+
xpaths << '//h1' if @parse_h1
|
184
|
+
xpaths << '//h2' if @parse_h2
|
185
|
+
xpaths << '//h3' if @parse_h3
|
186
|
+
xpaths << '//h4' if @parse_h4
|
187
|
+
xpaths << '//h5' if @parse_h5
|
188
|
+
xpaths << '//p' if @parse_p
|
189
|
+
xpaths << '//span' if @parse_span
|
190
|
+
xpaths << '//img/@alt' if @parse_alt
|
191
|
+
xpaths += @xpaths
|
192
|
+
|
193
|
+
Spidr.host(@host,options) do |spidr|
|
32
194
|
spidr.every_page do |page|
|
33
195
|
if page.html?
|
34
|
-
page.doc
|
35
|
-
|
196
|
+
if page.doc
|
197
|
+
xpaths.each do |xpath|
|
198
|
+
page.doc.search(xpath).each do |element|
|
199
|
+
parse(element.inner_text)
|
200
|
+
end
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
if (@parse_comments && page.doc)
|
205
|
+
page.doc.traverse do |element|
|
206
|
+
parse(element.inner_text) if element.comment?
|
207
|
+
end
|
36
208
|
end
|
37
209
|
end
|
38
210
|
end
|
data/lib/wordlist/flat_file.rb
CHANGED
@@ -7,8 +7,13 @@ module Wordlist
|
|
7
7
|
attr_accessor :path
|
8
8
|
|
9
9
|
#
|
10
|
-
#
|
11
|
-
#
|
10
|
+
# Opens a new FlatFile list.
|
11
|
+
#
|
12
|
+
# @param [String] path
|
13
|
+
# The path to the flat-file word-list to read from.
|
14
|
+
#
|
15
|
+
# @param [Hash] options
|
16
|
+
# Additional options.
|
12
17
|
#
|
13
18
|
def initialize(path,options={},&block)
|
14
19
|
@path = path
|
@@ -17,9 +22,15 @@ module Wordlist
|
|
17
22
|
end
|
18
23
|
|
19
24
|
#
|
20
|
-
# Enumerates through every word in the flat-file
|
21
|
-
#
|
25
|
+
# Enumerates through every word in the flat-file.
|
26
|
+
#
|
27
|
+
# @yield [word]
|
28
|
+
# The given block will be passed every word from the word-list.
|
29
|
+
#
|
30
|
+
# @yieldparam [String] word
|
31
|
+
# A word from the word-list.
|
22
32
|
#
|
33
|
+
# @example
|
23
34
|
# flat_file.each_word do |word|
|
24
35
|
# puts word
|
25
36
|
# end
|
data/lib/wordlist/list.rb
CHANGED
@@ -13,42 +13,55 @@ module Wordlist
|
|
13
13
|
attr_accessor :min_length
|
14
14
|
|
15
15
|
#
|
16
|
-
# Creates a new List object
|
17
|
-
# is given, it will be passed the newly created List object.
|
16
|
+
# Creates a new List object.
|
18
17
|
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
# list.
|
22
|
-
# <tt>:min_length</tt>:: The minimum length of words produced by the
|
23
|
-
# list.
|
18
|
+
# @param [Hash] options
|
19
|
+
# Additional options.
|
24
20
|
#
|
25
|
-
|
21
|
+
# @option options [Integer] :max_length
|
22
|
+
# The maximum length of words produced by the list.
|
23
|
+
#
|
24
|
+
# @option options [Integer] :min_length
|
25
|
+
# The minimum length of words produced by the list.
|
26
|
+
#
|
27
|
+
# @yield [list]
|
28
|
+
# If a block is given, it will be passed the new list object.
|
29
|
+
#
|
30
|
+
# @yieldparam [List] list
|
31
|
+
# The new list object.
|
32
|
+
#
|
33
|
+
def initialize(options={})
|
26
34
|
@mutators = []
|
27
35
|
|
28
|
-
@max_length =
|
29
|
-
@min_length = 0
|
30
|
-
|
31
|
-
if options[:max_length]
|
32
|
-
@max_length = options[:max_length]
|
33
|
-
end
|
34
|
-
|
35
|
-
if options[:min_length]
|
36
|
-
@min_length = options[:min_length]
|
37
|
-
end
|
36
|
+
@max_length = options[:max_length]
|
37
|
+
@min_length = options.fetch(:min_length,0)
|
38
38
|
|
39
|
-
|
39
|
+
yield self if block_given?
|
40
40
|
end
|
41
41
|
|
42
42
|
#
|
43
|
-
# Adds a mutation rule for the specified
|
44
|
-
# using the specified
|
45
|
-
# _substitute_ data omitted, then the _block_ will be used to
|
46
|
-
# replace data matched by the _pattern_.
|
43
|
+
# Adds a mutation rule for the specified pattern, to be replaced
|
44
|
+
# using the specified substitute.
|
47
45
|
#
|
48
|
-
#
|
46
|
+
# @param [String, Regexp] pattern
|
47
|
+
# The pattern to recognize text to mutate.
|
49
48
|
#
|
50
|
-
#
|
49
|
+
# @param [String, Integer, nil] substitute
|
50
|
+
# The optional text to replace recognized text.
|
51
51
|
#
|
52
|
+
# @yield [match]
|
53
|
+
# If a block is given, it will be passed the recognized text to be
|
54
|
+
# mutated. The return value of the block will be used to replace
|
55
|
+
# the recognized text.
|
56
|
+
#
|
57
|
+
# @yieldparam [String] match
|
58
|
+
# The recognized text to be mutated.
|
59
|
+
#
|
60
|
+
# @example
|
61
|
+
# list.mutate 'o', '0'
|
62
|
+
#
|
63
|
+
# list.mutate '0', 0x41
|
64
|
+
#
|
52
65
|
# list.mutate(/[oO]/) do |match|
|
53
66
|
# match.swapcase
|
54
67
|
# end
|
@@ -58,10 +71,15 @@ module Wordlist
|
|
58
71
|
end
|
59
72
|
|
60
73
|
#
|
61
|
-
# Enumerate through every word in the list
|
62
|
-
#
|
63
|
-
#
|
74
|
+
# Enumerates through every word in the list.
|
75
|
+
#
|
76
|
+
# @yield [word]
|
77
|
+
# The given block will be passed each word in the list.
|
64
78
|
#
|
79
|
+
# @yieldparam [String] word
|
80
|
+
# A word from the list.
|
81
|
+
#
|
82
|
+
# @example
|
65
83
|
# list.each_word do |word|
|
66
84
|
# puts word
|
67
85
|
# end
|
@@ -70,9 +88,15 @@ module Wordlist
|
|
70
88
|
end
|
71
89
|
|
72
90
|
#
|
73
|
-
# Enumerates through every unique word in the list
|
74
|
-
#
|
91
|
+
# Enumerates through every unique word in the list.
|
92
|
+
#
|
93
|
+
# @yield [word]
|
94
|
+
# The given block will be passed each unique word in the list.
|
95
|
+
#
|
96
|
+
# @yieldparam [String] word
|
97
|
+
# A unique word from the list.
|
75
98
|
#
|
99
|
+
# @example
|
76
100
|
# list.each_unique do |word|
|
77
101
|
# puts word
|
78
102
|
# end
|
@@ -91,9 +115,16 @@ module Wordlist
|
|
91
115
|
|
92
116
|
#
|
93
117
|
# Enumerates through every unique mutation, of every unique word, using
|
94
|
-
# the mutator rules define for the list.
|
95
|
-
#
|
118
|
+
# the mutator rules defined for the list.
|
119
|
+
#
|
120
|
+
# @yield [word]
|
121
|
+
# The given block will be passed every mutation of every unique
|
122
|
+
# word in the list.
|
123
|
+
#
|
124
|
+
# @yieldparam [String] word
|
125
|
+
# A mutation of a unique word from the list.
|
96
126
|
#
|
127
|
+
# @example
|
97
128
|
# list.each_mutation do |word|
|
98
129
|
# puts word
|
99
130
|
# end
|
data/lib/wordlist/mutator.rb
CHANGED
@@ -10,18 +10,33 @@ module Wordlist
|
|
10
10
|
attr_accessor :substitute
|
11
11
|
|
12
12
|
#
|
13
|
-
# Creates a new Mutator
|
14
|
-
#
|
15
|
-
#
|
13
|
+
# Creates a new Mutator object.
|
14
|
+
#
|
15
|
+
# @param [String, Regexp] pattern
|
16
|
+
# The pattern which recognizes text to mutate.
|
17
|
+
#
|
18
|
+
# @param [String, Integer, nil] substitute
|
19
|
+
# The optional text to replace recognized text.
|
20
|
+
#
|
21
|
+
# @yield [match]
|
22
|
+
# If a block is given, it will be used to mutate recognized text.
|
23
|
+
#
|
24
|
+
# @yieldparam [String] match
|
25
|
+
# The match text to mutate.
|
16
26
|
#
|
17
27
|
def initialize(pattern,substitute=nil,&block)
|
18
|
-
@pattern
|
28
|
+
@pattern = pattern
|
19
29
|
@substitute = (substitute || block)
|
20
30
|
end
|
21
31
|
|
22
32
|
#
|
23
|
-
#
|
24
|
-
#
|
33
|
+
# Mutates the given text.
|
34
|
+
#
|
35
|
+
# @param [String] matched
|
36
|
+
# The recognized text to be mutated.
|
37
|
+
#
|
38
|
+
# @return [String]
|
39
|
+
# The mutated text.
|
25
40
|
#
|
26
41
|
def replace(matched)
|
27
42
|
result = if @substitute.kind_of?(Proc)
|
@@ -40,9 +55,20 @@ module Wordlist
|
|
40
55
|
end
|
41
56
|
|
42
57
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
58
|
+
# Enumerates over every possible mutation of the given word.
|
59
|
+
#
|
60
|
+
# @param [String] word
|
61
|
+
# The word to mutate.
|
62
|
+
#
|
63
|
+
# @yield [mutation]
|
64
|
+
# The given block will be passed every possible mutation of the
|
65
|
+
# given word.
|
66
|
+
#
|
67
|
+
# @yieldparam [String] mutation
|
68
|
+
# One possible mutation of the given word.
|
69
|
+
#
|
70
|
+
# @return [String]
|
71
|
+
# The original word.
|
46
72
|
#
|
47
73
|
def each(word)
|
48
74
|
choices = 0
|
@@ -76,6 +102,9 @@ module Wordlist
|
|
76
102
|
#
|
77
103
|
# Inspects the mutator.
|
78
104
|
#
|
105
|
+
# @return [String]
|
106
|
+
# The inspected mutator.
|
107
|
+
#
|
79
108
|
def inspect
|
80
109
|
"#{@pattern.inspect} -> #{@substitute.inspect}"
|
81
110
|
end
|