wordlist 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +3 -0
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/.yardopts +1 -0
- data/{History.txt → ChangeLog.md} +5 -1
- data/LICENSE.txt +22 -0
- data/README.md +96 -0
- data/Rakefile +30 -17
- data/bin/wordlist +10 -0
- data/gemspec.yml +22 -0
- data/lib/wordlist/builder.rb +144 -25
- data/lib/wordlist/builders/website.rb +184 -12
- data/lib/wordlist/flat_file.rb +15 -4
- data/lib/wordlist/list.rb +63 -32
- data/lib/wordlist/mutator.rb +38 -9
- data/lib/wordlist/parsers.rb +24 -19
- data/lib/wordlist/runners.rb +2 -0
- data/lib/wordlist/runners/list.rb +116 -0
- data/lib/wordlist/runners/runner.rb +67 -0
- data/lib/wordlist/unique_filter.rb +47 -8
- data/lib/wordlist/version.rb +1 -1
- data/scripts/benchmark +43 -2
- data/spec/builder_examples.rb +46 -0
- data/spec/builder_spec.rb +97 -6
- data/spec/classes/parser_class.rb +2 -0
- data/spec/helpers/text.rb +6 -0
- data/spec/helpers/wordlist.rb +23 -0
- data/spec/spec_helper.rb +2 -4
- data/wordlist.gemspec +60 -0
- metadata +106 -62
- data/Manifest.txt +0 -30
- data/README.txt +0 -103
- data/tasks/spec.rb +0 -9
data/lib/wordlist/parsers.rb
CHANGED
@@ -1,34 +1,39 @@
|
|
1
1
|
module Wordlist
|
2
2
|
module Parsers
|
3
|
-
|
4
|
-
|
5
|
-
# Ignore case of parsed text
|
6
|
-
attr_accessor :ignore_case
|
3
|
+
# Ignore case of parsed text
|
4
|
+
attr_accessor :ignore_case
|
7
5
|
|
8
|
-
|
9
|
-
|
6
|
+
# Ignore the punctuation of parsed text
|
7
|
+
attr_accessor :ignore_punctuation
|
10
8
|
|
11
|
-
|
12
|
-
|
9
|
+
# Ignore URLs
|
10
|
+
attr_accessor :ignore_urls
|
13
11
|
|
14
|
-
|
15
|
-
|
12
|
+
# Ignore Phone numbers
|
13
|
+
attr_accessor :ignore_phone_numbers
|
16
14
|
|
17
|
-
|
18
|
-
|
19
|
-
end
|
20
|
-
end
|
15
|
+
# Ignore References
|
16
|
+
attr_accessor :ignore_references
|
21
17
|
|
18
|
+
#
|
19
|
+
# Initializes the parser's settings.
|
20
|
+
#
|
22
21
|
def initialize
|
23
|
-
@ignore_case
|
24
|
-
@ignore_punctuation
|
25
|
-
@ignore_urls
|
22
|
+
@ignore_case = false
|
23
|
+
@ignore_punctuation = true
|
24
|
+
@ignore_urls = true
|
26
25
|
@ignore_phone_numbers = false
|
27
|
-
@ignore_references
|
26
|
+
@ignore_references = false
|
28
27
|
end
|
29
28
|
|
30
29
|
#
|
31
|
-
# Parses the
|
30
|
+
# Parses the given text.
|
31
|
+
#
|
32
|
+
# @param [String] text
|
33
|
+
# The text to parse.
|
34
|
+
#
|
35
|
+
# @return [Array<String>]
|
36
|
+
# The Array of parsed tokens.
|
32
37
|
#
|
33
38
|
def parse(text)
|
34
39
|
text = text.to_s
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'wordlist/runners/runner'
|
2
|
+
require 'wordlist/flat_file'
|
3
|
+
|
4
|
+
module Wordlist
  module Runners
    #
    # Command-line runner that lists the contents of a wordlist file,
    # optionally applying mutation rules and writing to an output file.
    #
    class List < Runner

      #
      # Creates a new List Runner with every option unset.
      #
      def initialize
        @file = nil
        @min_length = nil
        @max_length = nil
        @mutations = []

        @words = false
        @unique_words = false

        @output = nil
      end

      #
      # Runs the list runner.
      #
      # Parses the arguments, opens the wordlist named by `--file`,
      # registers the requested mutations and prints the words either to
      # the `--output` file or to standard output. Exits with status -1
      # when no `--file` option was given.
      #
      # @param [Array<String>] args
      #   Arguments to run the runner with.
      #
      def run(*args)
        super(*args)

        list = if @file
                 FlatFile.new(
                   @file,
                   :min_length => @min_length,
                   :max_length => @max_length
                 )
               else
                 print_error('the --file option must be specified')
                 exit(-1)
               end

        @mutations.each do |pattern,substitute|
          list.mutate(pattern,substitute)
        end

        # Takes any object responding to #puts (an open File or Kernel)
        # and feeds it every selected word.
        words = lambda { |output|
          puts = output.method(:puts)

          if @unique_words
            list.each_unique(&puts)
          elsif @words
            list.each_word(&puts)
          else
            list.each(&puts)
          end
        }

        if @output
          File.open(@output,'w+',&words)
        else
          words.call(Kernel)
        end
      end

      protected

      #
      # Parses the given arguments.
      #
      # @param [Array<String>] args
      #   Arguments to parse.
      #
      def optparse(*args)
        super(*args) do |opts|
          opts.banner = 'usage: wordlist [options]'

          opts.on('-f','--file FILE','The wordlist file to list') do |file|
            @file = file
          end

          # NOTE(review): the length limits are stored as the Strings given
          # on the command line — confirm FlatFile accepts them unconverted.
          opts.on('--min-length NUM','Minimum length of words in characters') do |min|
            @min_length = min
          end

          opts.on('--max-length NUM','Maximum length of words in characters') do |max|
            @max_length = max
          end

          opts.on('-m','--mutate SUBSTRING::REPLACE','Adds a mutation rule') do |substring_and_replace|
            @mutations << substring_and_replace.split('::',2)
          end

          opts.on('-M','--mutate-pattern PATTERN::REPLACE','Adds a mutation rule') do |pattern_and_replace|
            # Read the block's own parameter; the previous code referenced
            # the parameter of the --mutate handler, which is not in scope
            # here and raised NameError.
            pattern, replace = pattern_and_replace.split('::',2)

            @mutations << [Regexp.new(pattern), replace]
          end

          opts.on('-w','--words','Only print the words in the wordlist') do
            @words = true
          end

          opts.on('-u','--unique','Only print the unique words in the wordlist') do
            @unique_words = true
          end

          opts.on('-o','--output FILE','Optional file to output the wordlist to') do |file|
            @output = File.expand_path(file)
          end
        end
      end

    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
|
3
|
+
module Wordlist
  module Runners
    #
    # Base class for command-line runners: builds an OptionParser, lets
    # subclasses configure it, and parses the arguments.
    #
    class Runner
      #
      # Creates and runs the runner with the given arguments.
      #
      # @param [Array<String>] args
      #   Arguments to parse.
      #
      def self.run(*args)
        new.run(*args)
      end

      #
      # Runs the runner with the given arguments.
      #
      # @param [Array<String>] args
      #   Arguments to run the runner with.
      #
      def run(*args)
        optparse(*args)
      end

      protected

      #
      # Prints the given error message to standard error, prefixed with
      # the program name.
      #
      # @param [String] message
      #   The error message to print.
      #
      def print_error(message)
        $stderr.puts "#{$0}: #{message}"
      end

      #
      # Parses the given arguments.
      #
      # On any option parsing error the error message and the usage text
      # are written to standard error and the process exits with
      # status -1.
      #
      # @param [Array<String>] args
      #   Arguments to parse.
      #
      # @yield [opts]
      #   If a block is given, it will be passed the option parser to be
      #   configured.
      #
      # @yieldparam [OptionParser] opts
      #   The option parser to be configured.
      #
      # @return [Array<String>]
      #   The arguments left over after option parsing.
      #
      def optparse(*args)
        opts = OptionParser.new

        yield opts if block_given?

        begin
          opts.parse!(args)
        rescue OptionParser::ParseError => e
          # ParseError is the common superclass of InvalidOption,
          # MissingArgument, InvalidArgument, etc., so every malformed
          # command line gets the usage message instead of a backtrace.
          $stderr.puts e.message
          $stderr.puts opts
          exit(-1)
        end
      end
    end
  end
end
|
@@ -14,8 +14,13 @@ module Wordlist
|
|
14
14
|
end
|
15
15
|
|
16
16
|
#
|
17
|
-
#
|
18
|
-
#
|
17
|
+
# Determines if the given word has been previously seen.
|
18
|
+
#
|
19
|
+
# @param [String] word
|
20
|
+
# The word to check for.
|
21
|
+
#
|
22
|
+
# @return [Boolean]
|
23
|
+
# Specifies whether the word has been previously seen.
|
19
24
|
#
|
20
25
|
def seen?(word)
|
21
26
|
length = word.length
|
@@ -24,12 +29,18 @@ module Wordlist
|
|
24
29
|
end
|
25
30
|
|
26
31
|
#
|
27
|
-
# Marks the
|
28
|
-
#
|
32
|
+
# Marks the given word as previously seen.
|
33
|
+
#
|
34
|
+
# @param [String] word
|
35
|
+
# The word to mark as previously seen.
|
36
|
+
#
|
37
|
+
# @return [Boolean]
|
38
|
+
# Specifies whether or not the word has not been previously seen
|
39
|
+
# until now.
|
29
40
|
#
|
30
41
|
def saw!(word)
|
31
42
|
length = word.length
|
32
|
-
crc
|
43
|
+
crc = crc32(word)
|
33
44
|
|
34
45
|
if @seen.has_key?(length)
|
35
46
|
return false if @seen[length].include?(crc)
|
@@ -42,8 +53,19 @@ module Wordlist
|
|
42
53
|
end
|
43
54
|
|
44
55
|
#
|
45
|
-
# Passes the
|
46
|
-
#
|
56
|
+
# Passes the given word through the unique filter.
|
57
|
+
#
|
58
|
+
# @param [String] word
|
59
|
+
# The word to pass through the unique filter.
|
60
|
+
#
|
61
|
+
# @yield [word]
|
62
|
+
# The given block will be passed the word, if the word has not been
|
63
|
+
# previously seen by the filter.
|
64
|
+
#
|
65
|
+
# @yieldparam [String] word
|
66
|
+
# A unique word that has not been previously seen by the filter.
|
67
|
+
#
|
68
|
+
# @return [nil]
|
47
69
|
#
|
48
70
|
def pass(word)
|
49
71
|
if saw!(word)
|
@@ -53,10 +75,27 @@ module Wordlist
|
|
53
75
|
return nil
|
54
76
|
end
|
55
77
|
|
78
|
+
#
# Empties the filter's record of previously seen words.
#
# @return [UniqueFilter]
#   The filter itself, after being emptied.
#
def clear
  @seen.clear
  self
end
|
88
|
+
|
56
89
|
protected
|
57
90
|
|
58
91
|
#
|
59
|
-
# Returns the CRC32 checksum of the
|
92
|
+
# Returns the CRC32 checksum of the given word.
|
93
|
+
#
|
94
|
+
# @param [String] word
|
95
|
+
# The word to calculate a CRC32 checksum for.
|
96
|
+
#
|
97
|
+
# @return [Integer]
|
98
|
+
# The CRC32 checksum for the given word.
|
60
99
|
#
|
61
100
|
def crc32(word)
|
62
101
|
r = 0xffffffff
|
data/lib/wordlist/version.rb
CHANGED
data/scripts/benchmark
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__),'..','lib')))
|
3
3
|
|
4
|
-
require 'wordlist
|
4
|
+
require 'wordlist'
|
5
5
|
require 'benchmark'
|
6
6
|
require 'fileutils'
|
7
7
|
|
@@ -10,9 +10,50 @@ path = File.expand_path(File.join(File.dirname(__FILE__),'shakespeare_wordlist.t
|
|
10
10
|
FileUtils.rm_f(path)
|
11
11
|
|
12
12
|
Benchmark.bm do |bm|
|
13
|
-
bm.report('build
|
13
|
+
bm.report('build') do
|
14
14
|
Wordlist::Builder.build(path) do |wordlist|
|
15
15
|
wordlist.parse_file('/home/hal/shaks12.txt')
|
16
16
|
end
|
17
17
|
end
|
18
|
+
|
19
|
+
bm.report('each_unique') do
|
20
|
+
Wordlist::FlatFile.new(path) do |wordlist|
|
21
|
+
wordlist.each_unique { |word| word }
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
bm.report('each_mutation (1)') do
|
26
|
+
Wordlist::FlatFile.new(path) do |wordlist|
|
27
|
+
wordlist.mutate /o/i, '0'
|
28
|
+
|
29
|
+
wordlist.each_mutation { |word| word }
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
bm.report('each_mutation (2)') do
|
34
|
+
Wordlist::FlatFile.new(path) do |wordlist|
|
35
|
+
wordlist.mutate /o/i, '0'
|
36
|
+
wordlist.mutate /a/i, '@'
|
37
|
+
|
38
|
+
wordlist.each_mutation { |word| word }
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
bm.report('each_mutation (3)') do
|
43
|
+
Wordlist::FlatFile.new(path) do |wordlist|
|
44
|
+
wordlist.mutate /o/i, '0'
|
45
|
+
wordlist.mutate /a/i, '@'
|
46
|
+
wordlist.mutate /e/i, '3'
|
47
|
+
|
48
|
+
wordlist.each_mutation { |word| word }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
Benchmark.bm do |bm|
|
54
|
+
mutator = Wordlist::Mutator.new(/o/i, '0')
|
55
|
+
|
56
|
+
bm.report('Mutator#each') do
|
57
|
+
mutator.each('lololololoLOLOLOLOLO') { |word| }
|
58
|
+
end
|
18
59
|
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'helpers/text'
|
3
|
+
require 'helpers/wordlist'
|
4
|
+
|
5
|
+
# Shared examples exercising a wordlist Builder against four kinds of
# input: an Array of words, a sentence, a block of text and a file.
# NOTE(review): @path and @expected are not assigned here — presumably the
# including spec sets them; confirm against the callers.
shared_examples_for "a wordlist Builder" do
  include Helpers

  before(:all) do
    @words    = %w[dog cat catx dat dog cat]
    @sentence = 'dog cat catx, dog dat.'
    @text     = 'dog cat: catx. dog cat dat dog.'
    @file     = Helpers::SAMPLE_TEXT
  end

  it "should build a unique wordlist from words" do
    Builder.build(@path) { |builder| builder += @words }

    should_contain_words(@path,@expected)
  end

  it "should build a unique wordlist from a sentence" do
    Builder.build(@path) { |builder| builder.parse(@sentence) }

    should_contain_words(@path,@expected)
  end

  it "should build a unique wordlist from text" do
    Builder.build(@path) { |builder| builder.parse(@text) }

    should_contain_words(@path,@expected)
  end

  it "should build a unique wordlist from a file" do
    Builder.build(@path) { |builder| builder.parse_file(@file) }

    should_contain_words(@path,@expected)
  end
end
|