wordlist 0.1.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (148) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ruby.yml +27 -0
  3. data/.gitignore +6 -3
  4. data/ChangeLog.md +45 -1
  5. data/Gemfile +13 -0
  6. data/LICENSE.txt +1 -3
  7. data/README.md +266 -61
  8. data/Rakefile +7 -32
  9. data/benchmarks.rb +115 -0
  10. data/bin/wordlist +4 -7
  11. data/data/stop_words/ar.txt +104 -0
  12. data/data/stop_words/bg.txt +259 -0
  13. data/data/stop_words/bn.txt +363 -0
  14. data/data/stop_words/ca.txt +126 -0
  15. data/data/stop_words/cs.txt +138 -0
  16. data/data/stop_words/da.txt +101 -0
  17. data/data/stop_words/de.txt +129 -0
  18. data/data/stop_words/el.txt +79 -0
  19. data/data/stop_words/en.txt +175 -0
  20. data/data/stop_words/es.txt +178 -0
  21. data/data/stop_words/eu.txt +98 -0
  22. data/data/stop_words/fa.txt +332 -0
  23. data/data/stop_words/fi.txt +747 -0
  24. data/data/stop_words/fr.txt +116 -0
  25. data/data/stop_words/ga.txt +109 -0
  26. data/data/stop_words/gl.txt +160 -0
  27. data/data/stop_words/he.txt +499 -0
  28. data/data/stop_words/hi.txt +97 -0
  29. data/data/stop_words/hr.txt +179 -0
  30. data/data/stop_words/hu.txt +35 -0
  31. data/data/stop_words/hy.txt +45 -0
  32. data/data/stop_words/id.txt +357 -0
  33. data/data/stop_words/it.txt +134 -0
  34. data/data/stop_words/ja.txt +44 -0
  35. data/data/stop_words/ko.txt +677 -0
  36. data/data/stop_words/ku.txt +63 -0
  37. data/data/stop_words/lt.txt +507 -0
  38. data/data/stop_words/lv.txt +163 -0
  39. data/data/stop_words/mr.txt +99 -0
  40. data/data/stop_words/nl.txt +48 -0
  41. data/data/stop_words/no.txt +172 -0
  42. data/data/stop_words/pl.txt +138 -0
  43. data/data/stop_words/pt.txt +147 -0
  44. data/data/stop_words/ro.txt +281 -0
  45. data/data/stop_words/ru.txt +421 -0
  46. data/data/stop_words/sk.txt +173 -0
  47. data/data/stop_words/sv.txt +386 -0
  48. data/data/stop_words/th.txt +115 -0
  49. data/data/stop_words/tr.txt +114 -0
  50. data/data/stop_words/uk.txt +28 -0
  51. data/data/stop_words/ur.txt +513 -0
  52. data/data/stop_words/zh.txt +125 -0
  53. data/gemspec.yml +4 -10
  54. data/lib/wordlist/abstract_wordlist.rb +24 -0
  55. data/lib/wordlist/builder.rb +170 -138
  56. data/lib/wordlist/cli.rb +458 -0
  57. data/lib/wordlist/compression/reader.rb +72 -0
  58. data/lib/wordlist/compression/writer.rb +80 -0
  59. data/lib/wordlist/exceptions.rb +31 -0
  60. data/lib/wordlist/file.rb +176 -0
  61. data/lib/wordlist/format.rb +38 -0
  62. data/lib/wordlist/lexer/lang.rb +32 -0
  63. data/lib/wordlist/lexer/stop_words.rb +68 -0
  64. data/lib/wordlist/lexer.rb +218 -0
  65. data/lib/wordlist/list_methods.rb +462 -0
  66. data/lib/wordlist/modifiers/capitalize.rb +45 -0
  67. data/lib/wordlist/modifiers/downcase.rb +45 -0
  68. data/lib/wordlist/modifiers/gsub.rb +51 -0
  69. data/lib/wordlist/modifiers/modifier.rb +44 -0
  70. data/lib/wordlist/modifiers/mutate.rb +133 -0
  71. data/lib/wordlist/modifiers/mutate_case.rb +25 -0
  72. data/lib/wordlist/modifiers/sub.rb +97 -0
  73. data/lib/wordlist/modifiers/tr.rb +71 -0
  74. data/lib/wordlist/modifiers/upcase.rb +45 -0
  75. data/lib/wordlist/modifiers.rb +8 -0
  76. data/lib/wordlist/operators/binary_operator.rb +38 -0
  77. data/lib/wordlist/operators/concat.rb +47 -0
  78. data/lib/wordlist/operators/intersect.rb +55 -0
  79. data/lib/wordlist/operators/operator.rb +29 -0
  80. data/lib/wordlist/operators/power.rb +72 -0
  81. data/lib/wordlist/operators/product.rb +50 -0
  82. data/lib/wordlist/operators/subtract.rb +54 -0
  83. data/lib/wordlist/operators/unary_operator.rb +29 -0
  84. data/lib/wordlist/operators/union.rb +61 -0
  85. data/lib/wordlist/operators/unique.rb +52 -0
  86. data/lib/wordlist/operators.rb +7 -0
  87. data/lib/wordlist/unique_filter.rb +40 -61
  88. data/lib/wordlist/version.rb +1 -1
  89. data/lib/wordlist/words.rb +71 -0
  90. data/lib/wordlist.rb +103 -2
  91. data/spec/abstract_list_spec.rb +18 -0
  92. data/spec/builder_spec.rb +220 -76
  93. data/spec/cli_spec.rb +801 -0
  94. data/spec/compression/reader_spec.rb +137 -0
  95. data/spec/compression/writer_spec.rb +194 -0
  96. data/spec/file_spec.rb +258 -0
  97. data/spec/fixtures/wordlist.txt +15 -0
  98. data/spec/fixtures/wordlist.txt.bz2 +0 -0
  99. data/spec/fixtures/wordlist.txt.gz +0 -0
  100. data/spec/fixtures/wordlist.txt.xz +0 -0
  101. data/spec/fixtures/wordlist_with_ambiguous_format +3 -0
  102. data/spec/fixtures/wordlist_with_comments.txt +19 -0
  103. data/spec/fixtures/wordlist_with_empty_lines.txt +19 -0
  104. data/spec/format_spec.rb +50 -0
  105. data/spec/helpers/text.rb +3 -3
  106. data/spec/helpers/wordlist.rb +2 -2
  107. data/spec/lexer/lang_spec.rb +70 -0
  108. data/spec/lexer/stop_words_spec.rb +77 -0
  109. data/spec/lexer_spec.rb +652 -0
  110. data/spec/list_methods_spec.rb +181 -0
  111. data/spec/modifiers/capitalize_spec.rb +27 -0
  112. data/spec/modifiers/downcase_spec.rb +27 -0
  113. data/spec/modifiers/gsub_spec.rb +59 -0
  114. data/spec/modifiers/modifier_spec.rb +20 -0
  115. data/spec/modifiers/mutate_case_spec.rb +46 -0
  116. data/spec/modifiers/mutate_spec.rb +39 -0
  117. data/spec/modifiers/sub_spec.rb +98 -0
  118. data/spec/modifiers/tr_spec.rb +46 -0
  119. data/spec/modifiers/upcase_spec.rb +27 -0
  120. data/spec/operators/binary_operator_spec.rb +19 -0
  121. data/spec/operators/concat_spec.rb +26 -0
  122. data/spec/operators/intersect_spec.rb +37 -0
  123. data/spec/operators/operator_spec.rb +16 -0
  124. data/spec/operators/power_spec.rb +57 -0
  125. data/spec/operators/product_spec.rb +39 -0
  126. data/spec/operators/subtract_spec.rb +37 -0
  127. data/spec/operators/union_spec.rb +37 -0
  128. data/spec/operators/unique_spec.rb +25 -0
  129. data/spec/spec_helper.rb +2 -1
  130. data/spec/unique_filter_spec.rb +108 -18
  131. data/spec/wordlist_spec.rb +55 -3
  132. data/spec/words_spec.rb +41 -0
  133. metadata +183 -120
  134. data/lib/wordlist/builders/website.rb +0 -216
  135. data/lib/wordlist/builders.rb +0 -1
  136. data/lib/wordlist/flat_file.rb +0 -47
  137. data/lib/wordlist/list.rb +0 -162
  138. data/lib/wordlist/mutator.rb +0 -113
  139. data/lib/wordlist/parsers.rb +0 -74
  140. data/lib/wordlist/runners/list.rb +0 -116
  141. data/lib/wordlist/runners/runner.rb +0 -67
  142. data/lib/wordlist/runners.rb +0 -2
  143. data/scripts/benchmark +0 -59
  144. data/scripts/text/comedy_of_errors.txt +0 -4011
  145. data/spec/flat_file_spec.rb +0 -25
  146. data/spec/list_spec.rb +0 -58
  147. data/spec/mutator_spec.rb +0 -43
  148. data/spec/parsers_spec.rb +0 -118
@@ -0,0 +1,125 @@
1
+
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+ 使
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+ 沿
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
data/gemspec.yml CHANGED
@@ -1,21 +1,15 @@
1
1
  name: wordlist
2
- summary: A Ruby library for generating and working with word-lists.
2
+ summary: Ruby library for reading, manipulating, and creating wordlists.
3
3
  description:
4
- A Ruby library for generating and working with word-lists. Wordlist
5
- allows one to efficiently generate unique word-lists from arbitrary text
6
- or other sources, such as website content. Wordlist can also quickly
7
- enumerate through words within an existing word-list, applying multiple
8
- mutation rules to each word in the list.
4
+ Wordlist is a Ruby library for reading, manipulating, and creating wordlists,
5
+ efficiently.
9
6
 
10
7
  license: MIT
11
8
  authors: Postmodern
12
9
  email: postmodern.mod3@gmail.com
13
- homepage: https://github.com/sophsec/wordlist
10
+ homepage: https://github.com/postmodern/wordlist.rb
14
11
  has_yard: true
15
12
 
16
- dependencies:
17
- spidr: ~> 0.2
18
-
19
13
  development_dependencies:
20
14
  rubygems-tasks: ~> 0.1
21
15
  rspec: ~> 2.4
@@ -0,0 +1,24 @@
1
+ require 'wordlist/list_methods'
2
+
3
+ module Wordlist
4
+ #
5
+ # The base class for all wordlist classes.
6
+ #
7
+ # @since 1.0.0
8
+ #
9
+ class AbstractWordlist
10
+
11
+ include Enumerable
12
+ include ListMethods
13
+
14
+ #
15
+ # Placeholder method.
16
+ #
17
+ # @abstract
18
+ #
19
+ def each(&block)
20
+ raise(NotImplementedError,"#{self.class}#each was not implemented")
21
+ end
22
+
23
+ end
24
+ end
@@ -1,58 +1,101 @@
1
+ require 'wordlist/format'
2
+ require 'wordlist/lexer'
1
3
  require 'wordlist/unique_filter'
2
- require 'wordlist/parsers'
4
+ require 'wordlist/file'
5
+ require 'wordlist/compression/writer'
3
6
 
4
7
  module Wordlist
8
+ #
9
+ # Parses text and builds a wordlist file.
10
+ #
11
+ # @api public
12
+ #
13
+ # @since 1.0.0
14
+ #
5
15
  class Builder
6
16
 
7
- include Parsers
8
-
9
- # Path of the word-list
17
+ # Path of the wordlist
18
+ #
19
+ # @return [String]
10
20
  attr_reader :path
11
21
 
12
- # Minimum number of words
13
- attr_reader :min_words
14
-
15
- # Maximum number of words
16
- attr_reader :max_words
17
-
18
- # File for the word-list
19
- attr_reader :file
22
+ # The format of the wordlist file.
23
+ #
24
+ # @return [:txt, :gzip, :bzip2, :xz]
25
+ attr_reader :format
20
26
 
21
- # The unique word filter
22
- attr_reader :filter
27
+ # The word lexer.
28
+ #
29
+ # @return [Lexer]
30
+ attr_reader :lexer
23
31
 
24
- # The queue of words awaiting processing
25
- attr_reader :word_queue
32
+ # The unique filter.
33
+ #
34
+ # @return [UniqueFilter]
35
+ attr_reader :unique_filter
26
36
 
27
37
  #
28
38
  # Creates a new word-list Builder object.
29
39
  #
30
40
  # @param [String] path
31
- # The path of the word-list file.
41
+ # The path of the wordlist file.
32
42
  #
33
- # @param [Hash] options
34
- # Additional options.
43
+ # @param [:txt, :gzip, :bzip2, :xz, nil] format
44
+ # The format of the wordlist. If not given the format will be inferred
45
+ # from the file extension.
35
46
  #
36
- # @option options [Integer] :min_words (1)
37
- # The minimum number of words each line of the word-list must contain.
47
+ # @param [Boolean] append
48
+ # Indicates whether new words will be appended to the wordlist or
49
+ # overwrite the wordlist.
38
50
  #
39
- # @option options [Integer] :max_words
40
- # The maximum number of words each line of the word-list may contain.
41
- # Defaults to the value of `:min_words`, if not given.
51
+ # @param [Hash{Symbol => Object}] kwargs
52
+ # Additional keyword arguments for {Lexer#initialize}.
42
53
  #
43
- def initialize(path,options={})
44
- super()
45
-
46
- @path = File.expand_path(path)
47
-
48
- @min_words = options.fetch(:min_words,1)
49
- @max_words = options.fetch(:max_words,@min_words)
54
+ # @option kwargs [Symbol] :lang
55
+ # The language to use. Defaults to {Lexer::Lang.default}.
56
+ #
57
+ # @option kwargs [Array<String>] :stop_words
58
+ # The explicit stop-words to ignore. If not given, default stop words
59
+ # will be loaded based on `lang` or {Lexer::Lang.default}.
60
+ #
61
+ # @option kwargs [Array<String, Regexp>] :ignore_words
62
+ # Optional list of words to ignore. Can contain Strings or Regexps.
63
+ #
64
+ # @option kwargs [Boolean] :digits
65
+ # Controls whether parsed words may contain digits or not.
66
+ #
67
+ # @option kwargs [Array<String>] :special_chars
68
+ # The additional special characters allowed within words.
69
+ #
70
+ # @option kwargs [Boolean] :numbers
71
+ # Controls whether whole numbers will be parsed as words.
72
+ #
73
+ # @option kwargs [Boolean] :acronyms
74
+ # Controls whether acronyms will be parsed as words.
75
+ #
76
+ # @option kwargs [Boolean] :normalize_case
77
+ # Controls whether to convert all words to lowercase.
78
+ #
79
+ # @option kwargs [Boolean] :normalize_apostrophes
80
+ # Controls whether apostrophes will be removed from the end of words.
81
+ #
82
+ # @option kwargs [Boolean] :normalize_acronyms
83
+ # Controls whether acronyms will have `.` characters removed.
84
+ #
85
+ # @raise [ArgumentError]
86
+ # The format could not be inferred from the file extension, or the
87
+ # `ignore_words` keyword contained a value other than a String or Regexp.
88
+ #
89
+ def initialize(path, format: Format.infer(path), append: false, **kwargs)
90
+ @path = ::File.expand_path(path)
91
+ @format = format
92
+ @append = append
50
93
 
51
- @file = nil
52
- @filter = UniqueFilter.new
53
- @word_queue = []
94
+ @lexer = Lexer.new(**kwargs)
95
+ @unique_filter = UniqueFilter.new
54
96
 
55
- yield self if block_given?
97
+ load! if append? && ::File.file?(@path)
98
+ open!
56
99
  end
57
100
 
58
101
  #
@@ -60,151 +103,94 @@ module Wordlist
60
103
  # word-list file, passes the builder object to the given block
61
104
  # then finally closes the word-list file.
62
105
  #
63
- # @param [Array] arguments
64
- # Additional arguments to pass to {#initialize}.
106
+ # @param [String] path
107
+ # The path of the wordlist file.
65
108
  #
66
109
  # @yield [builder]
67
110
  # If a block is given, it will be passed the new builder.
68
111
  #
69
- # @yieldparam [Builder] builder
70
- # The newly created builer object.
112
+ # @yieldparam [self] builder
113
+ # The newly created builder object.
71
114
  #
72
115
  # @return [Builder]
73
116
  # The newly created builder object.
74
117
  #
75
118
  # @example
76
- # Builder.build('some/path') do |builder|
77
- # builder.parse(readline)
119
+ # Builder.open('path/to/file.txt') do |builder|
120
+ # builder.parse(text)
78
121
  # end
79
122
  #
80
- def self.build(*arguments,&block)
81
- self.new(*arguments) do |builder|
82
- builder.open!
83
- builder.build!(&block)
84
- builder.close!
85
- end
86
- end
123
+ def self.open(path,**kwargs)
124
+ builder = new(path,**kwargs)
87
125
 
88
- #
89
- # Opens the word-list file for writing. If the file already exists, the
90
- # previous words will be used to filter future duplicate words.
91
- #
92
- # @return [File]
93
- # The open word-list file.
94
- #
95
- def open!
96
- if File.file?(@path)
97
- File.open(@path) do |file|
98
- file.each_line do |line|
99
- @filter.saw!(line.chomp)
100
- end
126
+ if block_given?
127
+ begin
128
+ yield builder
129
+ ensure
130
+ builder.close
101
131
  end
102
132
  end
103
133
 
104
- @file = File.new(@path,File::RDWR | File::CREAT | File::APPEND)
134
+ return builder
105
135
  end
106
136
 
107
137
  #
108
- # Default to be called when the word-list is to be built.
138
+ # Determines if the builder will append new words to the existing wordlist
139
+ # or overwrite it.
109
140
  #
110
- # @yield [builder]
111
- # If a block is given, it will be passed the new builder object.
141
+ # @return [Boolean]
112
142
  #
113
- def build!
114
- yield self if block_given?
143
+ def append?
144
+ @append
115
145
  end
116
146
 
117
147
  #
118
- # Enqueues a given word for processing.
148
+ # Writes a comment line to the wordlist file.
119
149
  #
120
- # @param [String] word
121
- # The word to enqueue.
122
- #
123
- # @return [String]
124
- # The enqueued word.
150
+ # @param [String] message
151
+ # The comment message to write.
125
152
  #
126
- def enqueue(word)
127
- # enqueue the word
128
- if @max_words == 1
129
- @word_queue[0] = word.to_s
130
- else
131
- @word_queue << word.to_s
132
-
133
- # make sure the queue does not overflow
134
- if @word_queue.length > @max_words
135
- @word_queue.shift
136
- end
137
- end
138
-
139
- return word
153
+ def comment(message)
154
+ write("# #{message}")
140
155
  end
141
156
 
142
157
  #
143
- # Enumerates over the combinations of previously seen words.
144
- #
145
- # @yield [combination]
146
- # The given block will be passed the combinations of previously
147
- # seen words.
148
- #
149
- # @yieldparam [String] combination
150
- # A combination of one or more space-separated words.
151
- #
152
- def word_combinations
153
- if @max_words == 1
154
- yield @word_queue[0]
155
- else
156
- current_words = @word_queue.length
157
-
158
- # we must have atleast the minimum amount of words
159
- if current_words >= @min_words
160
- upper_bound = (current_words - @min_words)
161
-
162
- # combine the words
163
- upper_bound.downto(0) do |i|
164
- yield @word_queue[i..-1].join(' ')
165
- end
166
- end
167
- end
168
- end
169
-
170
- #
171
- # Appends the given word to the word-list file, only if it has not
172
- # been previously seen.
158
+ # Appends the given word to the wordlist file, only if it has not
159
+ # been previously added.
173
160
  #
174
161
  # @param [String] word
175
162
  # The word to append.
176
163
  #
177
- # @return [Builder]
164
+ # @return [self]
178
165
  # The builder object.
179
166
  #
180
- def <<(word)
181
- enqueue(word)
182
-
183
- if @file
184
- word_combinations do |words|
185
- @filter.pass(words) do |unique|
186
- @file.puts unique
187
- end
188
- end
167
+ def add(word)
168
+ if @unique_filter.add?(word)
169
+ write(word)
189
170
  end
190
171
 
191
172
  return self
192
173
  end
193
174
 
175
+ alias << add
176
+ alias push add
177
+
194
178
  #
195
179
  # Adds the given words to the word-list.
196
180
  #
197
181
  # @param [Array<String>] words
198
182
  # The words to add to the list.
199
183
  #
200
- # @return [Builder]
184
+ # @return [self]
201
185
  # The builder object.
202
186
  #
203
- def +(words)
204
- words.each { |word| self << word }
187
+ def append(words)
188
+ words.each { |word| add(word) }
205
189
  return self
206
190
  end
207
191
 
192
+ alias concat append
193
+
208
194
  #
209
195
  # Parses the given text, adding each unique word to the word-list file.
210
196
  #
@@ -212,7 +198,9 @@ module Wordlist
212
198
  # The text to parse.
213
199
  #
214
200
  def parse(text)
215
- super(text).each { |word| self << word }
201
+ @lexer.parse(text) do |word|
202
+ add(word)
203
+ end
216
204
  end
217
205
 
218
206
  #
@@ -223,7 +211,7 @@ module Wordlist
223
211
  # The path of the file to parse.
224
212
  #
225
213
  def parse_file(path)
226
- File.open(path) do |file|
214
+ ::File.open(path) do |file|
227
215
  file.each_line do |line|
228
216
  parse(line)
229
217
  end
@@ -233,13 +221,57 @@ module Wordlist
233
221
  #
234
222
  # Closes the word-list file.
235
223
  #
236
- def close!
237
- if @file
238
- @file.close
239
- @file = nil
224
+ def close
225
+ unless @io.closed?
226
+ @io.close
227
+ @unique_filter.clear
228
+ end
229
+ end
240
230
 
241
- @filter.clear
242
- @word_queue.clear
231
+ #
232
+ # Indicates whether the wordlist builder has been closed.
233
+ #
234
+ # @return [Boolean]
235
+ #
236
+ def closed?
237
+ @io.closed?
238
+ end
239
+
240
+ private
241
+
242
+ #
243
+ # Prepopulates the builder with the existing wordlist's content.
244
+ #
245
+ def load!
246
+ Wordlist::File.read(@path) do |word|
247
+ @unique_filter << word
248
+ end
249
+ end
250
+
251
+ #
252
+ # Writes a line to the wordlist file.
253
+ #
254
+ # @param [String] line
255
+ # The line to write.
256
+ #
257
+ # @abstract
258
+ #
259
+ def write(line)
260
+ @io.puts(line)
261
+ end
262
+
263
+ #
264
+ # Opens the wordlist file.
265
+ #
266
+ def open!
267
+ if @format == :txt
268
+ mode = if append? then 'a'
269
+ else 'w'
270
+ end
271
+
272
+ @io = ::File.open(@path,mode)
273
+ else
274
+ @io = Compression::Writer.open(@path, format: @format, append: append?)
243
275
  end
244
276
  end
245
277