wordlist 0.1.1 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (152) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ruby.yml +28 -0
  3. data/.gitignore +6 -3
  4. data/ChangeLog.md +55 -1
  5. data/Gemfile +15 -0
  6. data/LICENSE.txt +1 -3
  7. data/README.md +301 -60
  8. data/Rakefile +7 -32
  9. data/benchmarks.rb +115 -0
  10. data/bin/wordlist +4 -7
  11. data/data/stop_words/ar.txt +104 -0
  12. data/data/stop_words/bg.txt +259 -0
  13. data/data/stop_words/bn.txt +363 -0
  14. data/data/stop_words/ca.txt +126 -0
  15. data/data/stop_words/cs.txt +138 -0
  16. data/data/stop_words/da.txt +101 -0
  17. data/data/stop_words/de.txt +129 -0
  18. data/data/stop_words/el.txt +79 -0
  19. data/data/stop_words/en.txt +175 -0
  20. data/data/stop_words/es.txt +178 -0
  21. data/data/stop_words/eu.txt +98 -0
  22. data/data/stop_words/fa.txt +332 -0
  23. data/data/stop_words/fi.txt +747 -0
  24. data/data/stop_words/fr.txt +116 -0
  25. data/data/stop_words/ga.txt +109 -0
  26. data/data/stop_words/gl.txt +160 -0
  27. data/data/stop_words/he.txt +499 -0
  28. data/data/stop_words/hi.txt +97 -0
  29. data/data/stop_words/hr.txt +179 -0
  30. data/data/stop_words/hu.txt +35 -0
  31. data/data/stop_words/hy.txt +45 -0
  32. data/data/stop_words/id.txt +357 -0
  33. data/data/stop_words/it.txt +134 -0
  34. data/data/stop_words/ja.txt +44 -0
  35. data/data/stop_words/ko.txt +677 -0
  36. data/data/stop_words/ku.txt +63 -0
  37. data/data/stop_words/lt.txt +507 -0
  38. data/data/stop_words/lv.txt +163 -0
  39. data/data/stop_words/mr.txt +99 -0
  40. data/data/stop_words/nl.txt +48 -0
  41. data/data/stop_words/no.txt +172 -0
  42. data/data/stop_words/pl.txt +138 -0
  43. data/data/stop_words/pt.txt +147 -0
  44. data/data/stop_words/ro.txt +281 -0
  45. data/data/stop_words/ru.txt +421 -0
  46. data/data/stop_words/sk.txt +173 -0
  47. data/data/stop_words/sv.txt +386 -0
  48. data/data/stop_words/th.txt +115 -0
  49. data/data/stop_words/tr.txt +114 -0
  50. data/data/stop_words/uk.txt +28 -0
  51. data/data/stop_words/ur.txt +513 -0
  52. data/data/stop_words/zh.txt +125 -0
  53. data/gemspec.yml +13 -12
  54. data/lib/wordlist/abstract_wordlist.rb +25 -0
  55. data/lib/wordlist/builder.rb +172 -138
  56. data/lib/wordlist/cli.rb +459 -0
  57. data/lib/wordlist/compression/reader.rb +72 -0
  58. data/lib/wordlist/compression/writer.rb +80 -0
  59. data/lib/wordlist/exceptions.rb +31 -0
  60. data/lib/wordlist/file.rb +177 -0
  61. data/lib/wordlist/format.rb +39 -0
  62. data/lib/wordlist/lexer/lang.rb +34 -0
  63. data/lib/wordlist/lexer/stop_words.rb +69 -0
  64. data/lib/wordlist/lexer.rb +221 -0
  65. data/lib/wordlist/list_methods.rb +462 -0
  66. data/lib/wordlist/modifiers/capitalize.rb +46 -0
  67. data/lib/wordlist/modifiers/downcase.rb +46 -0
  68. data/lib/wordlist/modifiers/gsub.rb +52 -0
  69. data/lib/wordlist/modifiers/modifier.rb +44 -0
  70. data/lib/wordlist/modifiers/mutate.rb +134 -0
  71. data/lib/wordlist/modifiers/mutate_case.rb +26 -0
  72. data/lib/wordlist/modifiers/sub.rb +98 -0
  73. data/lib/wordlist/modifiers/tr.rb +72 -0
  74. data/lib/wordlist/modifiers/upcase.rb +46 -0
  75. data/lib/wordlist/modifiers.rb +9 -0
  76. data/lib/wordlist/operators/binary_operator.rb +39 -0
  77. data/lib/wordlist/operators/concat.rb +48 -0
  78. data/lib/wordlist/operators/intersect.rb +56 -0
  79. data/lib/wordlist/operators/operator.rb +29 -0
  80. data/lib/wordlist/operators/power.rb +73 -0
  81. data/lib/wordlist/operators/product.rb +51 -0
  82. data/lib/wordlist/operators/subtract.rb +55 -0
  83. data/lib/wordlist/operators/unary_operator.rb +30 -0
  84. data/lib/wordlist/operators/union.rb +62 -0
  85. data/lib/wordlist/operators/unique.rb +53 -0
  86. data/lib/wordlist/operators.rb +8 -0
  87. data/lib/wordlist/unique_filter.rb +41 -61
  88. data/lib/wordlist/version.rb +4 -2
  89. data/lib/wordlist/words.rb +72 -0
  90. data/lib/wordlist.rb +104 -2
  91. data/spec/abstract_list_spec.rb +18 -0
  92. data/spec/builder_spec.rb +220 -76
  93. data/spec/cli_spec.rb +802 -0
  94. data/spec/compression/reader_spec.rb +137 -0
  95. data/spec/compression/writer_spec.rb +194 -0
  96. data/spec/file_spec.rb +269 -0
  97. data/spec/fixtures/wordlist.txt +15 -0
  98. data/spec/fixtures/wordlist.txt.bz2 +0 -0
  99. data/spec/fixtures/wordlist.txt.gz +0 -0
  100. data/spec/fixtures/wordlist.txt.xz +0 -0
  101. data/spec/fixtures/wordlist_with_ambiguous_format +3 -0
  102. data/spec/fixtures/wordlist_with_comments.txt +19 -0
  103. data/spec/fixtures/wordlist_with_empty_lines.txt +19 -0
  104. data/spec/format_spec.rb +50 -0
  105. data/spec/helpers/text.rb +3 -3
  106. data/spec/helpers/wordlist.rb +2 -2
  107. data/spec/lexer/lang_spec.rb +70 -0
  108. data/spec/lexer/stop_words_spec.rb +77 -0
  109. data/spec/lexer_spec.rb +718 -0
  110. data/spec/list_methods_spec.rb +181 -0
  111. data/spec/modifiers/capitalize_spec.rb +27 -0
  112. data/spec/modifiers/downcase_spec.rb +27 -0
  113. data/spec/modifiers/gsub_spec.rb +59 -0
  114. data/spec/modifiers/modifier_spec.rb +20 -0
  115. data/spec/modifiers/mutate_case_spec.rb +46 -0
  116. data/spec/modifiers/mutate_spec.rb +39 -0
  117. data/spec/modifiers/sub_spec.rb +98 -0
  118. data/spec/modifiers/tr_spec.rb +46 -0
  119. data/spec/modifiers/upcase_spec.rb +27 -0
  120. data/spec/operators/binary_operator_spec.rb +19 -0
  121. data/spec/operators/concat_spec.rb +26 -0
  122. data/spec/operators/intersect_spec.rb +37 -0
  123. data/spec/operators/operator_spec.rb +16 -0
  124. data/spec/operators/power_spec.rb +57 -0
  125. data/spec/operators/product_spec.rb +39 -0
  126. data/spec/operators/subtract_spec.rb +37 -0
  127. data/spec/operators/unary_operator_spec.rb +14 -0
  128. data/spec/operators/union_spec.rb +37 -0
  129. data/spec/operators/unique_spec.rb +25 -0
  130. data/spec/spec_helper.rb +2 -1
  131. data/spec/unique_filter_spec.rb +108 -18
  132. data/spec/wordlist_spec.rb +55 -3
  133. data/spec/words_spec.rb +41 -0
  134. data/wordlist.gemspec +1 -0
  135. metadata +164 -126
  136. data/lib/wordlist/builders/website.rb +0 -216
  137. data/lib/wordlist/builders.rb +0 -1
  138. data/lib/wordlist/flat_file.rb +0 -47
  139. data/lib/wordlist/list.rb +0 -162
  140. data/lib/wordlist/mutator.rb +0 -113
  141. data/lib/wordlist/parsers.rb +0 -74
  142. data/lib/wordlist/runners/list.rb +0 -116
  143. data/lib/wordlist/runners/runner.rb +0 -67
  144. data/lib/wordlist/runners.rb +0 -2
  145. data/scripts/benchmark +0 -59
  146. data/scripts/text/comedy_of_errors.txt +0 -4011
  147. data/spec/classes/parser_class.rb +0 -7
  148. data/spec/classes/test_list.rb +0 -9
  149. data/spec/flat_file_spec.rb +0 -25
  150. data/spec/list_spec.rb +0 -58
  151. data/spec/mutator_spec.rb +0 -43
  152. data/spec/parsers_spec.rb +0 -118
data/spec/classes/parser_class.rb DELETED
@@ -1,7 +0,0 @@
1
- require 'wordlist/parsers'
2
-
3
- class ParserClass
4
-
5
- include Wordlist::Parsers
6
-
7
- end
data/spec/classes/test_list.rb DELETED
@@ -1,9 +0,0 @@
1
- require 'wordlist/list'
2
-
3
- class TestList < Wordlist::List
4
-
5
- def each_word
6
- yield 'omg.hackers'
7
- end
8
-
9
- end
data/spec/flat_file_spec.rb DELETED
@@ -1,25 +0,0 @@
1
- require 'wordlist/flat_file'
2
-
3
- require 'spec_helper'
4
-
5
- describe FlatFile do
6
- before(:all) do
7
- @path = File.join(File.dirname(__FILE__),'text','flat_file.txt')
8
- @list = FlatFile.new(@path)
9
- end
10
-
11
- it "should have a path it reads from" do
12
- @list.path.should == @path
13
- end
14
-
15
- it "should read the lines of the flat-file" do
16
- words = ['one', 'two', 'three']
17
-
18
- @list.each_word do |word|
19
- words.include?(word).should == true
20
- words.delete(word)
21
- end
22
-
23
- words.should == []
24
- end
25
- end
data/spec/list_spec.rb DELETED
@@ -1,58 +0,0 @@
1
- require 'wordlist/list'
2
-
3
- require 'spec_helper'
4
- require 'classes/test_list'
5
-
6
- describe List do
7
- before(:all) do
8
- @source = TestList.new
9
- @source.mutate 'o', '0'
10
- @source.mutate 'a', 'A'
11
- @source.mutate 'e', '3'
12
- @source.mutate 's', '5'
13
- end
14
-
15
- it "should iterate over each word" do
16
- words = []
17
-
18
- @source.each_word { |word| words << word }
19
-
20
- words.should == ['omg.hackers']
21
- end
22
-
23
- it "should iterate over each unique word" do
24
- words = []
25
-
26
- @source.each_unique { |word| words << word }
27
-
28
- words.should == ['omg.hackers']
29
- end
30
-
31
- it "should iterate over every possible mutated word" do
32
- mutations = %w{
33
- 0mg.hAck3r5
34
- 0mg.hAck3rs
35
- 0mg.hAcker5
36
- 0mg.hAckers
37
- 0mg.hack3r5
38
- 0mg.hack3rs
39
- 0mg.hacker5
40
- 0mg.hackers
41
- omg.hAck3r5
42
- omg.hAck3rs
43
- omg.hAcker5
44
- omg.hAckers
45
- omg.hack3r5
46
- omg.hack3rs
47
- omg.hacker5
48
- omg.hackers
49
- }
50
-
51
- @source.each_mutation do |mutation|
52
- mutations.include?(mutation).should == true
53
- mutations.delete(mutation)
54
- end
55
-
56
- mutations.should == []
57
- end
58
- end
data/spec/mutator_spec.rb DELETED
@@ -1,43 +0,0 @@
1
- require 'wordlist/mutator'
2
-
3
- require 'spec_helper'
4
-
5
- describe Mutator do
6
- it "should replace matched text with a byte" do
7
- mutator = Mutator.new('o',0x41)
8
- mutator.replace('o').should == 'A'
9
- end
10
-
11
- it "should replace matched text with a String" do
12
- mutator = Mutator.new('o','0')
13
- mutator.replace('o').should == '0'
14
- end
15
-
16
- it "should replace matched text using a proc" do
17
- mutator = Mutator.new('o') { |match| match * 2 }
18
- mutator.replace('o').should == 'oo'
19
- end
20
-
21
- it "should iterate over every possible substitution" do
22
- remaining = ['lolol', 'l0lol', 'lol0l', 'l0l0l']
23
-
24
- mutator = Mutator.new(/o/,'0')
25
- mutator.each('lolol') do |mutation|
26
- remaining.include?(mutation).should == true
27
- remaining.delete(mutation)
28
- end
29
-
30
- remaining.should == []
31
- end
32
-
33
- it "should iterate over the original word, if no matches were found" do
34
- mutations = []
35
- mutator = Mutator.new('x','0')
36
-
37
- mutator.each('hello') do |mutant|
38
- mutations << mutant
39
- end
40
-
41
- mutations.should == ['hello']
42
- end
43
- end
data/spec/parsers_spec.rb DELETED
@@ -1,118 +0,0 @@
1
- require 'spec_helper'
2
- require 'classes/parser_class'
3
-
4
- describe Parsers do
5
- describe "default" do
6
- before(:all) do
7
- @parser = ParserClass.new
8
- end
9
-
10
- it "should parse words from a sentence" do
11
- sentence = %{The Deliverator is in touch with the road, starts like a bad day, stops on a peseta.}
12
- words = %w{The Deliverator is in touch with the road starts like a bad day stops on a peseta}
13
-
14
- @parser.parse(sentence).should == words
15
- end
16
-
17
- it "should ignore punctuation by default while parsing a sentence" do
18
- sentence = %{Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?}
19
- words = %w{
20
- Oh they used to argue over times many corporate driver-years lost to it homeowners red-faced and sweaty with their own lies stinking of Old Spice and job-related stress standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink I swear can't you guys tell time
21
- }
22
-
23
- @parser.parse(sentence).should == words
24
- end
25
-
26
- it "should ignore URLs by default while parsing a sentence" do
27
- sentence = %{Click on the following link: http://www.example.com/}
28
- words = %w{Click on the following link}
29
-
30
- @parser.parse(sentence).should == words
31
- end
32
-
33
- it "should ignore short URIs by default while parsing a sentence" do
34
- sentence = %{Click on the following link: jabber://}
35
- words = %w{Click on the following link}
36
-
37
- @parser.parse(sentence).should == words
38
- end
39
-
40
- it "should ignore complex HTTP URLs by default while parsing a sentence" do
41
- sentence = %{Click on the following link: http://www.google.com/search?hl=en&client=firefox-a&rls=org.mozilla:en-US:official&hs=jU&q=ruby+datamapper&start=20&sa=N}
42
- words = %w{Click on the following link}
43
-
44
- @parser.parse(sentence).should == words
45
- end
46
- end
47
-
48
- describe "ignoring phone numbers" do
49
- before(:all) do
50
- @parser = ParserClass.new
51
- @parser.ignore_phone_numbers = true
52
- end
53
-
54
- it "may ignore phone numbers while parsing a sentence" do
55
- sentence = %{Call me before 12, 1-888-444-2222.}
56
- words = %w{Call me before 12}
57
-
58
- @parser.parse(sentence).should == words
59
- end
60
-
61
- it "may ignore long-distance phone numbers while parsing a sentence" do
62
- sentence = %{Call me before 12, 1-444-2222.}
63
- words = %w{Call me before 12}
64
-
65
- @parser.parse(sentence).should == words
66
- end
67
-
68
- it "may ignore short phone numbers while parsing a sentence" do
69
- sentence = %{Call me before 12, 444-2222.}
70
- words = %w{Call me before 12}
71
-
72
- @parser.parse(sentence).should == words
73
- end
74
- end
75
-
76
- describe "ignoring references" do
77
- before(:all) do
78
- @parser = ParserClass.new
79
- @parser.ignore_references = true
80
- end
81
-
82
- it "may ignore RFC style references while parsing a sentence" do
83
- sentence = %{As one can see, it has failed [1].}
84
- words = %w{As one can see it has failed}
85
-
86
- @parser.parse(sentence).should == words
87
- end
88
- end
89
-
90
- describe "ignoring case" do
91
- before(:all) do
92
- @parser = ParserClass.new
93
- @parser.ignore_case = true
94
- end
95
-
96
- it "may ignore case while parsing a sentence" do
97
- sentence = %{The Deliverator is in touch with the road, starts like a bad day, stops on a peseta.}
98
- words = %w{the deliverator is in touch with the road starts like a bad day stops on a peseta}
99
-
100
- @parser.ignore_case = true
101
- @parser.parse(sentence).should == words
102
- end
103
- end
104
-
105
- describe "preserving punctuation" do
106
- before(:all) do
107
- @parser = ParserClass.new
108
- @parser.ignore_punctuation = false
109
- end
110
-
111
- it "may preserve punctuation while parsing a sentence" do
112
- sentence = %{Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?}
113
- words = %w{Oh, they used to argue over times, many corporate driver-years lost to it: homeowners, red-faced and sweaty with their own lies, stinking of Old Spice and job-related stress, standing in their glowing yellow doorways brandishing their Seikos and waving at the clock over the kitchen sink, I swear, can't you guys tell time?}
114
-
115
- @parser.parse(sentence).should == words
116
- end
117
- end
118
- end