wordlist 0.1.1 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (152) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ruby.yml +28 -0
  3. data/.gitignore +6 -3
  4. data/ChangeLog.md +55 -1
  5. data/Gemfile +15 -0
  6. data/LICENSE.txt +1 -3
  7. data/README.md +301 -60
  8. data/Rakefile +7 -32
  9. data/benchmarks.rb +115 -0
  10. data/bin/wordlist +4 -7
  11. data/data/stop_words/ar.txt +104 -0
  12. data/data/stop_words/bg.txt +259 -0
  13. data/data/stop_words/bn.txt +363 -0
  14. data/data/stop_words/ca.txt +126 -0
  15. data/data/stop_words/cs.txt +138 -0
  16. data/data/stop_words/da.txt +101 -0
  17. data/data/stop_words/de.txt +129 -0
  18. data/data/stop_words/el.txt +79 -0
  19. data/data/stop_words/en.txt +175 -0
  20. data/data/stop_words/es.txt +178 -0
  21. data/data/stop_words/eu.txt +98 -0
  22. data/data/stop_words/fa.txt +332 -0
  23. data/data/stop_words/fi.txt +747 -0
  24. data/data/stop_words/fr.txt +116 -0
  25. data/data/stop_words/ga.txt +109 -0
  26. data/data/stop_words/gl.txt +160 -0
  27. data/data/stop_words/he.txt +499 -0
  28. data/data/stop_words/hi.txt +97 -0
  29. data/data/stop_words/hr.txt +179 -0
  30. data/data/stop_words/hu.txt +35 -0
  31. data/data/stop_words/hy.txt +45 -0
  32. data/data/stop_words/id.txt +357 -0
  33. data/data/stop_words/it.txt +134 -0
  34. data/data/stop_words/ja.txt +44 -0
  35. data/data/stop_words/ko.txt +677 -0
  36. data/data/stop_words/ku.txt +63 -0
  37. data/data/stop_words/lt.txt +507 -0
  38. data/data/stop_words/lv.txt +163 -0
  39. data/data/stop_words/mr.txt +99 -0
  40. data/data/stop_words/nl.txt +48 -0
  41. data/data/stop_words/no.txt +172 -0
  42. data/data/stop_words/pl.txt +138 -0
  43. data/data/stop_words/pt.txt +147 -0
  44. data/data/stop_words/ro.txt +281 -0
  45. data/data/stop_words/ru.txt +421 -0
  46. data/data/stop_words/sk.txt +173 -0
  47. data/data/stop_words/sv.txt +386 -0
  48. data/data/stop_words/th.txt +115 -0
  49. data/data/stop_words/tr.txt +114 -0
  50. data/data/stop_words/uk.txt +28 -0
  51. data/data/stop_words/ur.txt +513 -0
  52. data/data/stop_words/zh.txt +125 -0
  53. data/gemspec.yml +13 -12
  54. data/lib/wordlist/abstract_wordlist.rb +25 -0
  55. data/lib/wordlist/builder.rb +172 -138
  56. data/lib/wordlist/cli.rb +459 -0
  57. data/lib/wordlist/compression/reader.rb +72 -0
  58. data/lib/wordlist/compression/writer.rb +80 -0
  59. data/lib/wordlist/exceptions.rb +31 -0
  60. data/lib/wordlist/file.rb +177 -0
  61. data/lib/wordlist/format.rb +39 -0
  62. data/lib/wordlist/lexer/lang.rb +34 -0
  63. data/lib/wordlist/lexer/stop_words.rb +69 -0
  64. data/lib/wordlist/lexer.rb +221 -0
  65. data/lib/wordlist/list_methods.rb +462 -0
  66. data/lib/wordlist/modifiers/capitalize.rb +46 -0
  67. data/lib/wordlist/modifiers/downcase.rb +46 -0
  68. data/lib/wordlist/modifiers/gsub.rb +52 -0
  69. data/lib/wordlist/modifiers/modifier.rb +44 -0
  70. data/lib/wordlist/modifiers/mutate.rb +134 -0
  71. data/lib/wordlist/modifiers/mutate_case.rb +26 -0
  72. data/lib/wordlist/modifiers/sub.rb +98 -0
  73. data/lib/wordlist/modifiers/tr.rb +72 -0
  74. data/lib/wordlist/modifiers/upcase.rb +46 -0
  75. data/lib/wordlist/modifiers.rb +9 -0
  76. data/lib/wordlist/operators/binary_operator.rb +39 -0
  77. data/lib/wordlist/operators/concat.rb +48 -0
  78. data/lib/wordlist/operators/intersect.rb +56 -0
  79. data/lib/wordlist/operators/operator.rb +29 -0
  80. data/lib/wordlist/operators/power.rb +73 -0
  81. data/lib/wordlist/operators/product.rb +51 -0
  82. data/lib/wordlist/operators/subtract.rb +55 -0
  83. data/lib/wordlist/operators/unary_operator.rb +30 -0
  84. data/lib/wordlist/operators/union.rb +62 -0
  85. data/lib/wordlist/operators/unique.rb +53 -0
  86. data/lib/wordlist/operators.rb +8 -0
  87. data/lib/wordlist/unique_filter.rb +41 -61
  88. data/lib/wordlist/version.rb +4 -2
  89. data/lib/wordlist/words.rb +72 -0
  90. data/lib/wordlist.rb +104 -2
  91. data/spec/abstract_list_spec.rb +18 -0
  92. data/spec/builder_spec.rb +220 -76
  93. data/spec/cli_spec.rb +802 -0
  94. data/spec/compression/reader_spec.rb +137 -0
  95. data/spec/compression/writer_spec.rb +194 -0
  96. data/spec/file_spec.rb +269 -0
  97. data/spec/fixtures/wordlist.txt +15 -0
  98. data/spec/fixtures/wordlist.txt.bz2 +0 -0
  99. data/spec/fixtures/wordlist.txt.gz +0 -0
  100. data/spec/fixtures/wordlist.txt.xz +0 -0
  101. data/spec/fixtures/wordlist_with_ambiguous_format +3 -0
  102. data/spec/fixtures/wordlist_with_comments.txt +19 -0
  103. data/spec/fixtures/wordlist_with_empty_lines.txt +19 -0
  104. data/spec/format_spec.rb +50 -0
  105. data/spec/helpers/text.rb +3 -3
  106. data/spec/helpers/wordlist.rb +2 -2
  107. data/spec/lexer/lang_spec.rb +70 -0
  108. data/spec/lexer/stop_words_spec.rb +77 -0
  109. data/spec/lexer_spec.rb +718 -0
  110. data/spec/list_methods_spec.rb +181 -0
  111. data/spec/modifiers/capitalize_spec.rb +27 -0
  112. data/spec/modifiers/downcase_spec.rb +27 -0
  113. data/spec/modifiers/gsub_spec.rb +59 -0
  114. data/spec/modifiers/modifier_spec.rb +20 -0
  115. data/spec/modifiers/mutate_case_spec.rb +46 -0
  116. data/spec/modifiers/mutate_spec.rb +39 -0
  117. data/spec/modifiers/sub_spec.rb +98 -0
  118. data/spec/modifiers/tr_spec.rb +46 -0
  119. data/spec/modifiers/upcase_spec.rb +27 -0
  120. data/spec/operators/binary_operator_spec.rb +19 -0
  121. data/spec/operators/concat_spec.rb +26 -0
  122. data/spec/operators/intersect_spec.rb +37 -0
  123. data/spec/operators/operator_spec.rb +16 -0
  124. data/spec/operators/power_spec.rb +57 -0
  125. data/spec/operators/product_spec.rb +39 -0
  126. data/spec/operators/subtract_spec.rb +37 -0
  127. data/spec/operators/unary_operator_spec.rb +14 -0
  128. data/spec/operators/union_spec.rb +37 -0
  129. data/spec/operators/unique_spec.rb +25 -0
  130. data/spec/spec_helper.rb +2 -1
  131. data/spec/unique_filter_spec.rb +108 -18
  132. data/spec/wordlist_spec.rb +55 -3
  133. data/spec/words_spec.rb +41 -0
  134. data/wordlist.gemspec +1 -0
  135. metadata +164 -126
  136. data/lib/wordlist/builders/website.rb +0 -216
  137. data/lib/wordlist/builders.rb +0 -1
  138. data/lib/wordlist/flat_file.rb +0 -47
  139. data/lib/wordlist/list.rb +0 -162
  140. data/lib/wordlist/mutator.rb +0 -113
  141. data/lib/wordlist/parsers.rb +0 -74
  142. data/lib/wordlist/runners/list.rb +0 -116
  143. data/lib/wordlist/runners/runner.rb +0 -67
  144. data/lib/wordlist/runners.rb +0 -2
  145. data/scripts/benchmark +0 -59
  146. data/scripts/text/comedy_of_errors.txt +0 -4011
  147. data/spec/classes/parser_class.rb +0 -7
  148. data/spec/classes/test_list.rb +0 -9
  149. data/spec/flat_file_spec.rb +0 -25
  150. data/spec/list_spec.rb +0 -58
  151. data/spec/mutator_spec.rb +0 -43
  152. data/spec/parsers_spec.rb +0 -118
@@ -0,0 +1,50 @@
1
+ require 'spec_helper'
2
+ require 'wordlist/format'
3
+
4
+ describe Wordlist::Format do
5
+ describe ".infer" do
6
+ context "when given a path ending in '.txt'" do
7
+ it "must return :txt" do
8
+ expect(subject.infer("path/to/file.txt")).to eq(:txt)
9
+ end
10
+ end
11
+
12
+ context "when given a path ending in '.gz'" do
13
+ it "must return :gzip" do
14
+ expect(subject.infer("path/to/file.gz")).to eq(:gzip)
15
+ end
16
+ end
17
+
18
+ context "when given a path ending in '.bz2'" do
19
+ it "must return :bzip2" do
20
+ expect(subject.infer("path/to/file.bz2")).to eq(:bzip2)
21
+ end
22
+ end
23
+
24
+ context "when given a path ending in '.xz'" do
25
+ it "must return :xz" do
26
+ expect(subject.infer("path/to/file.xz")).to eq(:xz)
27
+ end
28
+ end
29
+
30
+ context "when given a path ending in another file extension" do
31
+ let(:path) { "path/to/file.foo" }
32
+
33
+ it do
34
+ expect {
35
+ subject.infer(path)
36
+ }.to raise_error(Wordlist::UnknownFormat,"could not infer the format of file: #{path.inspect}")
37
+ end
38
+ end
39
+
40
+ context "when given a path has no file extension" do
41
+ let(:path) { "path/to/file" }
42
+
43
+ it do
44
+ expect {
45
+ subject.infer(path)
46
+ }.to raise_error(Wordlist::UnknownFormat,"could not infer the format of file: #{path.inspect}")
47
+ end
48
+ end
49
+ end
50
+ end
data/spec/helpers/text.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  module Helpers
2
- TEXT_DIR = File.expand_path(File.join(File.dirname(__FILE__),'..','text'))
2
+ TEXT_DIR = ::File.expand_path(::File.join(::File.dirname(__FILE__),'..','text'))
3
3
 
4
- SAMPLE_TEXT = File.join(TEXT_DIR,'sample.txt')
5
- PREVIOUS_WORDLIST = File.join(TEXT_DIR,'previous_wordlist.txt')
4
+ SAMPLE_TEXT = ::File.join(TEXT_DIR,'sample.txt')
5
+ PREVIOUS_WORDLIST = ::File.join(TEXT_DIR,'previous_wordlist.txt')
6
6
  end
@@ -5,14 +5,14 @@ module Helpers
5
5
  def wordlist_tempfile(existing_file=nil)
6
6
  path = Tempfile.new('wordlist').path
7
7
 
8
- FileUtils.cp(existing_file,path) if existing_file
8
+ ::FileUtils.cp(existing_file,path) if existing_file
9
9
  return path
10
10
  end
11
11
 
12
12
  def should_contain_words(path,expected)
13
13
  words = []
14
14
 
15
- File.open(path) do |file|
15
+ ::File.open(path) do |file|
16
16
  file.each_line do |line|
17
17
  words << line.chomp
18
18
  end
@@ -0,0 +1,70 @@
1
+ require 'spec_helper'
2
+ require 'wordlist/lexer/lang'
3
+
4
+ describe Wordlist::Lexer::Lang do
5
+ describe ".default" do
6
+ subject { described_class }
7
+
8
+ context "when LANG is set" do
9
+ context "and is of the form xx" do
10
+ let(:env) { {'LANG' => 'xx'} }
11
+
12
+ before { stub_const('ENV', env) }
13
+
14
+ it "must return xx as a Symbol" do
15
+ expect(subject.default).to be(:xx)
16
+ end
17
+ end
18
+
19
+ context "and is of the form xx_YY" do
20
+ let(:env) { {'LANG' => 'xx_YY'} }
21
+
22
+ before { stub_const('ENV', env) }
23
+
24
+ it "must return xx as a Symbol" do
25
+ expect(subject.default).to be(:xx)
26
+ end
27
+ end
28
+
29
+ context "and is of the form xx_YY.UTF-8" do
30
+ let(:env) { {'LANG' => 'xx_YY.UTF-8'} }
31
+
32
+ before { stub_const('ENV', env) }
33
+
34
+ it "must return xx as a Symbol" do
35
+ expect(subject.default).to be(:xx)
36
+ end
37
+ end
38
+
39
+ context "and is of the form C.UTF-8" do
40
+ let(:env) { {'LANG' => 'C.UTF-8'} }
41
+
42
+ before { stub_const('ENV', env) }
43
+
44
+ it "must return :en" do
45
+ expect(subject.default).to be(:en)
46
+ end
47
+ end
48
+ end
49
+
50
+ context "when LANG is C" do
51
+ let(:env) { {'LANG' => 'C'} }
52
+
53
+ before { stub_const('ENV', env) }
54
+
55
+ it "must default to :en" do
56
+ expect(subject.default).to be(:en)
57
+ end
58
+ end
59
+
60
+ context "when LANG is not set" do
61
+ let(:env) { {} }
62
+
63
+ before { stub_const('ENV', env) }
64
+
65
+ it "must default to :en" do
66
+ expect(subject.default).to be(:en)
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,77 @@
1
+ require 'spec_helper'
2
+ require 'wordlist/lexer/stop_words'
3
+
4
+ describe Wordlist::Lexer::StopWords do
5
+ describe "DIRECTORY" do
6
+ subject { described_class::DIRECTORY }
7
+
8
+ it "must exist" do
9
+ expect(File.directory?(subject)).to be(true)
10
+ end
11
+ end
12
+
13
+ describe ".path_for" do
14
+ let(:lang) { :en }
15
+
16
+ subject { described_class.path_for(lang) }
17
+
18
+ it "must return the path to the .txt file in DIRECTORY" do
19
+ expect(File.dirname(subject)).to eq(described_class::DIRECTORY)
20
+ expect(File.basename(subject)).to eq("#{lang}.txt")
21
+ end
22
+ end
23
+
24
+ describe ".read" do
25
+ context "when given a supported language" do
26
+ let(:lang) { :en }
27
+
28
+ let(:expected_words) do
29
+ File.readlines(subject.path_for(lang)).map(&:chomp)
30
+ end
31
+
32
+ it "must return the words in the .txt file for the language" do
33
+ expect(subject.read(lang)).to eq(expected_words)
34
+ end
35
+ end
36
+
37
+ context "when given an invalid language" do
38
+ let(:lang) { :foo }
39
+
40
+ it do
41
+ expect {
42
+ subject.read(lang)
43
+ }.to raise_error(Wordlist::UnsupportedLanguage,"unsupported language: #{lang}")
44
+ end
45
+ end
46
+ end
47
+
48
+ describe ".[]" do
49
+ context "when given a supported language" do
50
+ let(:lang) { :en }
51
+
52
+ let(:expected_words) do
53
+ File.readlines(subject.path_for(lang)).map(&:chomp)
54
+ end
55
+
56
+ it "must return the words in the .txt file for the language" do
57
+ expect(subject[lang]).to eq(expected_words)
58
+ end
59
+
60
+ context "when called multiple times" do
61
+ it "must return the same cached stop words" do
62
+ expect(subject[lang]).to be(subject[lang])
63
+ end
64
+ end
65
+ end
66
+
67
+ context "when given an invalid language" do
68
+ let(:lang) { :foo }
69
+
70
+ it do
71
+ expect {
72
+ subject[lang]
73
+ }.to raise_error(Wordlist::UnsupportedLanguage,"unsupported language: #{lang}")
74
+ end
75
+ end
76
+ end
77
+ end