wordlist 0.1.1 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/workflows/ruby.yml +28 -0
- data/.gitignore +6 -3
- data/ChangeLog.md +55 -1
- data/Gemfile +15 -0
- data/LICENSE.txt +1 -3
- data/README.md +301 -60
- data/Rakefile +7 -32
- data/benchmarks.rb +115 -0
- data/bin/wordlist +4 -7
- data/data/stop_words/ar.txt +104 -0
- data/data/stop_words/bg.txt +259 -0
- data/data/stop_words/bn.txt +363 -0
- data/data/stop_words/ca.txt +126 -0
- data/data/stop_words/cs.txt +138 -0
- data/data/stop_words/da.txt +101 -0
- data/data/stop_words/de.txt +129 -0
- data/data/stop_words/el.txt +79 -0
- data/data/stop_words/en.txt +175 -0
- data/data/stop_words/es.txt +178 -0
- data/data/stop_words/eu.txt +98 -0
- data/data/stop_words/fa.txt +332 -0
- data/data/stop_words/fi.txt +747 -0
- data/data/stop_words/fr.txt +116 -0
- data/data/stop_words/ga.txt +109 -0
- data/data/stop_words/gl.txt +160 -0
- data/data/stop_words/he.txt +499 -0
- data/data/stop_words/hi.txt +97 -0
- data/data/stop_words/hr.txt +179 -0
- data/data/stop_words/hu.txt +35 -0
- data/data/stop_words/hy.txt +45 -0
- data/data/stop_words/id.txt +357 -0
- data/data/stop_words/it.txt +134 -0
- data/data/stop_words/ja.txt +44 -0
- data/data/stop_words/ko.txt +677 -0
- data/data/stop_words/ku.txt +63 -0
- data/data/stop_words/lt.txt +507 -0
- data/data/stop_words/lv.txt +163 -0
- data/data/stop_words/mr.txt +99 -0
- data/data/stop_words/nl.txt +48 -0
- data/data/stop_words/no.txt +172 -0
- data/data/stop_words/pl.txt +138 -0
- data/data/stop_words/pt.txt +147 -0
- data/data/stop_words/ro.txt +281 -0
- data/data/stop_words/ru.txt +421 -0
- data/data/stop_words/sk.txt +173 -0
- data/data/stop_words/sv.txt +386 -0
- data/data/stop_words/th.txt +115 -0
- data/data/stop_words/tr.txt +114 -0
- data/data/stop_words/uk.txt +28 -0
- data/data/stop_words/ur.txt +513 -0
- data/data/stop_words/zh.txt +125 -0
- data/gemspec.yml +13 -12
- data/lib/wordlist/abstract_wordlist.rb +25 -0
- data/lib/wordlist/builder.rb +172 -138
- data/lib/wordlist/cli.rb +459 -0
- data/lib/wordlist/compression/reader.rb +72 -0
- data/lib/wordlist/compression/writer.rb +80 -0
- data/lib/wordlist/exceptions.rb +31 -0
- data/lib/wordlist/file.rb +177 -0
- data/lib/wordlist/format.rb +39 -0
- data/lib/wordlist/lexer/lang.rb +34 -0
- data/lib/wordlist/lexer/stop_words.rb +69 -0
- data/lib/wordlist/lexer.rb +221 -0
- data/lib/wordlist/list_methods.rb +462 -0
- data/lib/wordlist/modifiers/capitalize.rb +46 -0
- data/lib/wordlist/modifiers/downcase.rb +46 -0
- data/lib/wordlist/modifiers/gsub.rb +52 -0
- data/lib/wordlist/modifiers/modifier.rb +44 -0
- data/lib/wordlist/modifiers/mutate.rb +134 -0
- data/lib/wordlist/modifiers/mutate_case.rb +26 -0
- data/lib/wordlist/modifiers/sub.rb +98 -0
- data/lib/wordlist/modifiers/tr.rb +72 -0
- data/lib/wordlist/modifiers/upcase.rb +46 -0
- data/lib/wordlist/modifiers.rb +9 -0
- data/lib/wordlist/operators/binary_operator.rb +39 -0
- data/lib/wordlist/operators/concat.rb +48 -0
- data/lib/wordlist/operators/intersect.rb +56 -0
- data/lib/wordlist/operators/operator.rb +29 -0
- data/lib/wordlist/operators/power.rb +73 -0
- data/lib/wordlist/operators/product.rb +51 -0
- data/lib/wordlist/operators/subtract.rb +55 -0
- data/lib/wordlist/operators/unary_operator.rb +30 -0
- data/lib/wordlist/operators/union.rb +62 -0
- data/lib/wordlist/operators/unique.rb +53 -0
- data/lib/wordlist/operators.rb +8 -0
- data/lib/wordlist/unique_filter.rb +41 -61
- data/lib/wordlist/version.rb +4 -2
- data/lib/wordlist/words.rb +72 -0
- data/lib/wordlist.rb +104 -2
- data/spec/abstract_list_spec.rb +18 -0
- data/spec/builder_spec.rb +220 -76
- data/spec/cli_spec.rb +802 -0
- data/spec/compression/reader_spec.rb +137 -0
- data/spec/compression/writer_spec.rb +194 -0
- data/spec/file_spec.rb +269 -0
- data/spec/fixtures/wordlist.txt +15 -0
- data/spec/fixtures/wordlist.txt.bz2 +0 -0
- data/spec/fixtures/wordlist.txt.gz +0 -0
- data/spec/fixtures/wordlist.txt.xz +0 -0
- data/spec/fixtures/wordlist_with_ambiguous_format +3 -0
- data/spec/fixtures/wordlist_with_comments.txt +19 -0
- data/spec/fixtures/wordlist_with_empty_lines.txt +19 -0
- data/spec/format_spec.rb +50 -0
- data/spec/helpers/text.rb +3 -3
- data/spec/helpers/wordlist.rb +2 -2
- data/spec/lexer/lang_spec.rb +70 -0
- data/spec/lexer/stop_words_spec.rb +77 -0
- data/spec/lexer_spec.rb +718 -0
- data/spec/list_methods_spec.rb +181 -0
- data/spec/modifiers/capitalize_spec.rb +27 -0
- data/spec/modifiers/downcase_spec.rb +27 -0
- data/spec/modifiers/gsub_spec.rb +59 -0
- data/spec/modifiers/modifier_spec.rb +20 -0
- data/spec/modifiers/mutate_case_spec.rb +46 -0
- data/spec/modifiers/mutate_spec.rb +39 -0
- data/spec/modifiers/sub_spec.rb +98 -0
- data/spec/modifiers/tr_spec.rb +46 -0
- data/spec/modifiers/upcase_spec.rb +27 -0
- data/spec/operators/binary_operator_spec.rb +19 -0
- data/spec/operators/concat_spec.rb +26 -0
- data/spec/operators/intersect_spec.rb +37 -0
- data/spec/operators/operator_spec.rb +16 -0
- data/spec/operators/power_spec.rb +57 -0
- data/spec/operators/product_spec.rb +39 -0
- data/spec/operators/subtract_spec.rb +37 -0
- data/spec/operators/unary_operator_spec.rb +14 -0
- data/spec/operators/union_spec.rb +37 -0
- data/spec/operators/unique_spec.rb +25 -0
- data/spec/spec_helper.rb +2 -1
- data/spec/unique_filter_spec.rb +108 -18
- data/spec/wordlist_spec.rb +55 -3
- data/spec/words_spec.rb +41 -0
- data/wordlist.gemspec +1 -0
- metadata +164 -126
- data/lib/wordlist/builders/website.rb +0 -216
- data/lib/wordlist/builders.rb +0 -1
- data/lib/wordlist/flat_file.rb +0 -47
- data/lib/wordlist/list.rb +0 -162
- data/lib/wordlist/mutator.rb +0 -113
- data/lib/wordlist/parsers.rb +0 -74
- data/lib/wordlist/runners/list.rb +0 -116
- data/lib/wordlist/runners/runner.rb +0 -67
- data/lib/wordlist/runners.rb +0 -2
- data/scripts/benchmark +0 -59
- data/scripts/text/comedy_of_errors.txt +0 -4011
- data/spec/classes/parser_class.rb +0 -7
- data/spec/classes/test_list.rb +0 -9
- data/spec/flat_file_spec.rb +0 -25
- data/spec/list_spec.rb +0 -58
- data/spec/mutator_spec.rb +0 -43
- data/spec/parsers_spec.rb +0 -118
data/spec/format_spec.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'wordlist/format'
|
3
|
+
|
4
|
+
describe Wordlist::Format do
|
5
|
+
describe ".infer" do
|
6
|
+
context "when given a path ending in '.txt'" do
|
7
|
+
it "must return :txt" do
|
8
|
+
expect(subject.infer("path/to/file.txt")).to eq(:txt)
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
context "when given a path ending in '.gz'" do
|
13
|
+
it "must return :gzip" do
|
14
|
+
expect(subject.infer("path/to/file.gz")).to eq(:gzip)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
context "when given a path ending in '.bz2'" do
|
19
|
+
it "must return :bzip2" do
|
20
|
+
expect(subject.infer("path/to/file.bz2")).to eq(:bzip2)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
context "when given a path ending in '.xz'" do
|
25
|
+
it "must return :xz" do
|
26
|
+
expect(subject.infer("path/to/file.xz")).to eq(:xz)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
context "when given a path ending in another file extension" do
|
31
|
+
let(:path) { "path/to/file.foo" }
|
32
|
+
|
33
|
+
it do
|
34
|
+
expect {
|
35
|
+
subject.infer(path)
|
36
|
+
}.to raise_error(Wordlist::UnknownFormat,"could not infer the format of file: #{path.inspect}")
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
context "when given a path has no file extension" do
|
41
|
+
let(:path) { "path/to/file" }
|
42
|
+
|
43
|
+
it do
|
44
|
+
expect {
|
45
|
+
subject.infer(path)
|
46
|
+
}.to raise_error(Wordlist::UnknownFormat,"could not infer the format of file: #{path.inspect}")
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
data/spec/helpers/text.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Helpers
|
2
|
-
TEXT_DIR = File.expand_path(File.join(File.dirname(__FILE__),'..','text'))
|
2
|
+
TEXT_DIR = ::File.expand_path(::File.join(::File.dirname(__FILE__),'..','text'))
|
3
3
|
|
4
|
-
SAMPLE_TEXT = File.join(TEXT_DIR,'sample.txt')
|
5
|
-
PREVIOUS_WORDLIST = File.join(TEXT_DIR,'previous_wordlist.txt')
|
4
|
+
SAMPLE_TEXT = ::File.join(TEXT_DIR,'sample.txt')
|
5
|
+
PREVIOUS_WORDLIST = ::File.join(TEXT_DIR,'previous_wordlist.txt')
|
6
6
|
end
|
data/spec/helpers/wordlist.rb
CHANGED
@@ -5,14 +5,14 @@ module Helpers
|
|
5
5
|
def wordlist_tempfile(existing_file=nil)
|
6
6
|
path = Tempfile.new('wordlist').path
|
7
7
|
|
8
|
-
FileUtils.cp(existing_file,path) if existing_file
|
8
|
+
::FileUtils.cp(existing_file,path) if existing_file
|
9
9
|
return path
|
10
10
|
end
|
11
11
|
|
12
12
|
def should_contain_words(path,expected)
|
13
13
|
words = []
|
14
14
|
|
15
|
-
File.open(path) do |file|
|
15
|
+
::File.open(path) do |file|
|
16
16
|
file.each_line do |line|
|
17
17
|
words << line.chomp
|
18
18
|
end
|
data/spec/lexer/lang_spec.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'wordlist/lexer/lang'
|
3
|
+
|
4
|
+
describe Wordlist::Lexer::Lang do
|
5
|
+
describe ".default" do
|
6
|
+
subject { described_class }
|
7
|
+
|
8
|
+
context "when LANG is set" do
|
9
|
+
context "and is of the form xx" do
|
10
|
+
let(:env) { {'LANG' => 'xx'} }
|
11
|
+
|
12
|
+
before { stub_const('ENV', env) }
|
13
|
+
|
14
|
+
it "must return xx as a Symbol" do
|
15
|
+
expect(subject.default).to be(:xx)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
context "and is of the form xx_YY" do
|
20
|
+
let(:env) { {'LANG' => 'xx_YY'} }
|
21
|
+
|
22
|
+
before { stub_const('ENV', env) }
|
23
|
+
|
24
|
+
it "must return xx as a Symbol" do
|
25
|
+
expect(subject.default).to be(:xx)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
context "and is of the form xx_YY.UTF-8" do
|
30
|
+
let(:env) { {'LANG' => 'xx_YY.UTF-8'} }
|
31
|
+
|
32
|
+
before { stub_const('ENV', env) }
|
33
|
+
|
34
|
+
it "must return xx as a Symbol" do
|
35
|
+
expect(subject.default).to be(:xx)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
context "and is of the form C.UTF-8" do
|
40
|
+
let(:env) { {'LANG' => 'C.UTF-8'} }
|
41
|
+
|
42
|
+
before { stub_const('ENV', env) }
|
43
|
+
|
44
|
+
it "must return :en" do
|
45
|
+
expect(subject.default).to be(:en)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
context "when LANG is C" do
|
51
|
+
let(:env) { {'LANG' => 'C'} }
|
52
|
+
|
53
|
+
before { stub_const('ENV', env) }
|
54
|
+
|
55
|
+
it "must default to :en" do
|
56
|
+
expect(subject.default).to be(:en)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
context "when LANG is not set" do
|
61
|
+
let(:env) { {} }
|
62
|
+
|
63
|
+
before { stub_const('ENV', env) }
|
64
|
+
|
65
|
+
it "must default to :en" do
|
66
|
+
expect(subject.default).to be(:en)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
data/spec/lexer/stop_words_spec.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'wordlist/lexer/stop_words'
|
3
|
+
|
4
|
+
describe Wordlist::Lexer::StopWords do
|
5
|
+
describe "DIRECTORY" do
|
6
|
+
subject { described_class::DIRECTORY }
|
7
|
+
|
8
|
+
it "must exist" do
|
9
|
+
expect(File.directory?(subject)).to be(true)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
describe ".path_for" do
|
14
|
+
let(:lang) { :en }
|
15
|
+
|
16
|
+
subject { described_class.path_for(lang) }
|
17
|
+
|
18
|
+
it "must return the path to the .txt file in DIRECTORY" do
|
19
|
+
expect(File.dirname(subject)).to eq(described_class::DIRECTORY)
|
20
|
+
expect(File.basename(subject)).to eq("#{lang}.txt")
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
describe ".read" do
|
25
|
+
context "when given a supported language" do
|
26
|
+
let(:lang) { :en }
|
27
|
+
|
28
|
+
let(:expected_words) do
|
29
|
+
File.readlines(subject.path_for(lang)).map(&:chomp)
|
30
|
+
end
|
31
|
+
|
32
|
+
it "must return the words in the .txt file for the language" do
|
33
|
+
expect(subject.read(lang)).to eq(expected_words)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
context "when given an invalid language" do
|
38
|
+
let(:lang) { :foo }
|
39
|
+
|
40
|
+
it do
|
41
|
+
expect {
|
42
|
+
subject.read(lang)
|
43
|
+
}.to raise_error(Wordlist::UnsupportedLanguage,"unsupported language: #{lang}")
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe ".[]" do
|
49
|
+
context "when given a supported language" do
|
50
|
+
let(:lang) { :en }
|
51
|
+
|
52
|
+
let(:expected_words) do
|
53
|
+
File.readlines(subject.path_for(lang)).map(&:chomp)
|
54
|
+
end
|
55
|
+
|
56
|
+
it "must return the words in the .txt file for the language" do
|
57
|
+
expect(subject[lang]).to eq(expected_words)
|
58
|
+
end
|
59
|
+
|
60
|
+
context "when called multiple times" do
|
61
|
+
it "must return the same cached stop words" do
|
62
|
+
expect(subject[lang]).to be(subject[lang])
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
context "when given an invalid language" do
|
68
|
+
let(:lang) { :foo }
|
69
|
+
|
70
|
+
it do
|
71
|
+
expect {
|
72
|
+
subject[lang]
|
73
|
+
}.to raise_error(Wordlist::UnsupportedLanguage,"unsupported language: #{lang}")
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|