corpus-processor 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/.yardopts +1 -0
- data/README.md +235 -34
- data/bin/corpus-processor +3 -3
- data/corpus-processor.gemspec +16 -14
- data/lib/corpus-processor.rb +12 -8
- data/lib/corpus-processor/categories.rb +58 -0
- data/lib/corpus-processor/categories/default.yml +10 -0
- data/lib/corpus-processor/cli.rb +31 -11
- data/lib/corpus-processor/generators.rb +5 -1
- data/lib/corpus-processor/generators/stanford_ner.rb +19 -10
- data/lib/corpus-processor/parsers.rb +5 -1
- data/lib/corpus-processor/parsers/lampada.rb +103 -47
- data/lib/corpus-processor/processor.rb +19 -4
- data/lib/corpus-processor/token.rb +35 -1
- data/lib/corpus-processor/version.rb +1 -1
- data/spec/{integration → corpus-processor}/cli_spec.rb +81 -71
- data/spec/corpus-processor/generators/stanford_ner_spec.rb +57 -0
- data/spec/corpus-processor/parsers/lampada_spec.rb +333 -0
- data/spec/corpus-processor/processor_spec.rb +36 -0
- data/spec/corpus-processor/token_spec.rb +15 -0
- data/spec/spec_helper.rb +7 -4
- metadata +39 -27
- data/lib/corpus-processor/default_categories.rb +0 -14
- data/lib/corpus-processor/tokenizer.rb +0 -17
- data/lib/corpus-processor/traverser.rb +0 -19
- data/spec/unit/generators/stanford_ner_spec.rb +0 -46
- data/spec/unit/parsers/lampada_spec.rb +0 -269
- data/spec/unit/processor.rb +0 -37
- data/spec/unit/token_spec.rb +0 -8
- data/spec/unit/tokenizer_spec.rb +0 -121
- data/spec/unit/traverser_spec.rb +0 -68
data/spec/unit/processor.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Processor do
|
4
|
-
subject(:processor) { CorpusProcessor::Processor.new(parser, generator) }
|
5
|
-
|
6
|
-
describe "#process" do
|
7
|
-
subject { processor.process(corpus) }
|
8
|
-
|
9
|
-
let(:corpus) { "Some corpus" }
|
10
|
-
let(:processed_corpus) {
|
11
|
-
<<-CORPUS
|
12
|
-
Some O
|
13
|
-
corpus O
|
14
|
-
CORPUS
|
15
|
-
}
|
16
|
-
let(:tokens) {
|
17
|
-
[
|
18
|
-
CorpusProcessor::Token.new("Some"),
|
19
|
-
CorpusProcessor::Token.new("corpus"),
|
20
|
-
]
|
21
|
-
}
|
22
|
-
let(:parser) { double :parser }
|
23
|
-
let(:generator) { double :generator }
|
24
|
-
|
25
|
-
specify {
|
26
|
-
parser.should_receive(:parse)
|
27
|
-
.with(corpus)
|
28
|
-
.and_return(tokens)
|
29
|
-
|
30
|
-
generator.should_receive(:generate)
|
31
|
-
.with(tokens)
|
32
|
-
.and_return(processed_corpus)
|
33
|
-
|
34
|
-
subject.should == processed_corpus
|
35
|
-
}
|
36
|
-
end
|
37
|
-
end
|
data/spec/unit/token_spec.rb
DELETED
data/spec/unit/tokenizer_spec.rb
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Tokenizer do
|
4
|
-
subject(:tokenizer) { CorpusProcessor::Tokenizer.new }
|
5
|
-
|
6
|
-
describe "#tokenize" do
|
7
|
-
subject { tokenizer.tokenize(text, category) }
|
8
|
-
|
9
|
-
let(:category) { nil }
|
10
|
-
|
11
|
-
context "empty string" do
|
12
|
-
let(:text) { "" }
|
13
|
-
|
14
|
-
it { should == [] }
|
15
|
-
end
|
16
|
-
|
17
|
-
context "one word" do
|
18
|
-
let(:text) { "banana" }
|
19
|
-
|
20
|
-
it { should == [CorpusProcessor::Token.new("banana")] }
|
21
|
-
end
|
22
|
-
|
23
|
-
context "two words" do
|
24
|
-
let(:text) { "good banana" }
|
25
|
-
|
26
|
-
it { should == [
|
27
|
-
CorpusProcessor::Token.new("good"),
|
28
|
-
CorpusProcessor::Token.new("banana"),
|
29
|
-
] }
|
30
|
-
end
|
31
|
-
|
32
|
-
context "ponctuation" do
|
33
|
-
let(:text) { "good, banana" }
|
34
|
-
|
35
|
-
it { should == [
|
36
|
-
CorpusProcessor::Token.new("good"),
|
37
|
-
CorpusProcessor::Token.new("banana"),
|
38
|
-
] }
|
39
|
-
end
|
40
|
-
|
41
|
-
context "default category" do
|
42
|
-
let(:text) { "Google" }
|
43
|
-
let(:category) { :organization }
|
44
|
-
|
45
|
-
it { should == [
|
46
|
-
CorpusProcessor::Token.new("Google", :organization),
|
47
|
-
] }
|
48
|
-
end
|
49
|
-
|
50
|
-
context "with tags" do
|
51
|
-
let(:text) { "good<lalala/>, banana" }
|
52
|
-
|
53
|
-
it { should == [
|
54
|
-
CorpusProcessor::Token.new("good"),
|
55
|
-
CorpusProcessor::Token.new("banana"),
|
56
|
-
] }
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
describe "#strip_tags" do
|
61
|
-
subject { tokenizer.strip_tags(text) }
|
62
|
-
|
63
|
-
context "empty text" do
|
64
|
-
let(:text) { "" }
|
65
|
-
|
66
|
-
it { should == "" }
|
67
|
-
end
|
68
|
-
|
69
|
-
context "self closed tag" do
|
70
|
-
let(:text) { "<br/>" }
|
71
|
-
|
72
|
-
it { should == "" }
|
73
|
-
end
|
74
|
-
|
75
|
-
context "tag with content" do
|
76
|
-
let(:text) { "<p>Some text</p>" }
|
77
|
-
|
78
|
-
it { should == "Some text" }
|
79
|
-
end
|
80
|
-
|
81
|
-
context "content after tag" do
|
82
|
-
let(:text) { "<p>Some<br/>text</p>" }
|
83
|
-
|
84
|
-
it { should == "Some text" }
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
describe "#join_lines" do
|
89
|
-
subject { tokenizer.join_lines(text) }
|
90
|
-
|
91
|
-
context "empty text" do
|
92
|
-
let(:text) { "" }
|
93
|
-
|
94
|
-
it { should == "" }
|
95
|
-
end
|
96
|
-
|
97
|
-
context "one word" do
|
98
|
-
let(:text) { "banana" }
|
99
|
-
|
100
|
-
it { should == "banana" }
|
101
|
-
end
|
102
|
-
|
103
|
-
context "two lines" do
|
104
|
-
let(:text) { "banana\nquiabo" }
|
105
|
-
|
106
|
-
it { should == "banana quiabo" }
|
107
|
-
end
|
108
|
-
|
109
|
-
context "line with empty space" do
|
110
|
-
let(:text) { "banana\n \nquiabo" }
|
111
|
-
|
112
|
-
it { should == "banana quiabo" }
|
113
|
-
end
|
114
|
-
|
115
|
-
context "leading spaces" do
|
116
|
-
let(:text) { " \n banana\n \nquiabo \n" }
|
117
|
-
|
118
|
-
it { should == "banana quiabo" }
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
data/spec/unit/traverser_spec.rb
DELETED
@@ -1,68 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Traverser do
|
4
|
-
subject(:traverser) { CorpusProcessor::Traverser.new }
|
5
|
-
|
6
|
-
describe "#traverse" do
|
7
|
-
subject { traverser.traverse(text, regexp) }
|
8
|
-
|
9
|
-
context "empty text" do
|
10
|
-
let(:text) { "" }
|
11
|
-
let(:regexp) { // }
|
12
|
-
|
13
|
-
specify {
|
14
|
-
expect { |mock_block|
|
15
|
-
traverser.traverse(text, regexp, &mock_block)
|
16
|
-
}.not_to yield_control
|
17
|
-
}
|
18
|
-
end
|
19
|
-
|
20
|
-
context "simple text" do
|
21
|
-
let(:text) { "abc" }
|
22
|
-
let(:regexp) { /b/ }
|
23
|
-
|
24
|
-
specify {
|
25
|
-
expect { |mock_block|
|
26
|
-
traverser.traverse(text, regexp, &mock_block)
|
27
|
-
}.to yield_successive_args "a", text.match(regexp), "c"
|
28
|
-
}
|
29
|
-
end
|
30
|
-
|
31
|
-
context "two matches" do
|
32
|
-
let(:text) { "abcbd" }
|
33
|
-
let(:regexp) { /b/ }
|
34
|
-
|
35
|
-
specify {
|
36
|
-
expect { |mock_block|
|
37
|
-
traverser.traverse(text, regexp, &mock_block)
|
38
|
-
}.to yield_successive_args "a",
|
39
|
-
text.match(regexp),
|
40
|
-
"c",
|
41
|
-
text[2..-1].match(regexp),
|
42
|
-
"d"
|
43
|
-
}
|
44
|
-
end
|
45
|
-
|
46
|
-
context "match in beginning" do
|
47
|
-
let(:text) { "bc" }
|
48
|
-
let(:regexp) { /b/ }
|
49
|
-
|
50
|
-
specify {
|
51
|
-
expect { |mock_block|
|
52
|
-
traverser.traverse(text, regexp, &mock_block)
|
53
|
-
}.to yield_successive_args text.match(regexp), "c"
|
54
|
-
}
|
55
|
-
end
|
56
|
-
|
57
|
-
context "match in ending" do
|
58
|
-
let(:text) { "bc" }
|
59
|
-
let(:regexp) { /c/ }
|
60
|
-
|
61
|
-
specify {
|
62
|
-
expect { |mock_block|
|
63
|
-
traverser.traverse(text, regexp, &mock_block)
|
64
|
-
}.to yield_successive_args "b", text.match(regexp)
|
65
|
-
}
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|