corpus-processor 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/.yardopts +1 -0
- data/README.md +235 -34
- data/bin/corpus-processor +3 -3
- data/corpus-processor.gemspec +16 -14
- data/lib/corpus-processor.rb +12 -8
- data/lib/corpus-processor/categories.rb +58 -0
- data/lib/corpus-processor/categories/default.yml +10 -0
- data/lib/corpus-processor/cli.rb +31 -11
- data/lib/corpus-processor/generators.rb +5 -1
- data/lib/corpus-processor/generators/stanford_ner.rb +19 -10
- data/lib/corpus-processor/parsers.rb +5 -1
- data/lib/corpus-processor/parsers/lampada.rb +103 -47
- data/lib/corpus-processor/processor.rb +19 -4
- data/lib/corpus-processor/token.rb +35 -1
- data/lib/corpus-processor/version.rb +1 -1
- data/spec/{integration → corpus-processor}/cli_spec.rb +81 -71
- data/spec/corpus-processor/generators/stanford_ner_spec.rb +57 -0
- data/spec/corpus-processor/parsers/lampada_spec.rb +333 -0
- data/spec/corpus-processor/processor_spec.rb +36 -0
- data/spec/corpus-processor/token_spec.rb +15 -0
- data/spec/spec_helper.rb +7 -4
- metadata +39 -27
- data/lib/corpus-processor/default_categories.rb +0 -14
- data/lib/corpus-processor/tokenizer.rb +0 -17
- data/lib/corpus-processor/traverser.rb +0 -19
- data/spec/unit/generators/stanford_ner_spec.rb +0 -46
- data/spec/unit/parsers/lampada_spec.rb +0 -269
- data/spec/unit/processor.rb +0 -37
- data/spec/unit/token_spec.rb +0 -8
- data/spec/unit/tokenizer_spec.rb +0 -121
- data/spec/unit/traverser_spec.rb +0 -68
data/spec/unit/processor.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Processor do
|
4
|
-
subject(:processor) { CorpusProcessor::Processor.new(parser, generator) }
|
5
|
-
|
6
|
-
describe "#process" do
|
7
|
-
subject { processor.process(corpus) }
|
8
|
-
|
9
|
-
let(:corpus) { "Some corpus" }
|
10
|
-
let(:processed_corpus) {
|
11
|
-
<<-CORPUS
|
12
|
-
Some O
|
13
|
-
corpus O
|
14
|
-
CORPUS
|
15
|
-
}
|
16
|
-
let(:tokens) {
|
17
|
-
[
|
18
|
-
CorpusProcessor::Token.new("Some"),
|
19
|
-
CorpusProcessor::Token.new("corpus"),
|
20
|
-
]
|
21
|
-
}
|
22
|
-
let(:parser) { double :parser }
|
23
|
-
let(:generator) { double :generator }
|
24
|
-
|
25
|
-
specify {
|
26
|
-
parser.should_receive(:parse)
|
27
|
-
.with(corpus)
|
28
|
-
.and_return(tokens)
|
29
|
-
|
30
|
-
generator.should_receive(:generate)
|
31
|
-
.with(tokens)
|
32
|
-
.and_return(processed_corpus)
|
33
|
-
|
34
|
-
subject.should == processed_corpus
|
35
|
-
}
|
36
|
-
end
|
37
|
-
end
|
data/spec/unit/token_spec.rb
DELETED
data/spec/unit/tokenizer_spec.rb
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Tokenizer do
|
4
|
-
subject(:tokenizer) { CorpusProcessor::Tokenizer.new }
|
5
|
-
|
6
|
-
describe "#tokenize" do
|
7
|
-
subject { tokenizer.tokenize(text, category) }
|
8
|
-
|
9
|
-
let(:category) { nil }
|
10
|
-
|
11
|
-
context "empty string" do
|
12
|
-
let(:text) { "" }
|
13
|
-
|
14
|
-
it { should == [] }
|
15
|
-
end
|
16
|
-
|
17
|
-
context "one word" do
|
18
|
-
let(:text) { "banana" }
|
19
|
-
|
20
|
-
it { should == [CorpusProcessor::Token.new("banana")] }
|
21
|
-
end
|
22
|
-
|
23
|
-
context "two words" do
|
24
|
-
let(:text) { "good banana" }
|
25
|
-
|
26
|
-
it { should == [
|
27
|
-
CorpusProcessor::Token.new("good"),
|
28
|
-
CorpusProcessor::Token.new("banana"),
|
29
|
-
] }
|
30
|
-
end
|
31
|
-
|
32
|
-
context "ponctuation" do
|
33
|
-
let(:text) { "good, banana" }
|
34
|
-
|
35
|
-
it { should == [
|
36
|
-
CorpusProcessor::Token.new("good"),
|
37
|
-
CorpusProcessor::Token.new("banana"),
|
38
|
-
] }
|
39
|
-
end
|
40
|
-
|
41
|
-
context "default category" do
|
42
|
-
let(:text) { "Google" }
|
43
|
-
let(:category) { :organization }
|
44
|
-
|
45
|
-
it { should == [
|
46
|
-
CorpusProcessor::Token.new("Google", :organization),
|
47
|
-
] }
|
48
|
-
end
|
49
|
-
|
50
|
-
context "with tags" do
|
51
|
-
let(:text) { "good<lalala/>, banana" }
|
52
|
-
|
53
|
-
it { should == [
|
54
|
-
CorpusProcessor::Token.new("good"),
|
55
|
-
CorpusProcessor::Token.new("banana"),
|
56
|
-
] }
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
describe "#strip_tags" do
|
61
|
-
subject { tokenizer.strip_tags(text) }
|
62
|
-
|
63
|
-
context "empty text" do
|
64
|
-
let(:text) { "" }
|
65
|
-
|
66
|
-
it { should == "" }
|
67
|
-
end
|
68
|
-
|
69
|
-
context "self closed tag" do
|
70
|
-
let(:text) { "<br/>" }
|
71
|
-
|
72
|
-
it { should == "" }
|
73
|
-
end
|
74
|
-
|
75
|
-
context "tag with content" do
|
76
|
-
let(:text) { "<p>Some text</p>" }
|
77
|
-
|
78
|
-
it { should == "Some text" }
|
79
|
-
end
|
80
|
-
|
81
|
-
context "content after tag" do
|
82
|
-
let(:text) { "<p>Some<br/>text</p>" }
|
83
|
-
|
84
|
-
it { should == "Some text" }
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
describe "#join_lines" do
|
89
|
-
subject { tokenizer.join_lines(text) }
|
90
|
-
|
91
|
-
context "empty text" do
|
92
|
-
let(:text) { "" }
|
93
|
-
|
94
|
-
it { should == "" }
|
95
|
-
end
|
96
|
-
|
97
|
-
context "one word" do
|
98
|
-
let(:text) { "banana" }
|
99
|
-
|
100
|
-
it { should == "banana" }
|
101
|
-
end
|
102
|
-
|
103
|
-
context "two lines" do
|
104
|
-
let(:text) { "banana\nquiabo" }
|
105
|
-
|
106
|
-
it { should == "banana quiabo" }
|
107
|
-
end
|
108
|
-
|
109
|
-
context "line with empty space" do
|
110
|
-
let(:text) { "banana\n \nquiabo" }
|
111
|
-
|
112
|
-
it { should == "banana quiabo" }
|
113
|
-
end
|
114
|
-
|
115
|
-
context "leading spaces" do
|
116
|
-
let(:text) { " \n banana\n \nquiabo \n" }
|
117
|
-
|
118
|
-
it { should == "banana quiabo" }
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
data/spec/unit/traverser_spec.rb
DELETED
@@ -1,68 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Traverser do
|
4
|
-
subject(:traverser) { CorpusProcessor::Traverser.new }
|
5
|
-
|
6
|
-
describe "#traverse" do
|
7
|
-
subject { traverser.traverse(text, regexp) }
|
8
|
-
|
9
|
-
context "empty text" do
|
10
|
-
let(:text) { "" }
|
11
|
-
let(:regexp) { // }
|
12
|
-
|
13
|
-
specify {
|
14
|
-
expect { |mock_block|
|
15
|
-
traverser.traverse(text, regexp, &mock_block)
|
16
|
-
}.not_to yield_control
|
17
|
-
}
|
18
|
-
end
|
19
|
-
|
20
|
-
context "simple text" do
|
21
|
-
let(:text) { "abc" }
|
22
|
-
let(:regexp) { /b/ }
|
23
|
-
|
24
|
-
specify {
|
25
|
-
expect { |mock_block|
|
26
|
-
traverser.traverse(text, regexp, &mock_block)
|
27
|
-
}.to yield_successive_args "a", text.match(regexp), "c"
|
28
|
-
}
|
29
|
-
end
|
30
|
-
|
31
|
-
context "two matches" do
|
32
|
-
let(:text) { "abcbd" }
|
33
|
-
let(:regexp) { /b/ }
|
34
|
-
|
35
|
-
specify {
|
36
|
-
expect { |mock_block|
|
37
|
-
traverser.traverse(text, regexp, &mock_block)
|
38
|
-
}.to yield_successive_args "a",
|
39
|
-
text.match(regexp),
|
40
|
-
"c",
|
41
|
-
text[2..-1].match(regexp),
|
42
|
-
"d"
|
43
|
-
}
|
44
|
-
end
|
45
|
-
|
46
|
-
context "match in beginning" do
|
47
|
-
let(:text) { "bc" }
|
48
|
-
let(:regexp) { /b/ }
|
49
|
-
|
50
|
-
specify {
|
51
|
-
expect { |mock_block|
|
52
|
-
traverser.traverse(text, regexp, &mock_block)
|
53
|
-
}.to yield_successive_args text.match(regexp), "c"
|
54
|
-
}
|
55
|
-
end
|
56
|
-
|
57
|
-
context "match in ending" do
|
58
|
-
let(:text) { "bc" }
|
59
|
-
let(:regexp) { /c/ }
|
60
|
-
|
61
|
-
specify {
|
62
|
-
expect { |mock_block|
|
63
|
-
traverser.traverse(text, regexp, &mock_block)
|
64
|
-
}.to yield_successive_args "b", text.match(regexp)
|
65
|
-
}
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|