corpus-processor 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,37 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe CorpusProcessor::Processor do
4
- subject(:processor) { CorpusProcessor::Processor.new(parser, generator) }
5
-
6
- describe "#process" do
7
- subject { processor.process(corpus) }
8
-
9
- let(:corpus) { "Some corpus" }
10
- let(:processed_corpus) {
11
- <<-CORPUS
12
- Some O
13
- corpus O
14
- CORPUS
15
- }
16
- let(:tokens) {
17
- [
18
- CorpusProcessor::Token.new("Some"),
19
- CorpusProcessor::Token.new("corpus"),
20
- ]
21
- }
22
- let(:parser) { double :parser }
23
- let(:generator) { double :generator }
24
-
25
- specify {
26
- parser.should_receive(:parse)
27
- .with(corpus)
28
- .and_return(tokens)
29
-
30
- generator.should_receive(:generate)
31
- .with(tokens)
32
- .and_return(processed_corpus)
33
-
34
- subject.should == processed_corpus
35
- }
36
- end
37
- end
@@ -1,8 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe CorpusProcessor::Token do
4
- subject { CorpusProcessor::Token.new }
5
-
6
- it { should respond_to(:word) }
7
- it { should respond_to(:category) }
8
- end
@@ -1,121 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe CorpusProcessor::Tokenizer do
4
- subject(:tokenizer) { CorpusProcessor::Tokenizer.new }
5
-
6
- describe "#tokenize" do
7
- subject { tokenizer.tokenize(text, category) }
8
-
9
- let(:category) { nil }
10
-
11
- context "empty string" do
12
- let(:text) { "" }
13
-
14
- it { should == [] }
15
- end
16
-
17
- context "one word" do
18
- let(:text) { "banana" }
19
-
20
- it { should == [CorpusProcessor::Token.new("banana")] }
21
- end
22
-
23
- context "two words" do
24
- let(:text) { "good banana" }
25
-
26
- it { should == [
27
- CorpusProcessor::Token.new("good"),
28
- CorpusProcessor::Token.new("banana"),
29
- ] }
30
- end
31
-
32
- context "ponctuation" do
33
- let(:text) { "good, banana" }
34
-
35
- it { should == [
36
- CorpusProcessor::Token.new("good"),
37
- CorpusProcessor::Token.new("banana"),
38
- ] }
39
- end
40
-
41
- context "default category" do
42
- let(:text) { "Google" }
43
- let(:category) { :organization }
44
-
45
- it { should == [
46
- CorpusProcessor::Token.new("Google", :organization),
47
- ] }
48
- end
49
-
50
- context "with tags" do
51
- let(:text) { "good<lalala/>, banana" }
52
-
53
- it { should == [
54
- CorpusProcessor::Token.new("good"),
55
- CorpusProcessor::Token.new("banana"),
56
- ] }
57
- end
58
- end
59
-
60
- describe "#strip_tags" do
61
- subject { tokenizer.strip_tags(text) }
62
-
63
- context "empty text" do
64
- let(:text) { "" }
65
-
66
- it { should == "" }
67
- end
68
-
69
- context "self closed tag" do
70
- let(:text) { "<br/>" }
71
-
72
- it { should == "" }
73
- end
74
-
75
- context "tag with content" do
76
- let(:text) { "<p>Some text</p>" }
77
-
78
- it { should == "Some text" }
79
- end
80
-
81
- context "content after tag" do
82
- let(:text) { "<p>Some<br/>text</p>" }
83
-
84
- it { should == "Some text" }
85
- end
86
- end
87
-
88
- describe "#join_lines" do
89
- subject { tokenizer.join_lines(text) }
90
-
91
- context "empty text" do
92
- let(:text) { "" }
93
-
94
- it { should == "" }
95
- end
96
-
97
- context "one word" do
98
- let(:text) { "banana" }
99
-
100
- it { should == "banana" }
101
- end
102
-
103
- context "two lines" do
104
- let(:text) { "banana\nquiabo" }
105
-
106
- it { should == "banana quiabo" }
107
- end
108
-
109
- context "line with empty space" do
110
- let(:text) { "banana\n \nquiabo" }
111
-
112
- it { should == "banana quiabo" }
113
- end
114
-
115
- context "leading spaces" do
116
- let(:text) { " \n banana\n \nquiabo \n" }
117
-
118
- it { should == "banana quiabo" }
119
- end
120
- end
121
- end
@@ -1,68 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe CorpusProcessor::Traverser do
4
- subject(:traverser) { CorpusProcessor::Traverser.new }
5
-
6
- describe "#traverse" do
7
- subject { traverser.traverse(text, regexp) }
8
-
9
- context "empty text" do
10
- let(:text) { "" }
11
- let(:regexp) { // }
12
-
13
- specify {
14
- expect { |mock_block|
15
- traverser.traverse(text, regexp, &mock_block)
16
- }.not_to yield_control
17
- }
18
- end
19
-
20
- context "simple text" do
21
- let(:text) { "abc" }
22
- let(:regexp) { /b/ }
23
-
24
- specify {
25
- expect { |mock_block|
26
- traverser.traverse(text, regexp, &mock_block)
27
- }.to yield_successive_args "a", text.match(regexp), "c"
28
- }
29
- end
30
-
31
- context "two matches" do
32
- let(:text) { "abcbd" }
33
- let(:regexp) { /b/ }
34
-
35
- specify {
36
- expect { |mock_block|
37
- traverser.traverse(text, regexp, &mock_block)
38
- }.to yield_successive_args "a",
39
- text.match(regexp),
40
- "c",
41
- text[2..-1].match(regexp),
42
- "d"
43
- }
44
- end
45
-
46
- context "match in beginning" do
47
- let(:text) { "bc" }
48
- let(:regexp) { /b/ }
49
-
50
- specify {
51
- expect { |mock_block|
52
- traverser.traverse(text, regexp, &mock_block)
53
- }.to yield_successive_args text.match(regexp), "c"
54
- }
55
- end
56
-
57
- context "match in ending" do
58
- let(:text) { "bc" }
59
- let(:regexp) { /c/ }
60
-
61
- specify {
62
- expect { |mock_block|
63
- traverser.traverse(text, regexp, &mock_block)
64
- }.to yield_successive_args "b", text.match(regexp)
65
- }
66
- end
67
- end
68
- end