open_nlp 0.0.7-java → 0.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +2 -2
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/Gemfile.lock +31 -0
- data/README.md +8 -1
- data/lib/open_nlp.rb +3 -3
- data/lib/open_nlp/categorizer.rb +7 -3
- data/lib/open_nlp/chunker.rb +19 -8
- data/lib/open_nlp/model.rb +13 -9
- data/lib/open_nlp/named_entity_detector.rb +6 -2
- data/lib/open_nlp/opennlp-maxent-3.0.3.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.3.jar +0 -0
- data/lib/open_nlp/parser.rb +43 -33
- data/lib/open_nlp/parser/parse.rb +12 -21
- data/lib/open_nlp/pos_tagger.rb +5 -2
- data/lib/open_nlp/sentence_detector.rb +16 -6
- data/lib/open_nlp/tokenizer.rb +8 -3
- data/lib/open_nlp/tool.rb +1 -1
- data/lib/open_nlp/util.rb +1 -2
- data/lib/open_nlp/util/span.rb +5 -5
- data/lib/open_nlp/version.rb +1 -1
- data/spec/categorizer_spec.rb +24 -22
- data/spec/chunker_spec.rb +29 -28
- data/spec/model/chunker_spec.rb +12 -15
- data/spec/model/detokenizer_spec.rb +11 -14
- data/spec/model/named_entity_detector_spec.rb +11 -14
- data/spec/model/pos_tagger_spec.rb +12 -15
- data/spec/model/sentence_detector_spec.rb +11 -14
- data/spec/model/tokenizer_spec.rb +11 -14
- data/spec/named_entity_detector_spec.rb +28 -27
- data/spec/parser/parse_spec.rb +64 -56
- data/spec/parser_spec.rb +26 -21
- data/spec/pos_tagger_spec.rb +22 -23
- data/spec/sentence_detector_spec.rb +39 -30
- data/spec/spec_helper.rb +1 -1
- data/spec/tokenizer_spec.rb +26 -22
- metadata +16 -17
- data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
data/spec/parser/parse_spec.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Parser::Parse do
|
4
|
-
subject { OpenNlp::Parser::Parse }
|
3
|
+
RSpec.describe OpenNlp::Parser::Parse do
|
5
4
|
let(:text) { 'The red fox sleeps soundly .' }
|
6
|
-
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR,
|
7
|
-
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR,
|
5
|
+
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, 'en-parser-chunking.bin')) }
|
6
|
+
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, 'en-token.bin')) }
|
7
|
+
let(:parser) { OpenNlp::Parser.new(model, token_model) }
|
8
8
|
|
9
|
-
describe
|
10
|
-
it
|
9
|
+
describe 'initialization' do
|
10
|
+
it 'initializes a new parse object' do
|
11
11
|
j_parse = Java::opennlp.tools.parser.Parse.new(
|
12
12
|
text.to_java(:String),
|
13
13
|
Java::opennlp.tools.util.Span.new(0, text.size),
|
@@ -16,91 +16,99 @@ describe OpenNlp::Parser::Parse do
|
|
16
16
|
0.to_java(:Integer)
|
17
17
|
)
|
18
18
|
|
19
|
-
|
19
|
+
parse = described_class.new(j_parse)
|
20
|
+
expect(parse.j_instance).to be_a(described_class.java_class)
|
20
21
|
end
|
21
22
|
|
22
|
-
it
|
23
|
-
|
23
|
+
it 'raises an argument error when nil is passed as an argumenr' do
|
24
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
24
25
|
end
|
25
26
|
end
|
26
27
|
|
27
|
-
describe
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
28
|
+
describe '#tree_bank_string' do
|
29
|
+
let(:expected_tree_bank_str) do
|
30
|
+
'(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))'
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'returns proper string value for parsed text' do
|
34
|
+
tree_bank_string = parser.parse(text).tree_bank_string
|
35
|
+
expect(tree_bank_string).to eq(expected_tree_bank_str)
|
32
36
|
end
|
33
37
|
end
|
34
38
|
|
35
|
-
describe
|
36
|
-
|
37
|
-
|
38
|
-
parser.parse(text).code_tree.should == [
|
39
|
+
describe '#code_tree' do
|
40
|
+
let(:expected_code_tree) do
|
41
|
+
[
|
39
42
|
{
|
40
|
-
:type =>
|
41
|
-
:parent_type =>
|
42
|
-
:token =>
|
43
|
+
:type => 'S',
|
44
|
+
:parent_type => 'TOP',
|
45
|
+
:token => 'The red fox sleeps soundly .',
|
43
46
|
:children => [
|
44
47
|
{
|
45
|
-
:type =>
|
46
|
-
:parent_type =>
|
47
|
-
:token =>
|
48
|
+
:type => 'NP',
|
49
|
+
:parent_type => 'S',
|
50
|
+
:token => 'The red fox',
|
48
51
|
:children => [
|
49
52
|
{
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
53
|
+
:type => 'DT',
|
54
|
+
:parent_type => 'NP',
|
55
|
+
:token => 'The',
|
56
|
+
:children => [{:type => 'TK', :parent_type => 'DT', :token => 'The'}]
|
54
57
|
},
|
55
58
|
{
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
59
|
+
:type => 'JJ',
|
60
|
+
:parent_type => 'NP',
|
61
|
+
:token => 'red',
|
62
|
+
:children => [{:type => 'TK', :parent_type => 'JJ', :token => 'red'}]
|
60
63
|
},
|
61
64
|
{
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
65
|
+
:type => 'NN',
|
66
|
+
:parent_type => 'NP',
|
67
|
+
:token => 'fox',
|
68
|
+
:children => [{:type => 'TK', :parent_type => 'NN', :token => 'fox'}]
|
66
69
|
}
|
67
70
|
]
|
68
71
|
},
|
69
72
|
{
|
70
|
-
:type =>
|
71
|
-
:parent_type =>
|
72
|
-
:token =>
|
73
|
+
:type => 'VP',
|
74
|
+
:parent_type => 'S',
|
75
|
+
:token => 'sleeps soundly',
|
73
76
|
:children => [
|
74
77
|
{
|
75
|
-
:type =>
|
76
|
-
:parent_type =>
|
77
|
-
:token =>
|
78
|
-
:children => [{:type =>
|
78
|
+
:type => 'VBZ',
|
79
|
+
:parent_type => 'VP',
|
80
|
+
:token => 'sleeps',
|
81
|
+
:children => [{:type => 'TK', :parent_type => 'VBZ', :token => 'sleeps'}]
|
79
82
|
},
|
80
83
|
{
|
81
|
-
:type =>
|
82
|
-
:parent_type =>
|
83
|
-
:token =>
|
84
|
+
:type => 'ADVP',
|
85
|
+
:parent_type => 'VP',
|
86
|
+
:token => 'soundly',
|
84
87
|
:children => [
|
85
88
|
{
|
86
|
-
:type =>
|
87
|
-
:parent_type =>
|
88
|
-
:token =>
|
89
|
-
:children => [{:type =>
|
89
|
+
:type => 'RB',
|
90
|
+
:parent_type => 'ADVP',
|
91
|
+
:token => 'soundly',
|
92
|
+
:children => [{:type => 'TK', :parent_type => 'RB', :token => 'soundly'}]
|
90
93
|
}
|
91
94
|
]
|
92
95
|
}
|
93
96
|
]
|
94
97
|
},
|
95
98
|
{
|
96
|
-
:type =>
|
97
|
-
:parent_type =>
|
98
|
-
:token =>
|
99
|
-
:children => [{:type =>
|
99
|
+
:type => '.',
|
100
|
+
:parent_type => 'S',
|
101
|
+
:token => '.',
|
102
|
+
:children => [{:type => 'TK', :parent_type => '.', :token => '.'}]
|
100
103
|
}
|
101
104
|
]
|
102
105
|
}
|
103
106
|
]
|
104
107
|
end
|
108
|
+
|
109
|
+
it 'returns proper structure for parsed text' do
|
110
|
+
code_tree = parser.parse(text).code_tree
|
111
|
+
expect(code_tree).to eq(expected_code_tree)
|
112
|
+
end
|
105
113
|
end
|
106
|
-
end
|
114
|
+
end
|
data/spec/parser_spec.rb
CHANGED
@@ -1,39 +1,44 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Parser do
|
4
|
-
subject { OpenNlp::Parser }
|
3
|
+
RSpec.describe OpenNlp::Parser do
|
5
4
|
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
|
6
5
|
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
|
6
|
+
let(:parser) { described_class.new(model, token_model) }
|
7
7
|
|
8
|
-
describe
|
9
|
-
it
|
10
|
-
parser
|
11
|
-
parser.should be_a(subject)
|
8
|
+
describe 'initialization' do
|
9
|
+
it 'initializes a new parser' do
|
10
|
+
expect(parser.j_instance).to be_a(Java::opennlp.tools.parser.chunking.Parser)
|
12
11
|
end
|
13
12
|
|
14
|
-
it
|
15
|
-
|
13
|
+
it 'raises an argument error when no model specified' do
|
14
|
+
expect { described_class.new(nil, nil) }.to raise_error(ArgumentError)
|
16
15
|
end
|
17
16
|
|
18
|
-
it
|
19
|
-
|
17
|
+
it 'raises an argument error when no token_model is specified' do
|
18
|
+
expect { described_class.new(model, nil) }.to raise_error(ArgumentError)
|
20
19
|
end
|
21
20
|
end
|
22
21
|
|
23
|
-
describe
|
24
|
-
|
22
|
+
describe '#parse' do
|
23
|
+
it 'parses an empty string' do
|
24
|
+
expect(parser.parse('')).to eq({})
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'parses a sentence' do
|
28
|
+
res = parser.parse('The red fox sleeps soundly .')
|
29
|
+
expect(res).to be_a(OpenNlp::Parser::Parse)
|
30
|
+
end
|
25
31
|
|
26
|
-
it
|
27
|
-
parser.parse(
|
32
|
+
it 'raises an error when nil is passed as an argument' do
|
33
|
+
expect { parser.parse(nil) }.to raise_error(ArgumentError)
|
28
34
|
end
|
29
35
|
|
30
|
-
it
|
31
|
-
|
32
|
-
res.class.should == OpenNlp::Parser::Parse
|
36
|
+
it 'raises an error when fixnum is passed as an argument' do
|
37
|
+
expect { parser.parse(111) }.to raise_error(ArgumentError)
|
33
38
|
end
|
34
39
|
|
35
|
-
it
|
36
|
-
|
40
|
+
it 'raises an error when array is passed as an argument' do
|
41
|
+
expect { parser.parse([1, 2]) }.to raise_error(ArgumentError)
|
37
42
|
end
|
38
43
|
end
|
39
|
-
end
|
44
|
+
end
|
data/spec/pos_tagger_spec.rb
CHANGED
@@ -1,37 +1,36 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::POSTagger do
|
4
|
-
|
3
|
+
RSpec.describe OpenNlp::POSTagger do
|
4
|
+
let(:model) { OpenNlp::Model::POSTagger.new(File.join(FIXTURES_DIR, 'en-pos-maxent.bin')) }
|
5
|
+
let(:pos_tagger) { described_class.new(model) }
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
it "should initialize with a valid model" do
|
10
|
-
tagger = subject.new(model)
|
11
|
-
tagger.should be_a(subject)
|
12
|
-
tagger.j_instance.should be_a(subject.java_class)
|
7
|
+
describe 'initialization' do
|
8
|
+
it 'initialize with a valid model' do
|
9
|
+
expect(pos_tagger.j_instance).to be_a(described_class.java_class)
|
13
10
|
end
|
14
11
|
|
15
|
-
it
|
16
|
-
|
12
|
+
it 'raises an ArgumentError without a valid model' do
|
13
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
17
14
|
end
|
18
15
|
end
|
19
16
|
|
20
|
-
describe
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
tagged = pos_tagger.tag("The quick brown fox jumps over the lazy dog.")
|
25
|
-
tagged.should == "The/DT quick/JJ brown/JJ fox/NN jumps/NNS over/IN the/DT lazy/JJ dog./NN"
|
17
|
+
describe '#tag' do
|
18
|
+
it 'tags parts of a provided document' do
|
19
|
+
tagged = pos_tagger.tag('The quick brown fox jumps over the lazy dog.')
|
20
|
+
expect(tagged).to eq('The/DT quick/JJ brown/JJ fox/NN jumps/NNS over/IN the/DT lazy/JJ dog./NN')
|
26
21
|
end
|
27
22
|
|
28
|
-
it
|
23
|
+
it 'tags provided tokens' do
|
29
24
|
tagged = pos_tagger.tag(%w(The quick brown fox jumps over the lazy dog .))
|
30
|
-
tagged.to_ary.
|
25
|
+
expect(tagged.to_ary).to eq(%w(DT JJ JJ NN NNS IN DT JJ NN .))
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'raises an ArgumentError when nil is passed as an argument' do
|
29
|
+
expect { pos_tagger.tag(nil) }.to raise_error(ArgumentError)
|
31
30
|
end
|
32
31
|
|
33
|
-
it
|
34
|
-
|
32
|
+
it 'raises an ArgumentError when fixnum is passed as an argument' do
|
33
|
+
expect { pos_tagger.tag(111) }.to raise_error(ArgumentError)
|
35
34
|
end
|
36
35
|
end
|
37
|
-
end
|
36
|
+
end
|
data/spec/sentence_detector_spec.rb
CHANGED
@@ -1,50 +1,59 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::SentenceDetector do
|
4
|
-
|
3
|
+
RSpec.describe OpenNlp::SentenceDetector do
|
4
|
+
let(:model) { OpenNlp::Model::SentenceDetector.new(File.join(FIXTURES_DIR, 'en-sent.bin')) }
|
5
|
+
let(:sentence_detector) { described_class.new(model) }
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
it "should initialize with a valid model" do
|
10
|
-
sent_detector = subject.new(model)
|
11
|
-
sent_detector.should be_a(subject)
|
12
|
-
sent_detector.j_instance.should be_a(subject.java_class)
|
7
|
+
describe 'initialization' do
|
8
|
+
it 'initializes with a valid model' do
|
9
|
+
expect(sentence_detector.j_instance).to be_a(described_class.java_class)
|
13
10
|
end
|
14
11
|
|
15
|
-
it
|
16
|
-
|
12
|
+
it 'raises an ArgumentError without a valid model' do
|
13
|
+
expect { subject.new(nil) }.to raise_error(ArgumentError)
|
17
14
|
end
|
18
15
|
end
|
19
16
|
|
20
|
-
describe
|
21
|
-
|
17
|
+
describe '#detect' do
|
18
|
+
it 'detects no sentences in an empty string' do
|
19
|
+
sentences = sentence_detector.detect('')
|
20
|
+
expect(sentences).to eq([])
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'detects sentences in a string' do
|
24
|
+
sentences = sentence_detector.detect('The sky is blue. The Grass is green.')
|
25
|
+
expect(sentences).to eq(['The sky is blue.', 'The Grass is green.'])
|
26
|
+
end
|
22
27
|
|
23
|
-
it
|
24
|
-
|
25
|
-
sentences.should == []
|
28
|
+
it 'raises an ArgumentError when nil is passed as an argument' do
|
29
|
+
expect { sentence_detector.detect(nil) }.to raise_error(ArgumentError)
|
26
30
|
end
|
27
31
|
|
28
|
-
it
|
29
|
-
|
30
|
-
sentences.should == ["The sky is blue.", "The Grass is green."]
|
32
|
+
it 'raises an ArgumentError when fixnum is passed as an argument' do
|
33
|
+
expect { sentence_detector.detect(111) }.to raise_error(ArgumentError)
|
31
34
|
end
|
32
35
|
|
33
|
-
it
|
34
|
-
|
36
|
+
it 'raises an ArgumentError when array is passed as an argument' do
|
37
|
+
expect { sentence_detector.detect([1, 2]) }.to raise_error(ArgumentError)
|
35
38
|
end
|
36
39
|
end
|
37
40
|
|
38
|
-
describe
|
39
|
-
|
41
|
+
describe '#pos_detect' do
|
42
|
+
it 'detects sentences in a string' do
|
43
|
+
sentences = sentence_detector.pos_detect('The sky is blue. The Grass is green.')
|
44
|
+
expect(sentences).to eq([OpenNlp::Util::Span.new(0, 16), OpenNlp::Util::Span.new(17, 36)])
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'raises an ArgumentError when nil is passed as an argument' do
|
48
|
+
expect { sentence_detector.pos_detect(nil) }.to raise_error(ArgumentError)
|
49
|
+
end
|
40
50
|
|
41
|
-
it
|
42
|
-
|
43
|
-
sentences.should == [OpenNlp::Util::Span.new(0, 16), OpenNlp::Util::Span.new(17, 36)]
|
51
|
+
it 'raises an ArgumentError when fixnum is passed as an argument' do
|
52
|
+
expect { sentence_detector.pos_detect(111) }.to raise_error(ArgumentError)
|
44
53
|
end
|
45
54
|
|
46
|
-
it
|
47
|
-
expect {
|
55
|
+
it 'raises an ArgumentError when array is passed as an argument' do
|
56
|
+
expect { sentence_detector.pos_detect([1, 2]) }.to raise_error(ArgumentError)
|
48
57
|
end
|
49
58
|
end
|
50
|
-
end
|
59
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/tokenizer_spec.rb
CHANGED
@@ -1,36 +1,40 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Tokenizer do
|
4
|
-
|
3
|
+
RSpec.describe OpenNlp::Tokenizer do
|
4
|
+
let(:model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, 'en-token.bin')) }
|
5
|
+
let(:tokenizer) { described_class.new(model) }
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
it "should initialize a new tokenizer" do
|
10
|
-
tokenizer = subject.new(model)
|
11
|
-
tokenizer.should be_a(subject)
|
7
|
+
describe 'initialization' do
|
8
|
+
it 'initialize a new tokenizer' do
|
9
|
+
expect(tokenizer.j_instance).to be_a(described_class.java_class)
|
12
10
|
end
|
13
11
|
|
14
|
-
it
|
15
|
-
|
12
|
+
it 'raises an argument error when no model is specified' do
|
13
|
+
expect { subject.new(nil) }.to raise_error(ArgumentError)
|
16
14
|
end
|
17
15
|
end
|
18
16
|
|
19
|
-
describe
|
20
|
-
|
17
|
+
describe 'tokenize a string' do
|
18
|
+
it 'tokenizes an empty string' do
|
19
|
+
tokens = tokenizer.tokenize('')
|
20
|
+
expect(tokens).to eq([])
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'tokenizes a sentence' do
|
24
|
+
tokens = tokenizer.tokenize('The red fox sleeps soundly.')
|
25
|
+
expect(tokens).to eq(['The', 'red', 'fox', 'sleeps', 'soundly', '.'])
|
26
|
+
end
|
21
27
|
|
22
|
-
it
|
23
|
-
|
24
|
-
tokens.should == []
|
28
|
+
it 'raises an error when nil is passed as an argument' do
|
29
|
+
expect { tokenizer.tokenize(nil) }.to raise_error(ArgumentError)
|
25
30
|
end
|
26
31
|
|
27
|
-
it
|
28
|
-
|
29
|
-
tokens.should == ["The", "red", "fox", "sleeps", "soundly", "."]
|
32
|
+
it 'raises an error when fixnum is passed as an argument' do
|
33
|
+
expect { tokenizer.tokenize(111) }.to raise_error(ArgumentError)
|
30
34
|
end
|
31
35
|
|
32
|
-
it
|
33
|
-
|
36
|
+
it 'raises an error when array is passed as an argument' do
|
37
|
+
expect { tokenizer.tokenize([1, 2]) }.to raise_error(ArgumentError)
|
34
38
|
end
|
35
39
|
end
|
36
|
-
end
|
40
|
+
end
|