open_nlp 0.0.7-java → 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +2 -2
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/Gemfile.lock +31 -0
- data/README.md +8 -1
- data/lib/open_nlp.rb +3 -3
- data/lib/open_nlp/categorizer.rb +7 -3
- data/lib/open_nlp/chunker.rb +19 -8
- data/lib/open_nlp/model.rb +13 -9
- data/lib/open_nlp/named_entity_detector.rb +6 -2
- data/lib/open_nlp/opennlp-maxent-3.0.3.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.3.jar +0 -0
- data/lib/open_nlp/parser.rb +43 -33
- data/lib/open_nlp/parser/parse.rb +12 -21
- data/lib/open_nlp/pos_tagger.rb +5 -2
- data/lib/open_nlp/sentence_detector.rb +16 -6
- data/lib/open_nlp/tokenizer.rb +8 -3
- data/lib/open_nlp/tool.rb +1 -1
- data/lib/open_nlp/util.rb +1 -2
- data/lib/open_nlp/util/span.rb +5 -5
- data/lib/open_nlp/version.rb +1 -1
- data/spec/categorizer_spec.rb +24 -22
- data/spec/chunker_spec.rb +29 -28
- data/spec/model/chunker_spec.rb +12 -15
- data/spec/model/detokenizer_spec.rb +11 -14
- data/spec/model/named_entity_detector_spec.rb +11 -14
- data/spec/model/pos_tagger_spec.rb +12 -15
- data/spec/model/sentence_detector_spec.rb +11 -14
- data/spec/model/tokenizer_spec.rb +11 -14
- data/spec/named_entity_detector_spec.rb +28 -27
- data/spec/parser/parse_spec.rb +64 -56
- data/spec/parser_spec.rb +26 -21
- data/spec/pos_tagger_spec.rb +22 -23
- data/spec/sentence_detector_spec.rb +39 -30
- data/spec/spec_helper.rb +1 -1
- data/spec/tokenizer_spec.rb +26 -22
- metadata +16 -17
- data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
data/spec/parser/parse_spec.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Parser::Parse do
|
4
|
-
subject { OpenNlp::Parser::Parse }
|
3
|
+
RSpec.describe OpenNlp::Parser::Parse do
|
5
4
|
let(:text) { 'The red fox sleeps soundly .' }
|
6
|
-
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR,
|
7
|
-
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR,
|
5
|
+
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, 'en-parser-chunking.bin')) }
|
6
|
+
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, 'en-token.bin')) }
|
7
|
+
let(:parser) { OpenNlp::Parser.new(model, token_model) }
|
8
8
|
|
9
|
-
describe
|
10
|
-
it
|
9
|
+
describe 'initialization' do
|
10
|
+
it 'initializes a new parse object' do
|
11
11
|
j_parse = Java::opennlp.tools.parser.Parse.new(
|
12
12
|
text.to_java(:String),
|
13
13
|
Java::opennlp.tools.util.Span.new(0, text.size),
|
@@ -16,91 +16,99 @@ describe OpenNlp::Parser::Parse do
|
|
16
16
|
0.to_java(:Integer)
|
17
17
|
)
|
18
18
|
|
19
|
-
|
19
|
+
parse = described_class.new(j_parse)
|
20
|
+
expect(parse.j_instance).to be_a(described_class.java_class)
|
20
21
|
end
|
21
22
|
|
22
|
-
it
|
23
|
-
|
23
|
+
it 'raises an argument error when nil is passed as an argumenr' do
|
24
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
24
25
|
end
|
25
26
|
end
|
26
27
|
|
27
|
-
describe
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
28
|
+
describe '#tree_bank_string' do
|
29
|
+
let(:expected_tree_bank_str) do
|
30
|
+
'(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))'
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'returns proper string value for parsed text' do
|
34
|
+
tree_bank_string = parser.parse(text).tree_bank_string
|
35
|
+
expect(tree_bank_string).to eq(expected_tree_bank_str)
|
32
36
|
end
|
33
37
|
end
|
34
38
|
|
35
|
-
describe
|
36
|
-
|
37
|
-
|
38
|
-
parser.parse(text).code_tree.should == [
|
39
|
+
describe '#code_tree' do
|
40
|
+
let(:expected_code_tree) do
|
41
|
+
[
|
39
42
|
{
|
40
|
-
:type =>
|
41
|
-
:parent_type =>
|
42
|
-
:token =>
|
43
|
+
:type => 'S',
|
44
|
+
:parent_type => 'TOP',
|
45
|
+
:token => 'The red fox sleeps soundly .',
|
43
46
|
:children => [
|
44
47
|
{
|
45
|
-
:type =>
|
46
|
-
:parent_type =>
|
47
|
-
:token =>
|
48
|
+
:type => 'NP',
|
49
|
+
:parent_type => 'S',
|
50
|
+
:token => 'The red fox',
|
48
51
|
:children => [
|
49
52
|
{
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
53
|
+
:type => 'DT',
|
54
|
+
:parent_type => 'NP',
|
55
|
+
:token => 'The',
|
56
|
+
:children => [{:type => 'TK', :parent_type => 'DT', :token => 'The'}]
|
54
57
|
},
|
55
58
|
{
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
59
|
+
:type => 'JJ',
|
60
|
+
:parent_type => 'NP',
|
61
|
+
:token => 'red',
|
62
|
+
:children => [{:type => 'TK', :parent_type => 'JJ', :token => 'red'}]
|
60
63
|
},
|
61
64
|
{
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
65
|
+
:type => 'NN',
|
66
|
+
:parent_type => 'NP',
|
67
|
+
:token => 'fox',
|
68
|
+
:children => [{:type => 'TK', :parent_type => 'NN', :token => 'fox'}]
|
66
69
|
}
|
67
70
|
]
|
68
71
|
},
|
69
72
|
{
|
70
|
-
:type =>
|
71
|
-
:parent_type =>
|
72
|
-
:token =>
|
73
|
+
:type => 'VP',
|
74
|
+
:parent_type => 'S',
|
75
|
+
:token => 'sleeps soundly',
|
73
76
|
:children => [
|
74
77
|
{
|
75
|
-
:type =>
|
76
|
-
:parent_type =>
|
77
|
-
:token =>
|
78
|
-
:children => [{:type =>
|
78
|
+
:type => 'VBZ',
|
79
|
+
:parent_type => 'VP',
|
80
|
+
:token => 'sleeps',
|
81
|
+
:children => [{:type => 'TK', :parent_type => 'VBZ', :token => 'sleeps'}]
|
79
82
|
},
|
80
83
|
{
|
81
|
-
:type =>
|
82
|
-
:parent_type =>
|
83
|
-
:token =>
|
84
|
+
:type => 'ADVP',
|
85
|
+
:parent_type => 'VP',
|
86
|
+
:token => 'soundly',
|
84
87
|
:children => [
|
85
88
|
{
|
86
|
-
:type =>
|
87
|
-
:parent_type =>
|
88
|
-
:token =>
|
89
|
-
:children => [{:type =>
|
89
|
+
:type => 'RB',
|
90
|
+
:parent_type => 'ADVP',
|
91
|
+
:token => 'soundly',
|
92
|
+
:children => [{:type => 'TK', :parent_type => 'RB', :token => 'soundly'}]
|
90
93
|
}
|
91
94
|
]
|
92
95
|
}
|
93
96
|
]
|
94
97
|
},
|
95
98
|
{
|
96
|
-
:type =>
|
97
|
-
:parent_type =>
|
98
|
-
:token =>
|
99
|
-
:children => [{:type =>
|
99
|
+
:type => '.',
|
100
|
+
:parent_type => 'S',
|
101
|
+
:token => '.',
|
102
|
+
:children => [{:type => 'TK', :parent_type => '.', :token => '.'}]
|
100
103
|
}
|
101
104
|
]
|
102
105
|
}
|
103
106
|
]
|
104
107
|
end
|
108
|
+
|
109
|
+
it 'returns proper structure for parsed text' do
|
110
|
+
code_tree = parser.parse(text).code_tree
|
111
|
+
expect(code_tree).to eq(expected_code_tree)
|
112
|
+
end
|
105
113
|
end
|
106
|
-
end
|
114
|
+
end
|
data/spec/parser_spec.rb
CHANGED
@@ -1,39 +1,44 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Parser do
|
4
|
-
subject { OpenNlp::Parser }
|
3
|
+
RSpec.describe OpenNlp::Parser do
|
5
4
|
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
|
6
5
|
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
|
6
|
+
let(:parser) { described_class.new(model, token_model) }
|
7
7
|
|
8
|
-
describe
|
9
|
-
it
|
10
|
-
parser
|
11
|
-
parser.should be_a(subject)
|
8
|
+
describe 'initialization' do
|
9
|
+
it 'initializes a new parser' do
|
10
|
+
expect(parser.j_instance).to be_a(Java::opennlp.tools.parser.chunking.Parser)
|
12
11
|
end
|
13
12
|
|
14
|
-
it
|
15
|
-
|
13
|
+
it 'raises an argument error when no model specified' do
|
14
|
+
expect { described_class.new(nil, nil) }.to raise_error(ArgumentError)
|
16
15
|
end
|
17
16
|
|
18
|
-
it
|
19
|
-
|
17
|
+
it 'raises an argument error when no token_model is specified' do
|
18
|
+
expect { described_class.new(model, nil) }.to raise_error(ArgumentError)
|
20
19
|
end
|
21
20
|
end
|
22
21
|
|
23
|
-
describe
|
24
|
-
|
22
|
+
describe '#parse' do
|
23
|
+
it 'parses an empty string' do
|
24
|
+
expect(parser.parse('')).to eq({})
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'parses a sentence' do
|
28
|
+
res = parser.parse('The red fox sleeps soundly .')
|
29
|
+
expect(res).to be_a(OpenNlp::Parser::Parse)
|
30
|
+
end
|
25
31
|
|
26
|
-
it
|
27
|
-
parser.parse(
|
32
|
+
it 'raises an error when nil is passed as an argument' do
|
33
|
+
expect { parser.parse(nil) }.to raise_error(ArgumentError)
|
28
34
|
end
|
29
35
|
|
30
|
-
it
|
31
|
-
|
32
|
-
res.class.should == OpenNlp::Parser::Parse
|
36
|
+
it 'raises an error when fixnum is passed as an argument' do
|
37
|
+
expect { parser.parse(111) }.to raise_error(ArgumentError)
|
33
38
|
end
|
34
39
|
|
35
|
-
it
|
36
|
-
|
40
|
+
it 'raises an error when array is passed as an argument' do
|
41
|
+
expect { parser.parse([1, 2]) }.to raise_error(ArgumentError)
|
37
42
|
end
|
38
43
|
end
|
39
|
-
end
|
44
|
+
end
|
data/spec/pos_tagger_spec.rb
CHANGED
@@ -1,37 +1,36 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::POSTagger do
|
4
|
-
|
3
|
+
RSpec.describe OpenNlp::POSTagger do
|
4
|
+
let(:model) { OpenNlp::Model::POSTagger.new(File.join(FIXTURES_DIR, 'en-pos-maxent.bin')) }
|
5
|
+
let(:pos_tagger) { described_class.new(model) }
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
it "should initialize with a valid model" do
|
10
|
-
tagger = subject.new(model)
|
11
|
-
tagger.should be_a(subject)
|
12
|
-
tagger.j_instance.should be_a(subject.java_class)
|
7
|
+
describe 'initialization' do
|
8
|
+
it 'initialize with a valid model' do
|
9
|
+
expect(pos_tagger.j_instance).to be_a(described_class.java_class)
|
13
10
|
end
|
14
11
|
|
15
|
-
it
|
16
|
-
|
12
|
+
it 'raises an ArgumentError without a valid model' do
|
13
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
17
14
|
end
|
18
15
|
end
|
19
16
|
|
20
|
-
describe
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
tagged = pos_tagger.tag("The quick brown fox jumps over the lazy dog.")
|
25
|
-
tagged.should == "The/DT quick/JJ brown/JJ fox/NN jumps/NNS over/IN the/DT lazy/JJ dog./NN"
|
17
|
+
describe '#tag' do
|
18
|
+
it 'tags parts of a provided document' do
|
19
|
+
tagged = pos_tagger.tag('The quick brown fox jumps over the lazy dog.')
|
20
|
+
expect(tagged).to eq('The/DT quick/JJ brown/JJ fox/NN jumps/NNS over/IN the/DT lazy/JJ dog./NN')
|
26
21
|
end
|
27
22
|
|
28
|
-
it
|
23
|
+
it 'tags provided tokens' do
|
29
24
|
tagged = pos_tagger.tag(%w(The quick brown fox jumps over the lazy dog .))
|
30
|
-
tagged.to_ary.
|
25
|
+
expect(tagged.to_ary).to eq(%w(DT JJ JJ NN NNS IN DT JJ NN .))
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'raises an ArgumentError when nil is passed as an argument' do
|
29
|
+
expect { pos_tagger.tag(nil) }.to raise_error(ArgumentError)
|
31
30
|
end
|
32
31
|
|
33
|
-
it
|
34
|
-
|
32
|
+
it 'raises an ArgumentError when fixnum is passed as an argument' do
|
33
|
+
expect { pos_tagger.tag(111) }.to raise_error(ArgumentError)
|
35
34
|
end
|
36
35
|
end
|
37
|
-
end
|
36
|
+
end
|
data/spec/sentence_detector_spec.rb
CHANGED
@@ -1,50 +1,59 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::SentenceDetector do
|
4
|
-
|
3
|
+
RSpec.describe OpenNlp::SentenceDetector do
|
4
|
+
let(:model) { OpenNlp::Model::SentenceDetector.new(File.join(FIXTURES_DIR, 'en-sent.bin')) }
|
5
|
+
let(:sentence_detector) { described_class.new(model) }
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
it "should initialize with a valid model" do
|
10
|
-
sent_detector = subject.new(model)
|
11
|
-
sent_detector.should be_a(subject)
|
12
|
-
sent_detector.j_instance.should be_a(subject.java_class)
|
7
|
+
describe 'initialization' do
|
8
|
+
it 'initializes with a valid model' do
|
9
|
+
expect(sentence_detector.j_instance).to be_a(described_class.java_class)
|
13
10
|
end
|
14
11
|
|
15
|
-
it
|
16
|
-
|
12
|
+
it 'raises an ArgumentError without a valid model' do
|
13
|
+
expect { subject.new(nil) }.to raise_error(ArgumentError)
|
17
14
|
end
|
18
15
|
end
|
19
16
|
|
20
|
-
describe
|
21
|
-
|
17
|
+
describe '#detect' do
|
18
|
+
it 'detects no sentences in an empty string' do
|
19
|
+
sentences = sentence_detector.detect('')
|
20
|
+
expect(sentences).to eq([])
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'detects sentences in a string' do
|
24
|
+
sentences = sentence_detector.detect('The sky is blue. The Grass is green.')
|
25
|
+
expect(sentences).to eq(['The sky is blue.', 'The Grass is green.'])
|
26
|
+
end
|
22
27
|
|
23
|
-
it
|
24
|
-
|
25
|
-
sentences.should == []
|
28
|
+
it 'raises an ArgumentError when nil is passed as an argument' do
|
29
|
+
expect { sentence_detector.detect(nil) }.to raise_error(ArgumentError)
|
26
30
|
end
|
27
31
|
|
28
|
-
it
|
29
|
-
|
30
|
-
sentences.should == ["The sky is blue.", "The Grass is green."]
|
32
|
+
it 'raises an ArgumentError when fixnum is passed as an argument' do
|
33
|
+
expect { sentence_detector.detect(111) }.to raise_error(ArgumentError)
|
31
34
|
end
|
32
35
|
|
33
|
-
it
|
34
|
-
|
36
|
+
it 'raises an ArgumentError when array is passed as an argument' do
|
37
|
+
expect { sentence_detector.detect([1, 2]) }.to raise_error(ArgumentError)
|
35
38
|
end
|
36
39
|
end
|
37
40
|
|
38
|
-
describe
|
39
|
-
|
41
|
+
describe '#pos_detect' do
|
42
|
+
it 'detects sentences in a string' do
|
43
|
+
sentences = sentence_detector.pos_detect('The sky is blue. The Grass is green.')
|
44
|
+
expect(sentences).to eq([OpenNlp::Util::Span.new(0, 16), OpenNlp::Util::Span.new(17, 36)])
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'raises an ArgumentError when nil is passed as an argument' do
|
48
|
+
expect { sentence_detector.pos_detect(nil) }.to raise_error(ArgumentError)
|
49
|
+
end
|
40
50
|
|
41
|
-
it
|
42
|
-
|
43
|
-
sentences.should == [OpenNlp::Util::Span.new(0, 16), OpenNlp::Util::Span.new(17, 36)]
|
51
|
+
it 'raises an ArgumentError when fixnum is passed as an argument' do
|
52
|
+
expect { sentence_detector.pos_detect(111) }.to raise_error(ArgumentError)
|
44
53
|
end
|
45
54
|
|
46
|
-
it
|
47
|
-
expect {
|
55
|
+
it 'raises an ArgumentError when array is passed as an argument' do
|
56
|
+
expect { sentence_detector.pos_detect([1, 2]) }.to raise_error(ArgumentError)
|
48
57
|
end
|
49
58
|
end
|
50
|
-
end
|
59
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/tokenizer_spec.rb
CHANGED
@@ -1,36 +1,40 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Tokenizer do
|
4
|
-
|
3
|
+
RSpec.describe OpenNlp::Tokenizer do
|
4
|
+
let(:model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, 'en-token.bin')) }
|
5
|
+
let(:tokenizer) { described_class.new(model) }
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
it "should initialize a new tokenizer" do
|
10
|
-
tokenizer = subject.new(model)
|
11
|
-
tokenizer.should be_a(subject)
|
7
|
+
describe 'initialization' do
|
8
|
+
it 'initialize a new tokenizer' do
|
9
|
+
expect(tokenizer.j_instance).to be_a(described_class.java_class)
|
12
10
|
end
|
13
11
|
|
14
|
-
it
|
15
|
-
|
12
|
+
it 'raises an argument error when no model is specified' do
|
13
|
+
expect { subject.new(nil) }.to raise_error(ArgumentError)
|
16
14
|
end
|
17
15
|
end
|
18
16
|
|
19
|
-
describe
|
20
|
-
|
17
|
+
describe 'tokenize a string' do
|
18
|
+
it 'tokenizes an empty string' do
|
19
|
+
tokens = tokenizer.tokenize('')
|
20
|
+
expect(tokens).to eq([])
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'tokenizes a sentence' do
|
24
|
+
tokens = tokenizer.tokenize('The red fox sleeps soundly.')
|
25
|
+
expect(tokens).to eq(['The', 'red', 'fox', 'sleeps', 'soundly', '.'])
|
26
|
+
end
|
21
27
|
|
22
|
-
it
|
23
|
-
|
24
|
-
tokens.should == []
|
28
|
+
it 'raises an error when nil is passed as an argument' do
|
29
|
+
expect { tokenizer.tokenize(nil) }.to raise_error(ArgumentError)
|
25
30
|
end
|
26
31
|
|
27
|
-
it
|
28
|
-
|
29
|
-
tokens.should == ["The", "red", "fox", "sleeps", "soundly", "."]
|
32
|
+
it 'raises an error when fixnum is passed as an argument' do
|
33
|
+
expect { tokenizer.tokenize(111) }.to raise_error(ArgumentError)
|
30
34
|
end
|
31
35
|
|
32
|
-
it
|
33
|
-
|
36
|
+
it 'raises an error when array is passed as an argument' do
|
37
|
+
expect { tokenizer.tokenize([1, 2]) }.to raise_error(ArgumentError)
|
34
38
|
end
|
35
39
|
end
|
36
|
-
end
|
40
|
+
end
|