open_nlp 0.0.7-java → 0.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +2 -2
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/Gemfile.lock +31 -0
- data/README.md +8 -1
- data/lib/open_nlp.rb +3 -3
- data/lib/open_nlp/categorizer.rb +7 -3
- data/lib/open_nlp/chunker.rb +19 -8
- data/lib/open_nlp/model.rb +13 -9
- data/lib/open_nlp/named_entity_detector.rb +6 -2
- data/lib/open_nlp/opennlp-maxent-3.0.3.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.3.jar +0 -0
- data/lib/open_nlp/parser.rb +43 -33
- data/lib/open_nlp/parser/parse.rb +12 -21
- data/lib/open_nlp/pos_tagger.rb +5 -2
- data/lib/open_nlp/sentence_detector.rb +16 -6
- data/lib/open_nlp/tokenizer.rb +8 -3
- data/lib/open_nlp/tool.rb +1 -1
- data/lib/open_nlp/util.rb +1 -2
- data/lib/open_nlp/util/span.rb +5 -5
- data/lib/open_nlp/version.rb +1 -1
- data/spec/categorizer_spec.rb +24 -22
- data/spec/chunker_spec.rb +29 -28
- data/spec/model/chunker_spec.rb +12 -15
- data/spec/model/detokenizer_spec.rb +11 -14
- data/spec/model/named_entity_detector_spec.rb +11 -14
- data/spec/model/pos_tagger_spec.rb +12 -15
- data/spec/model/sentence_detector_spec.rb +11 -14
- data/spec/model/tokenizer_spec.rb +11 -14
- data/spec/named_entity_detector_spec.rb +28 -27
- data/spec/parser/parse_spec.rb +64 -56
- data/spec/parser_spec.rb +26 -21
- data/spec/pos_tagger_spec.rb +22 -23
- data/spec/sentence_detector_spec.rb +39 -30
- data/spec/spec_helper.rb +1 -1
- data/spec/tokenizer_spec.rb +26 -22
- metadata +16 -17
- data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
data/lib/open_nlp/tool.rb
CHANGED
@@ -5,7 +5,7 @@ module OpenNlp
|
|
5
5
|
attr_reader :j_instance
|
6
6
|
|
7
7
|
def initialize(model)
|
8
|
-
|
8
|
+
fail ArgumentError, 'model must be an OpenNlp::Model' unless model.is_a?(OpenNlp::Model)
|
9
9
|
@j_instance = self.class.java_class.new(model.j_model)
|
10
10
|
end
|
11
11
|
end
|
data/lib/open_nlp/util.rb
CHANGED
data/lib/open_nlp/util/span.rb
CHANGED
@@ -6,8 +6,8 @@ class OpenNlp::Util::Span
|
|
6
6
|
attr_reader :j_instance
|
7
7
|
|
8
8
|
def initialize(s, e)
|
9
|
-
|
10
|
-
|
9
|
+
fail ArgumentError, 's should be an integer' unless s.is_a?(Fixnum)
|
10
|
+
fail ArgumentError, 'e should be an integer' unless e.is_a?(Fixnum)
|
11
11
|
|
12
12
|
@j_instance = self.class.java_class.new(s, e)
|
13
13
|
end
|
@@ -31,8 +31,8 @@ class OpenNlp::Util::Span
|
|
31
31
|
def ==(obj)
|
32
32
|
return false unless obj.is_a?(self.class)
|
33
33
|
|
34
|
-
[:start, :end, :type].each_with_object(true) do |
|
35
|
-
|
34
|
+
[:start, :end, :type].each_with_object(true) do |method, acc|
|
35
|
+
acc = acc && self.public_send(method) == obj.public_send(method)
|
36
36
|
end
|
37
37
|
end
|
38
|
-
end
|
38
|
+
end
|
data/lib/open_nlp/version.rb
CHANGED
data/spec/categorizer_spec.rb
CHANGED
@@ -1,36 +1,38 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Categorizer do
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
categorizer
|
10
|
-
categorizer.should be_a(subject)
|
11
|
-
categorizer.j_instance.should be_a(subject.java_class)
|
3
|
+
RSpec.describe OpenNlp::Categorizer do
|
4
|
+
let(:model) { OpenNlp::Model::Categorizer.new(File.join(FIXTURES_DIR, 'en-doccat.bin')) }
|
5
|
+
|
6
|
+
describe 'initialization' do
|
7
|
+
it 'is initialized with a valid model' do
|
8
|
+
categorizer = described_class.new(model)
|
9
|
+
expect(categorizer.j_instance).to be_a(described_class.java_class)
|
12
10
|
end
|
13
11
|
|
14
|
-
it
|
15
|
-
|
12
|
+
it 'raises an ArgumentError without a valid model' do
|
13
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
16
14
|
end
|
17
15
|
end
|
18
16
|
|
19
|
-
describe
|
20
|
-
let(:categorizer) {
|
17
|
+
describe '#categorize' do
|
18
|
+
let(:categorizer) { described_class.new(model) }
|
19
|
+
|
20
|
+
it 'categorizes a provided document to positive' do
|
21
|
+
category = categorizer.categorize('The fox is a good worker.')
|
22
|
+
expect(category).to eq('Positive')
|
23
|
+
end
|
21
24
|
|
22
|
-
it
|
23
|
-
category = categorizer.categorize(
|
24
|
-
category.
|
25
|
+
it 'categorizes a provided document to negative' do
|
26
|
+
category = categorizer.categorize('Quick brown fox jumps very bad.')
|
27
|
+
expect(category).to eq('Negative')
|
25
28
|
end
|
26
29
|
|
27
|
-
it
|
28
|
-
|
29
|
-
category.should == "Negative"
|
30
|
+
it 'raises an ArgumentError when nil is passed as a param' do
|
31
|
+
expect { categorizer.categorize(nil) }.to raise_error(ArgumentError)
|
30
32
|
end
|
31
33
|
|
32
|
-
it
|
33
|
-
|
34
|
+
it 'raises an ArgumentError when Fixnum is passed a param' do
|
35
|
+
expect { categorizer.categorize(123) }.to raise_error(ArgumentError)
|
34
36
|
end
|
35
37
|
end
|
36
|
-
end
|
38
|
+
end
|
data/spec/chunker_spec.rb
CHANGED
@@ -1,46 +1,47 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Chunker do
|
4
|
-
|
3
|
+
RSpec.describe OpenNlp::Chunker do
|
4
|
+
let(:model) { OpenNlp::Model::Chunker.new(File.join(FIXTURES_DIR, 'en-chunker.bin')) }
|
5
|
+
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, 'en-token.bin')) }
|
6
|
+
let(:pos_model) { OpenNlp::Model::POSTagger.new(File.join(FIXTURES_DIR, 'en-pos-maxent.bin')) }
|
7
|
+
let(:chunker) { described_class.new(model, token_model, pos_model) }
|
5
8
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
describe "initialization" do
|
11
|
-
it "should initialize a new chunker" do
|
12
|
-
chunker = subject.new(model, token_model, pos_model)
|
13
|
-
chunker.should be_a(subject)
|
9
|
+
describe 'initialization' do
|
10
|
+
it 'initializes a new chunker' do
|
11
|
+
expect(chunker).to be_a(described_class)
|
14
12
|
end
|
15
13
|
|
16
|
-
it
|
17
|
-
|
14
|
+
it 'raises an argument error when no model is specified' do
|
15
|
+
expect { subject.new(nil, nil, nil) }.to raise_error(ArgumentError)
|
18
16
|
end
|
19
17
|
|
20
|
-
it
|
21
|
-
|
18
|
+
it 'raises an argument error when no token_model is specified' do
|
19
|
+
expect { subject.new(model, nil, nil) }.to raise_error(ArgumentError)
|
22
20
|
end
|
23
21
|
|
24
|
-
it
|
25
|
-
|
22
|
+
it 'raises an argument error when no pos_model is specified' do
|
23
|
+
expect { subject.new(model, token_model, nil) }.to raise_error(ArgumentError)
|
26
24
|
end
|
27
25
|
end
|
28
26
|
|
29
|
-
describe
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
chunks = chunker.chunk("")
|
34
|
-
chunks.should == []
|
27
|
+
describe 'chunking a string' do
|
28
|
+
it 'chunks an empty string' do
|
29
|
+
chunks = chunker.chunk('')
|
30
|
+
expect(chunks).to eq([])
|
35
31
|
end
|
36
32
|
|
37
|
-
it
|
38
|
-
chunks = chunker.chunk(
|
39
|
-
chunks.
|
33
|
+
it 'chunks a sentence' do
|
34
|
+
chunks = chunker.chunk('The red fox sleeps soundly.')
|
35
|
+
expect(chunks).to eq(
|
36
|
+
[
|
37
|
+
[{ 'The' => 'DT' }, { 'red' => 'JJ' }, { 'fox' => 'NN' }, { 'sleeps' => 'NNS' }],
|
38
|
+
[{ 'soundly' => 'RB' }]
|
39
|
+
]
|
40
|
+
)
|
40
41
|
end
|
41
42
|
|
42
|
-
it
|
43
|
-
|
43
|
+
it 'raises an error when not passed a string' do
|
44
|
+
expect { chunker.chunk(nil) }.to raise_error(ArgumentError)
|
44
45
|
end
|
45
46
|
end
|
46
47
|
end
|
data/spec/model/chunker_spec.rb
CHANGED
@@ -1,23 +1,20 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Model::Chunker do
|
4
|
-
|
5
|
-
let(:model_file_name) { File.join(FIXTURES_DIR, "en-chunker.bin") }
|
3
|
+
RSpec.describe OpenNlp::Model::Chunker do
|
4
|
+
let(:model_file_name) { File.join(FIXTURES_DIR, 'en-chunker.bin') }
|
6
5
|
|
7
|
-
it
|
8
|
-
chunker_model =
|
9
|
-
chunker_model.
|
10
|
-
chunker_model.j_model.should be_a(subject.java_class)
|
6
|
+
it 'accepts a string filename parameter' do
|
7
|
+
chunker_model = described_class.new(model_file_name)
|
8
|
+
expect(chunker_model.j_model).to be_a(described_class.java_class)
|
11
9
|
end
|
12
10
|
|
13
|
-
it
|
11
|
+
it 'accepts a java.io.FileInputStream object' do
|
14
12
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
|
-
chunker_model =
|
16
|
-
chunker_model.
|
17
|
-
chunker_model.j_model.should be_a(subject.java_class)
|
13
|
+
chunker_model = described_class.new(file_input_stream)
|
14
|
+
expect(chunker_model.j_model).to be_a(described_class.java_class)
|
18
15
|
end
|
19
16
|
|
20
|
-
it
|
21
|
-
|
17
|
+
it 'raises an argument error when nil is passed as a model' do
|
18
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
22
19
|
end
|
23
|
-
end
|
20
|
+
end
|
data/spec/model/detokenizer_spec.rb
CHANGED
@@ -1,23 +1,20 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Model::Detokenizer do
|
4
|
-
|
5
|
-
let(:model_file_name) { File.join(FIXTURES_DIR, "en-detokenizer.xml") }
|
3
|
+
RSpec.describe OpenNlp::Model::Detokenizer do
|
4
|
+
let(:model_file_name) { File.join(FIXTURES_DIR, 'en-detokenizer.xml') }
|
6
5
|
|
7
|
-
it
|
8
|
-
model =
|
9
|
-
model.
|
10
|
-
model.j_model.should be_a(subject.java_class)
|
6
|
+
it 'accepts a string filename parameter' do
|
7
|
+
model = described_class.new(model_file_name)
|
8
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
11
9
|
end
|
12
10
|
|
13
|
-
it
|
11
|
+
it 'accepts a java.io.FileInputStream object' do
|
14
12
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
|
-
model =
|
16
|
-
model.
|
17
|
-
model.j_model.should be_a(subject.java_class)
|
13
|
+
model = described_class.new(file_input_stream)
|
14
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
18
15
|
end
|
19
16
|
|
20
|
-
it
|
21
|
-
|
17
|
+
it 'raises an argument error when nil is passed as a model' do
|
18
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
22
19
|
end
|
23
20
|
end
|
data/spec/model/named_entity_detector_spec.rb
CHANGED
@@ -1,23 +1,20 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Model::NamedEntityDetector do
|
4
|
-
|
5
|
-
let(:model_file_name) { File.join(FIXTURES_DIR, "en-ner-time.bin") }
|
3
|
+
RSpec.describe OpenNlp::Model::NamedEntityDetector do
|
4
|
+
let(:model_file_name) { File.join(FIXTURES_DIR, 'en-ner-time.bin') }
|
6
5
|
|
7
|
-
it
|
8
|
-
model =
|
9
|
-
model.
|
10
|
-
model.j_model.should be_a(subject.java_class)
|
6
|
+
it 'accepts a string filename parameter' do
|
7
|
+
model = described_class.new(model_file_name)
|
8
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
11
9
|
end
|
12
10
|
|
13
|
-
it
|
11
|
+
it 'should accept a java.io.FileInputStream object' do
|
14
12
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
|
-
model =
|
16
|
-
model.
|
17
|
-
model.j_model.should be_a(subject.java_class)
|
13
|
+
model = described_class.new(file_input_stream)
|
14
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
18
15
|
end
|
19
16
|
|
20
|
-
it
|
21
|
-
|
17
|
+
it 'raises an argument error when nil is passed as a model' do
|
18
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
22
19
|
end
|
23
20
|
end
|
data/spec/model/pos_tagger_spec.rb
CHANGED
@@ -1,23 +1,20 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Model::POSTagger do
|
4
|
-
|
5
|
-
let(:model_file_name) { File.join(FIXTURES_DIR, "en-pos-maxent.bin") }
|
3
|
+
RSpec.describe OpenNlp::Model::POSTagger do
|
4
|
+
let(:model_file_name) { File.join(FIXTURES_DIR, 'en-pos-maxent.bin') }
|
6
5
|
|
7
|
-
it
|
8
|
-
model =
|
9
|
-
model.
|
10
|
-
model.j_model.should be_a(subject.java_class)
|
6
|
+
it 'accepts a string filename parameter' do
|
7
|
+
model = described_class.new(model_file_name)
|
8
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
11
9
|
end
|
12
10
|
|
13
|
-
it
|
11
|
+
it 'accepts a java.io.FileInputStream object' do
|
14
12
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
|
-
model =
|
16
|
-
model.
|
17
|
-
model.j_model.should be_a(subject.java_class)
|
13
|
+
model = described_class.new(file_input_stream)
|
14
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
18
15
|
end
|
19
16
|
|
20
|
-
it
|
21
|
-
|
17
|
+
it 'raises an argument error when nil is passed as a model' do
|
18
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
22
19
|
end
|
23
|
-
end
|
20
|
+
end
|
data/spec/model/sentence_detector_spec.rb
CHANGED
@@ -1,23 +1,20 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::Model::SentenceDetector do
|
4
|
-
|
5
|
-
let(:model_file_name) { File.join(FIXTURES_DIR, "en-sent.bin") }
|
3
|
+
RSpec.describe OpenNlp::Model::SentenceDetector do
|
4
|
+
let(:model_file_name) { File.join(FIXTURES_DIR, 'en-sent.bin') }
|
6
5
|
|
7
|
-
it
|
8
|
-
model =
|
9
|
-
model.
|
10
|
-
model.j_model.should be_a(subject.java_class)
|
6
|
+
it 'accepts a string filename parameter' do
|
7
|
+
model = described_class.new(model_file_name)
|
8
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
11
9
|
end
|
12
10
|
|
13
|
-
it
|
11
|
+
it 'accepts a java.io.FileInputStream object' do
|
14
12
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
|
-
model =
|
16
|
-
model.
|
17
|
-
model.j_model.should be_a(subject.java_class)
|
13
|
+
model = described_class.new(file_input_stream)
|
14
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
18
15
|
end
|
19
16
|
|
20
|
-
it
|
21
|
-
|
17
|
+
it 'raises an argument error when nil is passed as a model' do
|
18
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
22
19
|
end
|
23
20
|
end
|
data/spec/model/tokenizer_spec.rb
CHANGED
@@ -1,23 +1,20 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
3
|
describe OpenNlp::Model::Tokenizer do
|
4
|
-
|
5
|
-
let(:model_file_name) { File.join(FIXTURES_DIR, "en-token.bin") }
|
4
|
+
let(:model_file_name) { File.join(FIXTURES_DIR, 'en-token.bin') }
|
6
5
|
|
7
|
-
it
|
8
|
-
model =
|
9
|
-
model.
|
10
|
-
model.j_model.should be_a(subject.java_class)
|
6
|
+
it 'accept a string filename parameter' do
|
7
|
+
model = described_class.new(model_file_name)
|
8
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
11
9
|
end
|
12
10
|
|
13
|
-
it
|
11
|
+
it 'should accept a java.io.FileInputStream object' do
|
14
12
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
|
-
model =
|
16
|
-
model.
|
17
|
-
model.j_model.should be_a(subject.java_class)
|
13
|
+
model = described_class.new(file_input_stream)
|
14
|
+
expect(model.j_model).to be_a(described_class.java_class)
|
18
15
|
end
|
19
16
|
|
20
|
-
it
|
21
|
-
|
17
|
+
it 'raises an argument error when nil is passed as a model' do
|
18
|
+
expect { described_class.new(nil) }.to raise_error(ArgumentError)
|
22
19
|
end
|
23
|
-
end
|
20
|
+
end
|
data/spec/named_entity_detector_spec.rb
CHANGED
@@ -1,42 +1,43 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
describe OpenNlp::NamedEntityDetector do
|
4
|
-
|
5
|
-
|
6
|
-
let(:model) { OpenNlp::Model::NamedEntityDetector.new(File.join(FIXTURES_DIR, "en-ner-time.bin")) }
|
3
|
+
RSpec.describe OpenNlp::NamedEntityDetector do
|
4
|
+
let(:model) { OpenNlp::Model::NamedEntityDetector.new(File.join(FIXTURES_DIR, 'en-ner-time.bin')) }
|
5
|
+
let(:ne_detector) { described_class.new(model) }
|
7
6
|
|
8
|
-
describe
|
9
|
-
it
|
10
|
-
ne_detector
|
11
|
-
ne_detector.should be_a(subject)
|
7
|
+
describe 'initialization' do
|
8
|
+
it 'initializes with a valid model' do
|
9
|
+
expect(ne_detector.j_instance).to be_a(described_class.java_class)
|
12
10
|
end
|
13
11
|
|
14
|
-
it
|
15
|
-
|
12
|
+
it 'raises an ArgumentError otherwise' do
|
13
|
+
expect { subject.new(nil) }.to raise_error(ArgumentError)
|
16
14
|
end
|
17
15
|
end
|
18
16
|
|
19
|
-
describe
|
20
|
-
|
21
|
-
|
22
|
-
it "should detect nothing in an empty sentence" do
|
17
|
+
describe '#detect' do
|
18
|
+
it 'detects nothing for empty sentence' do
|
23
19
|
spans = ne_detector.detect([])
|
24
|
-
spans.
|
25
|
-
|
20
|
+
expect(spans).to eq([])
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'detects the named entities' do
|
24
|
+
spans = ne_detector.detect(['The', 'time', 'is', '10', ':', '23', 'am'])
|
25
|
+
expect(spans.size).to eq(1)
|
26
|
+
expect(spans.first).to be_a(Java::opennlp.tools.util.Span)
|
27
|
+
expect(spans.first.getStart).to eq(3)
|
28
|
+
expect(spans.first.getEnd).to eq(7)
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'raises an error if nil is passed as an argument' do
|
32
|
+
expect { ne_detector.detect(nil) }.to raise_error(ArgumentError)
|
26
33
|
end
|
27
34
|
|
28
|
-
it
|
29
|
-
|
30
|
-
spans.should be_a(Array)
|
31
|
-
spans[0].should be_a(Java::opennlp.tools.util.Span)
|
32
|
-
spans[0].getStart.should == 3
|
33
|
-
spans[0].getEnd.should == 7
|
35
|
+
it 'raises an error if string is passed as an argument' do
|
36
|
+
expect { ne_detector.detect('str') }.to raise_error(ArgumentError)
|
34
37
|
end
|
35
38
|
|
36
|
-
it
|
37
|
-
|
38
|
-
lambda { ne_detector.detect('str') }.should raise_error(ArgumentError)
|
39
|
-
lambda { ne_detector.detect(111) }.should raise_error(ArgumentError)
|
39
|
+
it 'raises an error if fixnum is passed as an argument' do
|
40
|
+
expect { ne_detector.detect(111) }.to raise_error(ArgumentError)
|
40
41
|
end
|
41
42
|
end
|
42
43
|
end
|