open_nlp 0.0.1-java
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/lib/open_nlp/chunker.rb +42 -0
- data/lib/open_nlp/model/chunker.rb +3 -0
- data/lib/open_nlp/model/detokenizer.rb +3 -0
- data/lib/open_nlp/model/named_entity_detector.rb +3 -0
- data/lib/open_nlp/model/pos_tagger.rb +3 -0
- data/lib/open_nlp/model/sentence_detector.rb +3 -0
- data/lib/open_nlp/model/tokenizer.rb +3 -0
- data/lib/open_nlp/model.rb +28 -0
- data/lib/open_nlp/named_entity_detector.rb +10 -0
- data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
- data/lib/open_nlp/pos_tagger.rb +10 -0
- data/lib/open_nlp/sentence_detector.rb +10 -0
- data/lib/open_nlp/tokenizer.rb +10 -0
- data/lib/open_nlp/tool.rb +20 -0
- data/lib/open_nlp/version.rb +3 -0
- data/lib/open_nlp.rb +20 -0
- data/open_nlp.gemspec +19 -0
- data/spec/chunker_spec.rb +38 -0
- data/spec/fixtures/en-chunker.bin +0 -0
- data/spec/fixtures/en-detokenizer.xml +107 -0
- data/spec/fixtures/en-ner-time.bin +0 -0
- data/spec/fixtures/en-pos-maxent.bin +0 -0
- data/spec/fixtures/en-sent.bin +0 -0
- data/spec/fixtures/en-token.bin +0 -0
- data/spec/model/chunker_spec.rb +23 -0
- data/spec/model/detokenizer_spec.rb +23 -0
- data/spec/model/named_entity_detector_spec.rb +23 -0
- data/spec/model/pos_tagger_spec.rb +23 -0
- data/spec/model/sentence_detector_spec.rb +23 -0
- data/spec/model/tokenizer_spec.rb +23 -0
- data/spec/named_entity_detector_spec.rb +42 -0
- data/spec/pos_tagger_spec.rb +37 -0
- data/spec/sentence_detector_spec.rb +37 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/tokenizer_spec.rb +36 -0
- metadata +105 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2012 Alexander Svirin
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,29 @@
+# OpenNlp
+
+TODO: Write a gem description
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'open_nlp'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install open_nlp
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
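The Usage section above is still a TODO, but the specs later in this diff exercise the public API, so a minimal usage sketch can be pieced together from them. The model file paths below are placeholders, and the expected outputs are copied from the specs:

    require 'open_nlp'   # JRuby only; loads the bundled OpenNLP jars

    # Load pretrained models (placeholder paths; any OpenNLP 1.5 model files work).
    sent_model  = OpenNlp::Model::SentenceDetector.new("en-sent.bin")
    token_model = OpenNlp::Model::Tokenizer.new("en-token.bin")
    pos_model   = OpenNlp::Model::POSTagger.new("en-pos-maxent.bin")

    OpenNlp::SentenceDetector.new(sent_model).detect("The sky is blue. The Grass is green.")
    # => ["The sky is blue.", "The Grass is green."]          (per spec/sentence_detector_spec.rb)

    tokens = OpenNlp::Tokenizer.new(token_model).tokenize("The red fox sleeps soundly.")
    # => ["The", "red", "fox", "sleeps", "soundly", "."]      (per spec/tokenizer_spec.rb)

    OpenNlp::POSTagger.new(pos_model).tag(%w(The quick brown fox jumps over the lazy dog .)).to_ary
    # => ["DT", "JJ", "JJ", "NN", "NNS", "IN", "DT", "JJ", "NN", "."]   (per spec/pos_tagger_spec.rb)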
data/Rakefile
ADDED
@@ -0,0 +1 @@
+require "bundler/gem_tasks"
data/lib/open_nlp/chunker.rb
ADDED
@@ -0,0 +1,42 @@
+module OpenNlp
+  class Chunker < Tool
+    self.java_class = Java::opennlp.tools.chunker.ChunkerME
+
+    def initialize(model, token_model, pos_model)
+      #raise ArgumentError, "model must be an OpenNlp::Chunker::Model" unless model.is_a?(Chunker::Model)
+      super(model)
+
+      raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
+      raise ArgumentError, "model must be an OpenNlp::POSTagger::Model" unless pos_model.is_a?(Model::POSTagger)
+
+      @tokenizer = Tokenizer.new(token_model)
+      @pos_tagger = POSTagger.new(pos_model)
+      #@j_instance = self.java_class.new(model.j_model)
+    end
+
+    def chunk(str)
+      raise ArgumentError, "str must be a String" unless str.is_a?(String)
+
+      tokens = @tokenizer.tokenize(str)
+      pos_tags = @pos_tagger.tag(tokens).to_ary
+
+      chunks = @j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary
+
+      build_chunks(chunks, tokens, pos_tags)
+    end
+
+    private
+    def build_chunks(chunks, tokens, pos_tags)
+      # data[i] = [token, pos_tag, chunk_val]
+      data = tokens.zip(pos_tags, chunks)
+
+      data.inject([]) do |acc, val|
+        chunk = val[2]
+        acc << [val[0]] if chunk[0] == 'B'
+        acc.last << val[0] if chunk[0] == 'I'
+
+        acc
+      end
+    end
+  end
+end
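A usage sketch for the chunker above, assembled from spec/chunker_spec.rb further down (the model paths are placeholders):

    chunker = OpenNlp::Chunker.new(
      OpenNlp::Model::Chunker.new("en-chunker.bin"),
      OpenNlp::Model::Tokenizer.new("en-token.bin"),
      OpenNlp::Model::POSTagger.new("en-pos-maxent.bin")
    )

    chunker.chunk("The red fox sleeps soundly.")
    # => [["The", "red", "fox", "sleeps"], ["soundly"]]   (per spec/chunker_spec.rb)
    # build_chunks groups tokens by the B-/I- prefixes of the chunk tags returned by
    # ChunkerME; the chunk labels themselves and any 'O'-tagged tokens are dropped.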
data/lib/open_nlp/model.rb
ADDED
@@ -0,0 +1,28 @@
+module OpenNlp
+  class Model
+    attr_reader :j_model
+
+    def initialize(model)
+      model_stream = case model
+        when java.io.FileInputStream
+          model
+        when String
+          java.io.FileInputStream.new(model)
+        else
+          raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
+      end
+
+      @j_model = self.class.java_class_name.new(model_stream)
+    end
+
+    class << self
+      def java_class_name=(value)
+        @java_class = value
+      end
+
+      def java_class_name
+        @java_class
+      end
+    end
+  end
+end
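The per-tool model classes under lib/open_nlp/model/ (+3 lines each in the file list) are not expanded in this diff. Given the java_class_name= writer above and the standard OpenNLP 1.5 model classes, each is presumably a three-line binding along these lines (a sketch, not the verbatim file):

    class OpenNlp::Model::Tokenizer < OpenNlp::Model
      self.java_class_name = Java::opennlp.tools.tokenize.TokenizerModel
    end

Model#initialize then instantiates that Java model class from a filename or a java.io.FileInputStream, which is exactly what the model specs below verify.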
data/lib/open_nlp/named_entity_detector.rb
ADDED
@@ -0,0 +1,10 @@
+module OpenNlp
+  class NamedEntityDetector < Tool
+    self.java_class = Java::opennlp.tools.namefind.NameFinderME
+
+    def detect(tokens)
+      raise ArgumentError, "tokens must be an instance of Array" unless tokens.is_a?(Array)
+      @j_instance.find(tokens.to_java(:String)).to_ary
+    end
+  end
+end
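A usage sketch for the detector above, following spec/named_entity_detector_spec.rb below (the model path is a placeholder):

    detector = OpenNlp::NamedEntityDetector.new(
      OpenNlp::Model::NamedEntityDetector.new("en-ner-time.bin")
    )

    spans = detector.detect(["The", "time", "is", "10", ":", "23", "am"])
    # detect returns an array of Java::opennlp.tools.util.Span objects; per the spec,
    # spans[0].getStart == 3 and spans[0].getEnd == 7, i.e. the tokens "10 : 23 am".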
data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar
ADDED
Binary file
data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar
ADDED
Binary file
data/lib/open_nlp/pos_tagger.rb
ADDED
@@ -0,0 +1,10 @@
+module OpenNlp
+  class POSTagger < Tool
+    self.java_class = Java::opennlp.tools.postag.POSTaggerME
+
+    def tag(tokens)
+      raise ArgumentError, "tokens must be an instance of String or Array" unless (tokens.is_a?(Array) || tokens.is_a?(String))
+      @j_instance.tag(tokens.to_java(:String))
+    end
+  end
+end
data/lib/open_nlp/sentence_detector.rb
ADDED
@@ -0,0 +1,10 @@
+module OpenNlp
+  class SentenceDetector < Tool
+    self.java_class = Java::opennlp.tools.sentdetect.SentenceDetectorME
+
+    def detect(string)
+      raise ArgumentError, "string must be a String" unless string.is_a?(String)
+      @j_instance.sentDetect(string).to_ary
+    end
+  end
+end
data/lib/open_nlp/tool.rb
ADDED
@@ -0,0 +1,20 @@
+module OpenNlp
+  class Tool
+    attr_reader :j_instance
+
+    def initialize(model)
+      raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
+      @j_instance = self.class.java_class.new(model.j_model)
+    end
+
+    class << self
+      def java_class=(value)
+        @java_class = value
+      end
+
+      def java_class
+        @java_class
+      end
+    end
+  end
+end
data/lib/open_nlp.rb
ADDED
@@ -0,0 +1,20 @@
+require 'open_nlp/version'
+
+require 'java'
+require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
+require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'
+
+require 'open_nlp/model'
+require 'open_nlp/model/chunker'
+require 'open_nlp/model/detokenizer'
+require 'open_nlp/model/named_entity_detector'
+require 'open_nlp/model/pos_tagger'
+require 'open_nlp/model/sentence_detector'
+require 'open_nlp/model/tokenizer'
+
+require 'open_nlp/tool'
+require 'open_nlp/named_entity_detector'
+require 'open_nlp/pos_tagger'
+require 'open_nlp/sentence_detector'
+require 'open_nlp/tokenizer'
+require 'open_nlp/chunker'
data/open_nlp.gemspec
ADDED
@@ -0,0 +1,19 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'open_nlp/version'
+
+Gem::Specification.new do |gem|
+  gem.name = "open_nlp"
+  gem.version = OpenNlp::VERSION
+  gem.authors = ["Hck"]
+  gem.description = %q{JRuby tools wrapper for Apache OpenNLP}
+  gem.summary = %q{A JRuby wrapper for the Apache OpenNLP tools library}
+
+  gem.files = `git ls-files`.split($/)
+  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+
+  gem.platform = "java"
+end
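Because gem.platform is "java", the published gem only resolves on JRuby. In a mixed project's Gemfile, one way to express that (a suggestion, not part of this release) is Bundler's platform block:

    # Only install open_nlp when bundling under JRuby.
    platforms :jruby do
      gem 'open_nlp'
    end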
data/spec/chunker_spec.rb
ADDED
@@ -0,0 +1,38 @@
+require "spec_helper"
+
+describe OpenNlp::Chunker do
+  subject { OpenNlp::Chunker }
+
+  let(:model) { OpenNlp::Model::Chunker.new(File.join(FIXTURES_DIR, "en-chunker.bin")) }
+  let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+  let(:pos_model) { OpenNlp::Model::POSTagger.new(File.join(FIXTURES_DIR, "en-pos-maxent.bin")) }
+
+  describe "initialization" do
+    it "should initialize a new chunker" do
+      chunker = subject.new(model, token_model, pos_model)
+      chunker.should be_a(subject)
+    end
+
+    it "should raise an argument error when no model is supplied" do
+      lambda { subject.new(nil, nil, nil) }.should raise_error(ArgumentError)
+    end
+  end
+
+  describe "chunking a string" do
+    let(:chunker) { subject.new(model, token_model, pos_model) }
+
+    it "should chunk an empty string" do
+      chunks = chunker.chunk("")
+      chunks.should == []
+    end
+
+    it "should chunk a sentence" do
+      chunks = chunker.chunk("The red fox sleeps soundly.")
+      chunks.should == [["The", "red", "fox", "sleeps"], ["soundly"]]
+    end
+
+    it "should raise an error when not passed a string" do
+      lambda { chunker.chunk(nil) }.should raise_error(ArgumentError)
+    end
+  end
+end
data/spec/fixtures/en-chunker.bin
ADDED
Binary file
data/spec/fixtures/en-detokenizer.xml
ADDED
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements. See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership. The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License. You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied. See the License for the
+   specific language governing permissions and limitations
+   under the License.
+-->
+
+<dictionary>
+  <entry operation="RIGHT_LEFT_MATCHING">
+    <token>"</token>
+  </entry>
+  <entry operation="RIGHT_LEFT_MATCHING">
+    <token>'</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>?</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>!</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>,</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>;</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>:</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>(</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>)</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>}</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>{</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>]</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>[</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>``</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>''</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>%</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>n't</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>'ve</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>'d</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>'ll</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>'s</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>'re</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>'m</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.org</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.com</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.net</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>#</token>
+  </entry>
+</dictionary>
data/spec/fixtures/en-ner-time.bin
ADDED
Binary file
data/spec/fixtures/en-pos-maxent.bin
ADDED
Binary file
data/spec/fixtures/en-sent.bin
ADDED
Binary file
data/spec/fixtures/en-token.bin
ADDED
Binary file
data/spec/model/chunker_spec.rb
ADDED
@@ -0,0 +1,23 @@
+require "spec_helper"
+
+describe OpenNlp::Model::Chunker do
+  subject { OpenNlp::Model::Chunker }
+  let(:model_file_name) { File.join(FIXTURES_DIR, "en-chunker.bin") }
+
+  it "should accept a string filename parameter" do
+    chunker_model = subject.new(model_file_name)
+    chunker_model.should be_a(subject)
+    chunker_model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should accept a java.io.FileInputStream object" do
+    file_input_stream = java.io.FileInputStream.new(model_file_name)
+    chunker_model = subject.new(file_input_stream)
+    chunker_model.should be_a(subject)
+    chunker_model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should raise an argument error otherwise" do
+    lambda { subject.new(nil) }.should raise_error(ArgumentError)
+  end
+end
data/spec/model/detokenizer_spec.rb
ADDED
@@ -0,0 +1,23 @@
+require "spec_helper"
+
+describe OpenNlp::Model::Detokenizer do
+  subject { OpenNlp::Model::Detokenizer }
+  let(:model_file_name) { File.join(FIXTURES_DIR, "en-detokenizer.xml") }
+
+  it "should accept a string filename parameter" do
+    model = subject.new(model_file_name)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should accept a java.io.FileInputStream object" do
+    file_input_stream = java.io.FileInputStream.new(model_file_name)
+    model = subject.new(file_input_stream)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should raise an argument error otherwise" do
+    lambda { subject.new(nil) }.should raise_error(ArgumentError)
+  end
+end
data/spec/model/named_entity_detector_spec.rb
ADDED
@@ -0,0 +1,23 @@
+require "spec_helper"
+
+describe OpenNlp::Model::NamedEntityDetector do
+  subject { OpenNlp::Model::NamedEntityDetector }
+  let(:model_file_name) { File.join(FIXTURES_DIR, "en-ner-time.bin") }
+
+  it "should accept a string filename parameter" do
+    model = subject.new(model_file_name)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should accept a java.io.FileInputStream object" do
+    file_input_stream = java.io.FileInputStream.new(model_file_name)
+    model = subject.new(file_input_stream)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should raise an argument error otherwise" do
+    lambda { subject.new(nil) }.should raise_error(ArgumentError)
+  end
+end
data/spec/model/pos_tagger_spec.rb
ADDED
@@ -0,0 +1,23 @@
+require "spec_helper"
+
+describe OpenNlp::Model::POSTagger do
+  subject { OpenNlp::Model::POSTagger }
+  let(:model_file_name) { File.join(FIXTURES_DIR, "en-pos-maxent.bin") }
+
+  it "should accept a string filename parameter" do
+    model = subject.new(model_file_name)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should accept a java.io.FileInputStream object" do
+    file_input_stream = java.io.FileInputStream.new(model_file_name)
+    model = subject.new(file_input_stream)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should raise an argument error otherwise" do
+    lambda { subject.new(nil) }.should raise_error(ArgumentError)
+  end
+end
data/spec/model/sentence_detector_spec.rb
ADDED
@@ -0,0 +1,23 @@
+require "spec_helper"
+
+describe OpenNlp::Model::SentenceDetector do
+  subject { OpenNlp::Model::SentenceDetector }
+  let(:model_file_name) { File.join(FIXTURES_DIR, "en-sent.bin") }
+
+  it "should accept a string filename parameter" do
+    model = subject.new(model_file_name)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should accept a java.io.FileInputStream object" do
+    file_input_stream = java.io.FileInputStream.new(model_file_name)
+    model = subject.new(file_input_stream)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should raise an argument error otherwise" do
+    lambda { subject.new(nil) }.should raise_error(ArgumentError)
+  end
+end
data/spec/model/tokenizer_spec.rb
ADDED
@@ -0,0 +1,23 @@
+require "spec_helper"
+
+describe OpenNlp::Model::Tokenizer do
+  subject { OpenNlp::Model::Tokenizer }
+  let(:model_file_name) { File.join(FIXTURES_DIR, "en-token.bin") }
+
+  it "should accept a string filename parameter" do
+    model = subject.new(model_file_name)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should accept a java.io.FileInputStream object" do
+    file_input_stream = java.io.FileInputStream.new(model_file_name)
+    model = subject.new(file_input_stream)
+    model.should be_a(subject)
+    model.j_model.should be_a(subject.java_class_name)
+  end
+
+  it "should raise an argument error otherwise" do
+    lambda { subject.new(nil) }.should raise_error(ArgumentError)
+  end
+end
data/spec/named_entity_detector_spec.rb
ADDED
@@ -0,0 +1,42 @@
+require "spec_helper"
+
+describe OpenNlp::NamedEntityDetector do
+  subject { OpenNlp::NamedEntityDetector }
+
+  let(:model) { OpenNlp::Model::NamedEntityDetector.new(File.join(FIXTURES_DIR, "en-ner-time.bin")) }
+
+  describe "initialization" do
+    it "should initialize with a valid model" do
+      ne_detector = subject.new(model)
+      ne_detector.should be_a(subject)
+    end
+
+    it "should raise an ArgumentError otherwise" do
+      lambda { subject.new(nil) }.should raise_error(ArgumentError)
+    end
+  end
+
+  describe "detection" do
+    let(:ne_detector) { subject.new(model) }
+
+    it "should detect nothing in an empty sentence" do
+      spans = ne_detector.detect([])
+      spans.should be_a(Array)
+      spans.length.should == 0
+    end
+
+    it "should detect the named entities" do
+      spans = ne_detector.detect(["The", "time", "is", "10", ":", "23", "am"])
+      spans.should be_a(Array)
+      spans[0].should be_a(Java::opennlp.tools.util.Span)
+      spans[0].getStart.should == 3
+      spans[0].getEnd.should == 7
+    end
+
+    it "should raise an error if anything but an array is passed" do
+      lambda { ne_detector.detect(nil) }.should raise_error(ArgumentError)
+      lambda { ne_detector.detect('str') }.should raise_error(ArgumentError)
+      lambda { ne_detector.detect(111) }.should raise_error(ArgumentError)
+    end
+  end
+end
data/spec/pos_tagger_spec.rb
ADDED
@@ -0,0 +1,37 @@
+require "spec_helper"
+
+describe OpenNlp::POSTagger do
+  subject { OpenNlp::POSTagger }
+
+  let(:model) { OpenNlp::Model::POSTagger.new(File.join(FIXTURES_DIR, "en-pos-maxent.bin")) }
+
+  describe "initialization" do
+    it "should initialize with a valid model" do
+      tagger = subject.new(model)
+      tagger.should be_a(subject)
+      tagger.j_instance.should be_a(subject.java_class)
+    end
+
+    it "should raise an ArgumentError without a valid model" do
+      lambda { subject.new(nil) }.should raise_error(ArgumentError)
+    end
+  end
+
+  describe "pos tagging" do
+    let(:pos_tagger) { subject.new(model) }
+
+    it "should tag parts of a provided document" do
+      tagged = pos_tagger.tag("The quick brown fox jumps over the lazy dog.")
+      tagged.should == "The/DT quick/JJ brown/JJ fox/NN jumps/NNS over/IN the/DT lazy/JJ dog./NN"
+    end
+
+    it "should tag provided tokens" do
+      tagged = pos_tagger.tag(%w(The quick brown fox jumps over the lazy dog .))
+      tagged.to_ary.should == %w(DT JJ JJ NN NNS IN DT JJ NN .)
+    end
+
+    it "should raise an ArgumentError for a non-string" do
+      lambda { pos_tagger.tag(nil) }.should raise_error(ArgumentError)
+    end
+  end
+end
data/spec/sentence_detector_spec.rb
ADDED
@@ -0,0 +1,37 @@
+require "spec_helper"
+
+describe OpenNlp::SentenceDetector do
+  subject { OpenNlp::SentenceDetector }
+
+  let(:model) { OpenNlp::Model::SentenceDetector.new(File.join(FIXTURES_DIR, "en-sent.bin")) }
+
+  describe "initialization" do
+    it "should initialize with a valid model" do
+      sent_detector = subject.new(model)
+      sent_detector.should be_a(subject)
+      sent_detector.j_instance.should be_a(subject.java_class)
+    end
+
+    it "should raise an ArgumentError without a valid model" do
+      lambda { subject.new(nil) }.should raise_error(ArgumentError)
+    end
+  end
+
+  describe "sentence detection" do
+    let(:sent_detector) { subject.new(model) }
+
+    it "should detect no sentences in an empty string" do
+      sentences = sent_detector.detect("")
+      sentences.should == []
+    end
+
+    it "should detect sentences in a string" do
+      sentences = sent_detector.detect("The sky is blue. The Grass is green.")
+      sentences.should == ["The sky is blue.", "The Grass is green."]
+    end
+
+    it "should raise an ArgumentError for a non-string" do
+      lambda { sent_detector.detect(nil) }.should raise_error(ArgumentError)
+    end
+  end
+end
data/spec/spec_helper.rb
ADDED
data/spec/tokenizer_spec.rb
ADDED
@@ -0,0 +1,36 @@
+require "spec_helper"
+
+describe OpenNlp::Tokenizer do
+  subject { OpenNlp::Tokenizer }
+
+  let(:model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+
+  describe "initialization" do
+    it "should initialize a new tokenizer" do
+      tokenizer = subject.new(model)
+      tokenizer.should be_a(subject)
+    end
+
+    it "should raise an argument error when no model is supplied" do
+      lambda { subject.new(nil) }.should raise_error(ArgumentError)
+    end
+  end
+
+  describe "tokenize a string" do
+    let(:tokenizer) { subject.new(model) }
+
+    it "should tokenize an empty string" do
+      tokens = tokenizer.tokenize("")
+      tokens.should == []
+    end
+
+    it "should tokenize a sentence" do
+      tokens = tokenizer.tokenize("The red fox sleeps soundly.")
+      tokens.should == ["The", "red", "fox", "sleeps", "soundly", "."]
+    end
+
+    it "should raise an error when not passed a string" do
+      lambda { tokenizer.tokenize(nil) }.should raise_error(ArgumentError)
+    end
+  end
+end
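spec/spec_helper.rb (+5 lines) is listed as ADDED above but its body is not expanded in this diff. Since every spec requires it and refers to a FIXTURES_DIR constant, a plausible sketch would be (an assumption, not the verbatim file):

    require 'rspec'
    require 'open_nlp'

    # Helper constant the specs use to locate the fixture model files.
    FIXTURES_DIR = File.join(File.dirname(__FILE__), 'fixtures')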
metadata
ADDED
@@ -0,0 +1,105 @@
+--- !ruby/object:Gem::Specification
+name: open_nlp
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.0.1
+platform: java
+authors:
+- Hck
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-09-21 00:00:00.000000000 Z
+dependencies: []
+description: JRuby tools wrapper for Apache OpenNLP
+email:
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- lib/open_nlp.rb
+- lib/open_nlp/chunker.rb
+- lib/open_nlp/model.rb
+- lib/open_nlp/model/chunker.rb
+- lib/open_nlp/model/detokenizer.rb
+- lib/open_nlp/model/named_entity_detector.rb
+- lib/open_nlp/model/pos_tagger.rb
+- lib/open_nlp/model/sentence_detector.rb
+- lib/open_nlp/model/tokenizer.rb
+- lib/open_nlp/named_entity_detector.rb
+- lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar
+- lib/open_nlp/opennlp-tools-1.5.2-incubating.jar
+- lib/open_nlp/pos_tagger.rb
+- lib/open_nlp/sentence_detector.rb
+- lib/open_nlp/tokenizer.rb
+- lib/open_nlp/tool.rb
+- lib/open_nlp/version.rb
+- open_nlp.gemspec
+- spec/chunker_spec.rb
+- spec/fixtures/en-chunker.bin
+- spec/fixtures/en-detokenizer.xml
+- spec/fixtures/en-ner-time.bin
+- spec/fixtures/en-pos-maxent.bin
+- spec/fixtures/en-sent.bin
+- spec/fixtures/en-token.bin
+- spec/model/chunker_spec.rb
+- spec/model/detokenizer_spec.rb
+- spec/model/named_entity_detector_spec.rb
+- spec/model/pos_tagger_spec.rb
+- spec/model/sentence_detector_spec.rb
+- spec/model/tokenizer_spec.rb
+- spec/named_entity_detector_spec.rb
+- spec/pos_tagger_spec.rb
+- spec/sentence_detector_spec.rb
+- spec/spec_helper.rb
+- spec/tokenizer_spec.rb
+homepage:
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: !binary |-
+        MA==
+  none: false
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: !binary |-
+        MA==
+  none: false
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: A JRuby wrapper for the Apache OpenNLP tools library
+test_files:
+- spec/chunker_spec.rb
+- spec/fixtures/en-chunker.bin
+- spec/fixtures/en-detokenizer.xml
+- spec/fixtures/en-ner-time.bin
+- spec/fixtures/en-pos-maxent.bin
+- spec/fixtures/en-sent.bin
+- spec/fixtures/en-token.bin
+- spec/model/chunker_spec.rb
+- spec/model/detokenizer_spec.rb
+- spec/model/named_entity_detector_spec.rb
+- spec/model/pos_tagger_spec.rb
+- spec/model/sentence_detector_spec.rb
+- spec/model/tokenizer_spec.rb
+- spec/named_entity_detector_spec.rb
+- spec/pos_tagger_spec.rb
+- spec/sentence_detector_spec.rb
+- spec/spec_helper.rb
+- spec/tokenizer_spec.rb