open_nlp 0.0.1-java

Files changed (42)
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +1 -0
  6. data/lib/open_nlp/chunker.rb +42 -0
  7. data/lib/open_nlp/model/chunker.rb +3 -0
  8. data/lib/open_nlp/model/detokenizer.rb +3 -0
  9. data/lib/open_nlp/model/named_entity_detector.rb +3 -0
  10. data/lib/open_nlp/model/pos_tagger.rb +3 -0
  11. data/lib/open_nlp/model/sentence_detector.rb +3 -0
  12. data/lib/open_nlp/model/tokenizer.rb +3 -0
  13. data/lib/open_nlp/model.rb +28 -0
  14. data/lib/open_nlp/named_entity_detector.rb +10 -0
  15. data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
  16. data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
  17. data/lib/open_nlp/pos_tagger.rb +10 -0
  18. data/lib/open_nlp/sentence_detector.rb +10 -0
  19. data/lib/open_nlp/tokenizer.rb +10 -0
  20. data/lib/open_nlp/tool.rb +20 -0
  21. data/lib/open_nlp/version.rb +3 -0
  22. data/lib/open_nlp.rb +20 -0
  23. data/open_nlp.gemspec +19 -0
  24. data/spec/chunker_spec.rb +38 -0
  25. data/spec/fixtures/en-chunker.bin +0 -0
  26. data/spec/fixtures/en-detokenizer.xml +107 -0
  27. data/spec/fixtures/en-ner-time.bin +0 -0
  28. data/spec/fixtures/en-pos-maxent.bin +0 -0
  29. data/spec/fixtures/en-sent.bin +0 -0
  30. data/spec/fixtures/en-token.bin +0 -0
  31. data/spec/model/chunker_spec.rb +23 -0
  32. data/spec/model/detokenizer_spec.rb +23 -0
  33. data/spec/model/named_entity_detector_spec.rb +23 -0
  34. data/spec/model/pos_tagger_spec.rb +23 -0
  35. data/spec/model/sentence_detector_spec.rb +23 -0
  36. data/spec/model/tokenizer_spec.rb +23 -0
  37. data/spec/named_entity_detector_spec.rb +42 -0
  38. data/spec/pos_tagger_spec.rb +37 -0
  39. data/spec/sentence_detector_spec.rb +37 -0
  40. data/spec/spec_helper.rb +5 -0
  41. data/spec/tokenizer_spec.rb +36 -0
  42. metadata +105 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in open_nlp.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2012 Alexander Svirin
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # OpenNlp
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'open_nlp'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install open_nlp
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create new Pull Request
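The Usage section above is still a TODO; judging from the specs further down in this change, a minimal quick start looks roughly like the sketch below (JRuby only; the model path is illustrative and must point to a real OpenNLP tokenizer model, such as the en-token.bin shipped in spec/fixtures):

```ruby
require 'open_nlp'  # loads the bundled OpenNLP jars; requires JRuby

# Assumed path: any Apache OpenNLP tokenizer model file works here.
model     = OpenNlp::Model::Tokenizer.new("spec/fixtures/en-token.bin")
tokenizer = OpenNlp::Tokenizer.new(model)

tokenizer.tokenize("The red fox sleeps soundly.")
# => ["The", "red", "fox", "sleeps", "soundly", "."]
```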
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require "bundler/gem_tasks"
data/lib/open_nlp/chunker.rb ADDED
@@ -0,0 +1,42 @@
+ module OpenNlp
+   class Chunker < Tool
+     self.java_class = Java::opennlp.tools.chunker.ChunkerME
+
+     def initialize(model, token_model, pos_model)
+       #raise ArgumentError, "model must be an OpenNlp::Chunker::Model" unless model.is_a?(Chunker::Model)
+       super(model)
+
+       raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
+       raise ArgumentError, "model must be an OpenNlp::POSTagger::Model" unless pos_model.is_a?(Model::POSTagger)
+
+       @tokenizer = Tokenizer.new(token_model)
+       @pos_tagger = POSTagger.new(pos_model)
+       #@j_instance = self.java_class.new(model.j_model)
+     end
+
+     def chunk(str)
+       raise ArgumentError, "str must be a String" unless str.is_a?(String)
+
+       tokens = @tokenizer.tokenize(str)
+       pos_tags = @pos_tagger.tag(tokens).to_ary
+
+       chunks = @j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary
+
+       build_chunks(chunks, tokens, pos_tags)
+     end
+
+     private
+     def build_chunks(chunks, tokens, pos_tags)
+       # data[i] = [token, pos_tag, chunk_val]
+       data = tokens.zip(pos_tags, chunks)
+
+       data.inject([]) do |acc, val|
+         chunk = val[2]
+         acc << [val[0]] if chunk[0] == 'B'
+         acc.last << val[0] if chunk[0] == 'I'
+
+         acc
+       end
+     end
+   end
+ end
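A sketch of how the chunker is driven, pieced together from spec/chunker_spec.rb below. The three fixture paths are the ones used by the specs, and `chunk` groups tokens into phrases, keeping only tokens whose chunk tag starts with B or I:

```ruby
require 'open_nlp'

# Fixture paths as used by the specs; substitute your own OpenNLP model files.
chunk_model = OpenNlp::Model::Chunker.new("spec/fixtures/en-chunker.bin")
token_model = OpenNlp::Model::Tokenizer.new("spec/fixtures/en-token.bin")
pos_model   = OpenNlp::Model::POSTagger.new("spec/fixtures/en-pos-maxent.bin")

chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
chunker.chunk("The red fox sleeps soundly.")
# => [["The", "red", "fox", "sleeps"], ["soundly"]]   (expected output per spec/chunker_spec.rb)
```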
data/lib/open_nlp/model/chunker.rb ADDED
@@ -0,0 +1,3 @@
+ class OpenNlp::Model::Chunker < OpenNlp::Model
+   self.java_class_name = Java::opennlp.tools.chunker.ChunkerModel
+ end
data/lib/open_nlp/model/detokenizer.rb ADDED
@@ -0,0 +1,3 @@
+ class OpenNlp::Model::Detokenizer < OpenNlp::Model
+   self.java_class_name = Java::opennlp.tools.tokenize.DetokenizationDictionary
+ end
data/lib/open_nlp/model/named_entity_detector.rb ADDED
@@ -0,0 +1,3 @@
+ class OpenNlp::Model::NamedEntityDetector < OpenNlp::Model
+   self.java_class_name = Java::opennlp.tools.namefind.TokenNameFinderModel
+ end
data/lib/open_nlp/model/pos_tagger.rb ADDED
@@ -0,0 +1,3 @@
+ class OpenNlp::Model::POSTagger < OpenNlp::Model
+   self.java_class_name = Java::opennlp.tools.postag.POSModel
+ end
data/lib/open_nlp/model/sentence_detector.rb ADDED
@@ -0,0 +1,3 @@
+ class OpenNlp::Model::SentenceDetector < OpenNlp::Model
+   self.java_class_name = Java::opennlp.tools.sentdetect.SentenceModel
+ end
data/lib/open_nlp/model/tokenizer.rb ADDED
@@ -0,0 +1,3 @@
+ class OpenNlp::Model::Tokenizer < OpenNlp::Model
+   self.java_class_name = Java::opennlp.tools.tokenize.TokenizerModel
+ end
data/lib/open_nlp/model.rb ADDED
@@ -0,0 +1,28 @@
+ module OpenNlp
+   class Model
+     attr_reader :j_model
+
+     def initialize(model)
+       model_stream = case model
+         when java.io.FileInputStream
+           model
+         when String
+           java.io.FileInputStream.new(model)
+         else
+           raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
+       end
+
+       @j_model = self.class.java_class_name.new(model_stream)
+     end
+
+     class << self
+       def java_class_name=(value)
+         @java_class = value
+       end
+
+       def java_class_name
+         @java_class
+       end
+     end
+   end
+ end
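As the model specs further down exercise, every Model subclass accepts either a filename string or an already-open java.io.FileInputStream. A brief sketch (fixture path as used in the specs):

```ruby
# Both forms load the same Java model object (see spec/model/*_spec.rb).
model = OpenNlp::Model::Tokenizer.new("spec/fixtures/en-token.bin")

stream = java.io.FileInputStream.new("spec/fixtures/en-token.bin")
model  = OpenNlp::Model::Tokenizer.new(stream)

model.j_model  # => a Java::opennlp.tools.tokenize.TokenizerModel instance
```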
data/lib/open_nlp/named_entity_detector.rb ADDED
@@ -0,0 +1,10 @@
+ module OpenNlp
+   class NamedEntityDetector < Tool
+     self.java_class = Java::opennlp.tools.namefind.NameFinderME
+
+     def detect(tokens)
+       raise ArgumentError, "tokens must be an instance of Array" unless tokens.is_a?(Array)
+       @j_instance.find(tokens.to_java(:String)).to_ary
+     end
+   end
+ end
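Usage sketch, mirroring spec/named_entity_detector_spec.rb below: the en-ner-time.bin fixture finds time expressions, and `detect` returns OpenNLP Span objects over the token array.

```ruby
ner_model   = OpenNlp::Model::NamedEntityDetector.new("spec/fixtures/en-ner-time.bin")
ne_detector = OpenNlp::NamedEntityDetector.new(ner_model)

spans = ne_detector.detect(%w(The time is 10 : 23 am))
spans.first.class                           # => Java::opennlp.tools.util.Span
[spans.first.getStart, spans.first.getEnd]  # => [3, 7] per the spec
```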
data/lib/open_nlp/pos_tagger.rb ADDED
@@ -0,0 +1,10 @@
+ module OpenNlp
+   class POSTagger < Tool
+     self.java_class = Java::opennlp.tools.postag.POSTaggerME
+
+     def tag(tokens)
+       raise ArgumentError, "tokens must be an instance of String or Array" unless (tokens.is_a?(Array) || tokens.is_a?(String))
+       @j_instance.tag(tokens.to_java(:String))
+     end
+   end
+ end
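Usage sketch based on spec/pos_tagger_spec.rb: `tag` accepts either a raw string or a token array; the array form returns a Java string array of Penn Treebank tags.

```ruby
pos_model = OpenNlp::Model::POSTagger.new("spec/fixtures/en-pos-maxent.bin")
tagger    = OpenNlp::POSTagger.new(pos_model)

tagger.tag(%w(The quick brown fox jumps over the lazy dog .)).to_ary
# => ["DT", "JJ", "JJ", "NN", "NNS", "IN", "DT", "JJ", "NN", "."]
```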
data/lib/open_nlp/sentence_detector.rb ADDED
@@ -0,0 +1,10 @@
+ module OpenNlp
+   class SentenceDetector < Tool
+     self.java_class = Java::opennlp.tools.sentdetect.SentenceDetectorME
+
+     def detect(string)
+       raise ArgumentError, "string must be a String" unless string.is_a?(String)
+       @j_instance.sentDetect(string).to_ary
+     end
+   end
+ end
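Usage sketch mirroring spec/sentence_detector_spec.rb (fixture path as used by the specs):

```ruby
sent_model    = OpenNlp::Model::SentenceDetector.new("spec/fixtures/en-sent.bin")
sent_detector = OpenNlp::SentenceDetector.new(sent_model)

sent_detector.detect("The sky is blue. The Grass is green.")
# => ["The sky is blue.", "The Grass is green."]
```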
data/lib/open_nlp/tokenizer.rb ADDED
@@ -0,0 +1,10 @@
+ module OpenNlp
+   class Tokenizer < Tool
+     self.java_class = Java::opennlp.tools.tokenize.TokenizerME
+
+     def tokenize(str)
+       raise ArgumentError, "str must be a String" unless str.is_a?(String)
+       @j_instance.tokenize(str).to_ary
+     end
+   end
+ end
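Usage sketch mirroring spec/tokenizer_spec.rb; an empty string yields an empty array, per the spec.

```ruby
token_model = OpenNlp::Model::Tokenizer.new("spec/fixtures/en-token.bin")
tokenizer   = OpenNlp::Tokenizer.new(token_model)

tokenizer.tokenize("")                         # => []
tokenizer.tokenize("The red fox sleeps soundly.")
# => ["The", "red", "fox", "sleeps", "soundly", "."]
```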
data/lib/open_nlp/tool.rb ADDED
@@ -0,0 +1,20 @@
+ module OpenNlp
+   class Tool
+     attr_reader :j_instance
+
+     def initialize(model)
+       raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
+       @j_instance = self.class.java_class.new(model.j_model)
+     end
+
+     class << self
+       def java_class=(value)
+         @java_class = value
+       end
+
+       def java_class
+         @java_class
+       end
+     end
+   end
+ end
data/lib/open_nlp/version.rb ADDED
@@ -0,0 +1,3 @@
+ module OpenNlp
+   VERSION = '0.0.1'
+ end
data/lib/open_nlp.rb ADDED
@@ -0,0 +1,20 @@
+ require 'open_nlp/version'
+
+ require 'java'
+ require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
+ require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'
+
+ require 'open_nlp/model'
+ require 'open_nlp/model/chunker'
+ require 'open_nlp/model/detokenizer'
+ require 'open_nlp/model/named_entity_detector'
+ require 'open_nlp/model/pos_tagger'
+ require 'open_nlp/model/sentence_detector'
+ require 'open_nlp/model/tokenizer'
+
+ require 'open_nlp/tool'
+ require 'open_nlp/named_entity_detector'
+ require 'open_nlp/pos_tagger'
+ require 'open_nlp/sentence_detector'
+ require 'open_nlp/tokenizer'
+ require 'open_nlp/chunker'
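Because the entry point requires 'java' and the two bundled jars, the gem only works under JRuby; once required, both the Ruby wrappers and the underlying Java classes are reachable (a minimal sketch):

```ruby
# JRuby only: a single require pulls in the bundled OpenNLP/maxent jars
# and all of the wrapper classes listed above.
require 'open_nlp'

OpenNlp::VERSION                           # => "0.0.1"
Java::opennlp.tools.tokenize.TokenizerME   # Java classes are now on the classpath
```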
data/open_nlp.gemspec ADDED
@@ -0,0 +1,19 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'open_nlp/version'
+
+ Gem::Specification.new do |gem|
+   gem.name = "open_nlp"
+   gem.version = OpenNlp::VERSION
+   gem.authors = ["Hck"]
+   gem.description = %q{JRuby tools wrapper for Apache OpenNLP}
+   gem.summary = %q{A JRuby wrapper for the Apache OpenNLP tools library}
+
+   gem.files = `git ls-files`.split($/)
+   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+
+   gem.platform = "java"
+ end
data/spec/chunker_spec.rb ADDED
@@ -0,0 +1,38 @@
+ require "spec_helper"
+
+ describe OpenNlp::Chunker do
+   subject { OpenNlp::Chunker }
+
+   let(:model) { OpenNlp::Model::Chunker.new(File.join(FIXTURES_DIR, "en-chunker.bin")) }
+   let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+   let(:pos_model) { OpenNlp::Model::POSTagger.new(File.join(FIXTURES_DIR, "en-pos-maxent.bin")) }
+
+   describe "initialization" do
+     it "should initialize a new chunker" do
+       chunker = subject.new(model, token_model, pos_model)
+       chunker.should be_a(subject)
+     end
+
+     it "should raise an argument error when no model is supplied" do
+       lambda { subject.new(nil, nil, nil) }.should raise_error(ArgumentError)
+     end
+   end
+
+   describe "chunking a string" do
+     let(:chunker) { subject.new(model, token_model, pos_model) }
+
+     it "should chunk an empty string" do
+       chunks = chunker.chunk("")
+       chunks.should == []
+     end
+
+     it "should chunk a sentence" do
+       chunks = chunker.chunk("The red fox sleeps soundly.")
+       chunks.should == [["The", "red", "fox", "sleeps"], ["soundly"]]
+     end
+
+     it "should raise an error when not passed a string" do
+       lambda { chunker.chunk(nil) }.should raise_error(ArgumentError)
+     end
+   end
+ end
data/spec/fixtures/en-chunker.bin ADDED
Binary file
data/spec/fixtures/en-detokenizer.xml ADDED
@@ -0,0 +1,107 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+
+ <!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements. See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership. The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License. You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations
+    under the License.
+ -->
+
+ <dictionary>
+   <entry operation="RIGHT_LEFT_MATCHING">
+     <token>"</token>
+   </entry>
+   <entry operation="RIGHT_LEFT_MATCHING">
+     <token>'</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>.</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>?</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>!</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>,</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>;</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>:</token>
+   </entry>
+   <entry operation="MOVE_RIGHT">
+     <token>(</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>)</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>}</token>
+   </entry>
+   <entry operation="MOVE_RIGHT">
+     <token>{</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>]</token>
+   </entry>
+   <entry operation="MOVE_RIGHT">
+     <token>[</token>
+   </entry>
+   <entry operation="MOVE_RIGHT">
+     <token>``</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>''</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>%</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>n't</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>'ve</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>'d</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>'ll</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>'s</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>'re</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>'m</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>.org</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>.com</token>
+   </entry>
+   <entry operation="MOVE_LEFT">
+     <token>.net</token>
+   </entry>
+   <entry operation="MOVE_RIGHT">
+     <token>#</token>
+   </entry>
+ </dictionary>
data/spec/fixtures/en-ner-time.bin ADDED
Binary file
data/spec/fixtures/en-pos-maxent.bin ADDED
Binary file
data/spec/fixtures/en-sent.bin ADDED
Binary file
data/spec/fixtures/en-token.bin ADDED
Binary file
data/spec/model/chunker_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require "spec_helper"
+
+ describe OpenNlp::Model::Chunker do
+   subject { OpenNlp::Model::Chunker }
+   let(:model_file_name) { File.join(FIXTURES_DIR, "en-chunker.bin") }
+
+   it "should accept a string filename parameter" do
+     chunker_model = subject.new(model_file_name)
+     chunker_model.should be_a(subject)
+     chunker_model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should accept a java.io.FileInputStream object" do
+     file_input_stream = java.io.FileInputStream.new(model_file_name)
+     chunker_model = subject.new(file_input_stream)
+     chunker_model.should be_a(subject)
+     chunker_model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should raise an argument error otherwise" do
+     lambda { subject.new(nil) }.should raise_error(ArgumentError)
+   end
+ end
data/spec/model/detokenizer_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require "spec_helper"
+
+ describe OpenNlp::Model::Detokenizer do
+   subject { OpenNlp::Model::Detokenizer }
+   let(:model_file_name) { File.join(FIXTURES_DIR, "en-detokenizer.xml") }
+
+   it "should accept a string filename parameter" do
+     model = subject.new(model_file_name)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should accept a java.io.FileInputStream object" do
+     file_input_stream = java.io.FileInputStream.new(model_file_name)
+     model = subject.new(file_input_stream)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should raise an argument error otherwise" do
+     lambda { subject.new(nil) }.should raise_error(ArgumentError)
+   end
+ end
data/spec/model/named_entity_detector_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require "spec_helper"
+
+ describe OpenNlp::Model::NamedEntityDetector do
+   subject { OpenNlp::Model::NamedEntityDetector }
+   let(:model_file_name) { File.join(FIXTURES_DIR, "en-ner-time.bin") }
+
+   it "should accept a string filename parameter" do
+     model = subject.new(model_file_name)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should accept a java.io.FileInputStream object" do
+     file_input_stream = java.io.FileInputStream.new(model_file_name)
+     model = subject.new(file_input_stream)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should raise an argument error otherwise" do
+     lambda { subject.new(nil) }.should raise_error(ArgumentError)
+   end
+ end
data/spec/model/pos_tagger_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require "spec_helper"
+
+ describe OpenNlp::Model::POSTagger do
+   subject { OpenNlp::Model::POSTagger }
+   let(:model_file_name) { File.join(FIXTURES_DIR, "en-pos-maxent.bin") }
+
+   it "should accept a string filename parameter" do
+     model = subject.new(model_file_name)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should accept a java.io.FileInputStream object" do
+     file_input_stream = java.io.FileInputStream.new(model_file_name)
+     model = subject.new(file_input_stream)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should raise an argument error otherwise" do
+     lambda { subject.new(nil) }.should raise_error(ArgumentError)
+   end
+ end
data/spec/model/sentence_detector_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require "spec_helper"
+
+ describe OpenNlp::Model::SentenceDetector do
+   subject { OpenNlp::Model::SentenceDetector }
+   let(:model_file_name) { File.join(FIXTURES_DIR, "en-sent.bin") }
+
+   it "should accept a string filename parameter" do
+     model = subject.new(model_file_name)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should accept a java.io.FileInputStream object" do
+     file_input_stream = java.io.FileInputStream.new(model_file_name)
+     model = subject.new(file_input_stream)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should raise an argument error otherwise" do
+     lambda { subject.new(nil) }.should raise_error(ArgumentError)
+   end
+ end
data/spec/model/tokenizer_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require "spec_helper"
+
+ describe OpenNlp::Model::Tokenizer do
+   subject { OpenNlp::Model::Tokenizer }
+   let(:model_file_name) { File.join(FIXTURES_DIR, "en-token.bin") }
+
+   it "should accept a string filename parameter" do
+     model = subject.new(model_file_name)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should accept a java.io.FileInputStream object" do
+     file_input_stream = java.io.FileInputStream.new(model_file_name)
+     model = subject.new(file_input_stream)
+     model.should be_a(subject)
+     model.j_model.should be_a(subject.java_class_name)
+   end
+
+   it "should raise an argument error otherwise" do
+     lambda { subject.new(nil) }.should raise_error(ArgumentError)
+   end
+ end
data/spec/named_entity_detector_spec.rb ADDED
@@ -0,0 +1,42 @@
+ require "spec_helper"
+
+ describe OpenNlp::NamedEntityDetector do
+   subject { OpenNlp::NamedEntityDetector }
+
+   let(:model) { OpenNlp::Model::NamedEntityDetector.new(File.join(FIXTURES_DIR, "en-ner-time.bin")) }
+
+   describe "initialization" do
+     it "should initialize with a valid model" do
+       ne_detector = subject.new(model)
+       ne_detector.should be_a(subject)
+     end
+
+     it "should raise an ArgumentError otherwise" do
+       lambda { subject.new(nil) }.should raise_error(ArgumentError)
+     end
+   end
+
+   describe "detection" do
+     let(:ne_detector) { subject.new(model) }
+
+     it "should detect nothing in an empty sentence" do
+       spans = ne_detector.detect([])
+       spans.should be_a(Array)
+       spans.length.should == 0
+     end
+
+     it "should detect the named entities" do
+       spans = ne_detector.detect(["The", "time", "is", "10", ":", "23", "am"])
+       spans.should be_a(Array)
+       spans[0].should be_a(Java::opennlp.tools.util.Span)
+       spans[0].getStart.should == 3
+       spans[0].getEnd.should == 7
+     end
+
+     it "should raise an error if anything but an array is passed" do
+       lambda { ne_detector.detect(nil) }.should raise_error(ArgumentError)
+       lambda { ne_detector.detect('str') }.should raise_error(ArgumentError)
+       lambda { ne_detector.detect(111) }.should raise_error(ArgumentError)
+     end
+   end
+ end
data/spec/pos_tagger_spec.rb ADDED
@@ -0,0 +1,37 @@
+ require "spec_helper"
+
+ describe OpenNlp::POSTagger do
+   subject { OpenNlp::POSTagger }
+
+   let(:model) { OpenNlp::Model::POSTagger.new(File.join(FIXTURES_DIR, "en-pos-maxent.bin")) }
+
+   describe "initialization" do
+     it "should initialize with a valid model" do
+       tagger = subject.new(model)
+       tagger.should be_a(subject)
+       tagger.j_instance.should be_a(subject.java_class)
+     end
+
+     it "should raise an ArgumentError without a valid model" do
+       lambda { subject.new(nil) }.should raise_error(ArgumentError)
+     end
+   end
+
+   describe "pos tagging" do
+     let(:pos_tagger) { subject.new(model) }
+
+     it "should tag parts of a provided document" do
+       tagged = pos_tagger.tag("The quick brown fox jumps over the lazy dog.")
+       tagged.should == "The/DT quick/JJ brown/JJ fox/NN jumps/NNS over/IN the/DT lazy/JJ dog./NN"
+     end
+
+     it "should tag provided tokens" do
+       tagged = pos_tagger.tag(%w(The quick brown fox jumps over the lazy dog .))
+       tagged.to_ary.should == %w(DT JJ JJ NN NNS IN DT JJ NN .)
+     end
+
+     it "should raise an ArgumentError for a non-string" do
+       lambda { pos_tagger.tag(nil) }.should raise_error(ArgumentError)
+     end
+   end
+ end
data/spec/sentence_detector_spec.rb ADDED
@@ -0,0 +1,37 @@
+ require "spec_helper"
+
+ describe OpenNlp::SentenceDetector do
+   subject { OpenNlp::SentenceDetector }
+
+   let(:model) { OpenNlp::Model::SentenceDetector.new(File.join(FIXTURES_DIR, "en-sent.bin")) }
+
+   describe "initialization" do
+     it "should initialize with a valid model" do
+       sent_detector = subject.new(model)
+       sent_detector.should be_a(subject)
+       sent_detector.j_instance.should be_a(subject.java_class)
+     end
+
+     it "should raise an ArgumentError without a valid model" do
+       lambda { subject.new(nil) }.should raise_error(ArgumentError)
+     end
+   end
+
+   describe "sentence detection" do
+     let(:sent_detector) { subject.new(model) }
+
+     it "should detect no sentences in an empty string" do
+       sentences = sent_detector.detect("")
+       sentences.should == []
+     end
+
+     it "should detect sentences in a string" do
+       sentences = sent_detector.detect("The sky is blue. The Grass is green.")
+       sentences.should == ["The sky is blue.", "The Grass is green."]
+     end
+
+     it "should raise an ArgumentError for a non-string" do
+       lambda { sent_detector.detect(nil) }.should raise_error(ArgumentError)
+     end
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,5 @@
+ require 'rubygems'
+ require 'java'
+ require 'open_nlp'
+
+ FIXTURES_DIR = File.join(File.dirname(__FILE__), "fixtures")
data/spec/tokenizer_spec.rb ADDED
@@ -0,0 +1,36 @@
+ require "spec_helper"
+
+ describe OpenNlp::Tokenizer do
+   subject { OpenNlp::Tokenizer }
+
+   let(:model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+
+   describe "initialization" do
+     it "should initialize a new tokenizer" do
+       tokenizer = subject.new(model)
+       tokenizer.should be_a(subject)
+     end
+
+     it "should raise an argument error when no model is supplied" do
+       lambda { subject.new(nil) }.should raise_error(ArgumentError)
+     end
+   end
+
+   describe "tokenize a string" do
+     let(:tokenizer) { subject.new(model) }
+
+     it "should tokenize an empty string" do
+       tokens = tokenizer.tokenize("")
+       tokens.should == []
+     end
+
+     it "should tokenize a sentence" do
+       tokens = tokenizer.tokenize("The red fox sleeps soundly.")
+       tokens.should == ["The", "red", "fox", "sleeps", "soundly", "."]
+     end
+
+     it "should raise an error when not passed a string" do
+       lambda { tokenizer.tokenize(nil) }.should raise_error(ArgumentError)
+     end
+   end
+ end
metadata ADDED
@@ -0,0 +1,105 @@
+ --- !ruby/object:Gem::Specification
+ name: open_nlp
+ version: !ruby/object:Gem::Version
+   prerelease:
+   version: 0.0.1
+ platform: java
+ authors:
+ - Hck
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-09-21 00:00:00.000000000 Z
+ dependencies: []
+ description: JRuby tools wrapper for Apache OpenNLP
+ email:
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - lib/open_nlp.rb
+ - lib/open_nlp/chunker.rb
+ - lib/open_nlp/model.rb
+ - lib/open_nlp/model/chunker.rb
+ - lib/open_nlp/model/detokenizer.rb
+ - lib/open_nlp/model/named_entity_detector.rb
+ - lib/open_nlp/model/pos_tagger.rb
+ - lib/open_nlp/model/sentence_detector.rb
+ - lib/open_nlp/model/tokenizer.rb
+ - lib/open_nlp/named_entity_detector.rb
+ - lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar
+ - lib/open_nlp/opennlp-tools-1.5.2-incubating.jar
+ - lib/open_nlp/pos_tagger.rb
+ - lib/open_nlp/sentence_detector.rb
+ - lib/open_nlp/tokenizer.rb
+ - lib/open_nlp/tool.rb
+ - lib/open_nlp/version.rb
+ - open_nlp.gemspec
+ - spec/chunker_spec.rb
+ - spec/fixtures/en-chunker.bin
+ - spec/fixtures/en-detokenizer.xml
+ - spec/fixtures/en-ner-time.bin
+ - spec/fixtures/en-pos-maxent.bin
+ - spec/fixtures/en-sent.bin
+ - spec/fixtures/en-token.bin
+ - spec/model/chunker_spec.rb
+ - spec/model/detokenizer_spec.rb
+ - spec/model/named_entity_detector_spec.rb
+ - spec/model/pos_tagger_spec.rb
+ - spec/model/sentence_detector_spec.rb
+ - spec/model/tokenizer_spec.rb
+ - spec/named_entity_detector_spec.rb
+ - spec/pos_tagger_spec.rb
+ - spec/sentence_detector_spec.rb
+ - spec/spec_helper.rb
+ - spec/tokenizer_spec.rb
+ homepage:
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: !binary |-
+         MA==
+   none: false
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: !binary |-
+         MA==
+   none: false
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: A JRuby wrapper for the Apache OpenNLP tools library
+ test_files:
+ - spec/chunker_spec.rb
+ - spec/fixtures/en-chunker.bin
+ - spec/fixtures/en-detokenizer.xml
+ - spec/fixtures/en-ner-time.bin
+ - spec/fixtures/en-pos-maxent.bin
+ - spec/fixtures/en-sent.bin
+ - spec/fixtures/en-token.bin
+ - spec/model/chunker_spec.rb
+ - spec/model/detokenizer_spec.rb
+ - spec/model/named_entity_detector_spec.rb
+ - spec/model/pos_tagger_spec.rb
+ - spec/model/sentence_detector_spec.rb
+ - spec/model/tokenizer_spec.rb
+ - spec/named_entity_detector_spec.rb
+ - spec/pos_tagger_spec.rb
+ - spec/sentence_detector_spec.rb
+ - spec/spec_helper.rb
+ - spec/tokenizer_spec.rb