open_nlp 0.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +1 -0
  6. data/lib/open_nlp/chunker.rb +42 -0
  7. data/lib/open_nlp/model/chunker.rb +3 -0
  8. data/lib/open_nlp/model/detokenizer.rb +3 -0
  9. data/lib/open_nlp/model/named_entity_detector.rb +3 -0
  10. data/lib/open_nlp/model/pos_tagger.rb +3 -0
  11. data/lib/open_nlp/model/sentence_detector.rb +3 -0
  12. data/lib/open_nlp/model/tokenizer.rb +3 -0
  13. data/lib/open_nlp/model.rb +28 -0
  14. data/lib/open_nlp/named_entity_detector.rb +10 -0
  15. data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
  16. data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
  17. data/lib/open_nlp/pos_tagger.rb +10 -0
  18. data/lib/open_nlp/sentence_detector.rb +10 -0
  19. data/lib/open_nlp/tokenizer.rb +10 -0
  20. data/lib/open_nlp/tool.rb +20 -0
  21. data/lib/open_nlp/version.rb +3 -0
  22. data/lib/open_nlp.rb +20 -0
  23. data/open_nlp.gemspec +19 -0
  24. data/spec/chunker_spec.rb +38 -0
  25. data/spec/fixtures/en-chunker.bin +0 -0
  26. data/spec/fixtures/en-detokenizer.xml +107 -0
  27. data/spec/fixtures/en-ner-time.bin +0 -0
  28. data/spec/fixtures/en-pos-maxent.bin +0 -0
  29. data/spec/fixtures/en-sent.bin +0 -0
  30. data/spec/fixtures/en-token.bin +0 -0
  31. data/spec/model/chunker_spec.rb +23 -0
  32. data/spec/model/detokenizer_spec.rb +23 -0
  33. data/spec/model/named_entity_detector_spec.rb +23 -0
  34. data/spec/model/pos_tagger_spec.rb +23 -0
  35. data/spec/model/sentence_detector_spec.rb +23 -0
  36. data/spec/model/tokenizer_spec.rb +23 -0
  37. data/spec/named_entity_detector_spec.rb +42 -0
  38. data/spec/pos_tagger_spec.rb +37 -0
  39. data/spec/sentence_detector_spec.rb +37 -0
  40. data/spec/spec_helper.rb +5 -0
  41. data/spec/tokenizer_spec.rb +36 -0
  42. metadata +105 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in open_nlp.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Alexander Svirin
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # OpenNlp
2
+
3
+ A JRuby wrapper for the Apache OpenNLP tools library.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'open_nlp'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install open_nlp
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,42 @@
1
module OpenNlp
  # Groups the tokens of a sentence into chunks.
  #
  # The chunker owns a Tokenizer and a POSTagger: the input string is
  # tokenized, the tokens are POS-tagged, and tokens plus tags are passed to
  # the wrapped Java ChunkerME instance.
  class Chunker < Tool
    self.java_class = Java::opennlp.tools.chunker.ChunkerME

    # model       - OpenNlp::Model used to build the Java chunker (validated by Tool).
    # token_model - OpenNlp::Model::Tokenizer used for tokenization.
    # pos_model   - OpenNlp::Model::POSTagger used for POS tagging.
    #
    # Raises ArgumentError when any of the models has the wrong type.
    def initialize(model, token_model, pos_model)
      super(model)

      unless token_model.is_a?(Model::Tokenizer)
        raise ArgumentError, "token_model must be an OpenNlp::Model::Tokenizer"
      end
      unless pos_model.is_a?(Model::POSTagger)
        raise ArgumentError, "pos_model must be an OpenNlp::Model::POSTagger"
      end

      @tokenizer = Tokenizer.new(token_model)
      @pos_tagger = POSTagger.new(pos_model)
    end

    # Splits +str+ into chunks.
    #
    # Returns an Array of Arrays of token Strings, one inner Array per chunk.
    # Raises ArgumentError unless +str+ is a String.
    def chunk(str)
      raise ArgumentError, "str must be a String" unless str.is_a?(String)

      tokens = @tokenizer.tokenize(str)
      pos_tags = @pos_tagger.tag(tokens).to_ary

      chunks = @j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary

      build_chunks(chunks, tokens, pos_tags)
    end

    private

    # Folds parallel token / chunk-tag lists into groups of tokens.
    #
    # A tag beginning with "B" opens a new group, a tag beginning with "I"
    # appends to the most recent group, and any other tag drops the token.
    # An "I" tag seen before any group exists opens a group instead of
    # failing on a nil receiver.
    def build_chunks(chunks, tokens, pos_tags)
      tokens.zip(pos_tags, chunks).inject([]) do |acc, (token, _pos_tag, chunk_tag)|
        # start_with? behaves the same on Ruby 1.8 and 1.9, unlike String#[]
        # with an Integer index; to_s tolerates a missing tag.
        tag = chunk_tag.to_s

        if tag.start_with?('B') || (tag.start_with?('I') && acc.empty?)
          acc << [token]
        elsif tag.start_with?('I')
          acc.last << token
        end

        acc
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module OpenNlp
  class Model
    # Model wrapper whose Java counterpart is ChunkerModel.
    class Chunker < Model
      self.java_class_name = Java::opennlp.tools.chunker.ChunkerModel
    end
  end
end
@@ -0,0 +1,3 @@
1
module OpenNlp
  class Model
    # Model wrapper whose Java counterpart is DetokenizationDictionary.
    class Detokenizer < Model
      self.java_class_name = Java::opennlp.tools.tokenize.DetokenizationDictionary
    end
  end
end
@@ -0,0 +1,3 @@
1
module OpenNlp
  class Model
    # Model wrapper whose Java counterpart is TokenNameFinderModel.
    class NamedEntityDetector < Model
      self.java_class_name = Java::opennlp.tools.namefind.TokenNameFinderModel
    end
  end
end
@@ -0,0 +1,3 @@
1
module OpenNlp
  class Model
    # Model wrapper whose Java counterpart is POSModel.
    class POSTagger < Model
      self.java_class_name = Java::opennlp.tools.postag.POSModel
    end
  end
end
@@ -0,0 +1,3 @@
1
module OpenNlp
  class Model
    # Model wrapper whose Java counterpart is SentenceModel.
    class SentenceDetector < Model
      self.java_class_name = Java::opennlp.tools.sentdetect.SentenceModel
    end
  end
end
@@ -0,0 +1,3 @@
1
module OpenNlp
  class Model
    # Model wrapper whose Java counterpart is TokenizerModel.
    class Tokenizer < Model
      self.java_class_name = Java::opennlp.tools.tokenize.TokenizerModel
    end
  end
end
@@ -0,0 +1,28 @@
1
module OpenNlp
  # Base class for model wrappers.
  #
  # Each subclass registers the Java class to instantiate through
  # +java_class_name=+; the instance built from the model data is exposed
  # as +j_model+.
  class Model
    attr_reader :j_model

    # model - either a String path to the model file or an already opened
    #         java.io.FileInputStream.
    #
    # A stream opened here from a String path is closed once the Java model
    # has been constructed; a stream supplied by the caller is left open and
    # remains the caller's responsibility.
    #
    # Raises ArgumentError for any other argument type.
    def initialize(model)
      case model
      when java.io.FileInputStream
        @j_model = self.class.java_class_name.new(model)
      when String
        stream = java.io.FileInputStream.new(model)
        begin
          @j_model = self.class.java_class_name.new(stream)
        ensure
          # We opened this stream, so release the file handle ourselves.
          stream.close
        end
      else
        raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
      end
    end

    class << self
      # Registers the Java class instantiated by +new+ for this class.
      def java_class_name=(value)
        @java_class = value
      end

      # Returns the Java class registered for this class. When none was
      # registered here, falls back to the nearest ancestor below Model that
      # has one, so subclasses of a configured model keep working.
      # Returns nil when nothing is registered anywhere in the chain.
      def java_class_name
        return @java_class if defined?(@java_class) && @java_class
        superclass.java_class_name if superclass <= Model
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module OpenNlp
  # Finds named entities in a tokenized sentence via the Java NameFinderME.
  class NamedEntityDetector < Tool
    self.java_class = Java::opennlp.tools.namefind.NameFinderME

    # tokens - Array of token Strings.
    #
    # Returns a Ruby Array holding whatever the Java finder returned.
    # Raises ArgumentError unless +tokens+ is an Array.
    def detect(tokens)
      unless tokens.is_a?(Array)
        raise ArgumentError, "tokens must be an instance of Array"
      end

      spans = @j_instance.find(tokens.to_java(:String))
      spans.to_ary
    end
  end
end
@@ -0,0 +1,10 @@
1
module OpenNlp
  # Part-of-speech tagger backed by the Java POSTaggerME.
  class POSTagger < Tool
    self.java_class = Java::opennlp.tools.postag.POSTaggerME

    # tokens - an Array of tokens or a String.
    #
    # Returns the value produced by the Java tagger for the converted input.
    # Raises ArgumentError for any other argument type.
    def tag(tokens)
      case tokens
      when Array, String
        @j_instance.tag(tokens.to_java(:String))
      else
        raise ArgumentError, "tokens must be an instance of String or Array"
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module OpenNlp
  # Sentence splitter backed by the Java SentenceDetectorME.
  class SentenceDetector < Tool
    self.java_class = Java::opennlp.tools.sentdetect.SentenceDetectorME

    # string - the text to split.
    #
    # Returns a Ruby Array of the detected sentences.
    # Raises ArgumentError unless +string+ is a String.
    def detect(string)
      unless string.is_a?(String)
        raise ArgumentError, "string must be a String"
      end

      sentences = @j_instance.sentDetect(string)
      sentences.to_ary
    end
  end
end
@@ -0,0 +1,10 @@
1
module OpenNlp
  # Tokenizer backed by the Java TokenizerME.
  class Tokenizer < Tool
    self.java_class = Java::opennlp.tools.tokenize.TokenizerME

    # str - the text to tokenize.
    #
    # Returns a Ruby Array of tokens.
    # Raises ArgumentError unless +str+ is a String.
    def tokenize(str)
      unless str.is_a?(String)
        raise ArgumentError, "str must be a String"
      end

      tokens = @j_instance.tokenize(str)
      tokens.to_ary
    end
  end
end
@@ -0,0 +1,20 @@
1
module OpenNlp
  # Base class for tool wrappers.
  #
  # A subclass registers the Java class to instantiate through
  # +java_class=+; each instance builds that Java object from the given
  # model's +j_model+ and exposes it as +j_instance+.
  class Tool
    attr_reader :j_instance

    # model - an OpenNlp::Model whose +j_model+ is handed to the Java
    #         constructor.
    #
    # Raises ArgumentError unless +model+ is an OpenNlp::Model.
    def initialize(model)
      raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
      @j_instance = self.class.java_class.new(model.j_model)
    end

    class << self
      # Registers the Java class instantiated by +new+ for this class.
      def java_class=(value)
        @java_class = value
      end

      # Returns the Java class registered for this class. When none was
      # registered here, falls back to the nearest ancestor below Tool that
      # has one, so a subclass of a concrete tool does not end up calling
      # +new+ on nil. Returns nil when nothing is registered in the chain.
      def java_class
        return @java_class if defined?(@java_class) && @java_class
        superclass.java_class if superclass <= Tool
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module OpenNlp
  # Version of the gem; read by the gemspec.
  VERSION = "0.0.1"
end
data/lib/open_nlp.rb ADDED
@@ -0,0 +1,20 @@
1
+ require 'open_nlp/version'
2
+
3
+ require 'java'
4
+ require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
5
+ require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'
6
+
7
+ require 'open_nlp/model'
8
+ require 'open_nlp/model/chunker'
9
+ require 'open_nlp/model/detokenizer'
10
+ require 'open_nlp/model/named_entity_detector'
11
+ require 'open_nlp/model/pos_tagger'
12
+ require 'open_nlp/model/sentence_detector'
13
+ require 'open_nlp/model/tokenizer'
14
+
15
+ require 'open_nlp/tool'
16
+ require 'open_nlp/named_entity_detector'
17
+ require 'open_nlp/pos_tagger'
18
+ require 'open_nlp/sentence_detector'
19
+ require 'open_nlp/tokenizer'
20
+ require 'open_nlp/chunker'
data/open_nlp.gemspec ADDED
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'open_nlp/version'
5
+
6
# Gem packaging metadata. The file list comes from `git ls-files`, so the
# gem must be built from inside a git checkout.
Gem::Specification.new do |gem|
  gem.name          = "open_nlp"
  gem.version       = OpenNlp::VERSION
  gem.authors       = ["Hck"]
  gem.description   = %q{JRuby tools wrapper for Apache OpenNLP}
  gem.summary       = %q{A JRuby wrapper for the Apache OpenNLP tools library}
  # LICENSE.txt ships the MIT license; declare it so it appears in the
  # published metadata.
  gem.licenses      = ["MIT"]

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # The bundled JARs are loaded through JRuby's Java integration.
  gem.platform      = "java"

  # The spec suite is written with RSpec; the Gemfile pulls its
  # dependencies from this gemspec.
  gem.add_development_dependency "rspec"
end
@@ -0,0 +1,38 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::Chunker: construction and chunking of plain strings.
describe OpenNlp::Chunker do
  let(:chunker_class) { OpenNlp::Chunker }

  let(:chunker_model_path) { File.join(FIXTURES_DIR, "en-chunker.bin") }
  let(:token_model_path)   { File.join(FIXTURES_DIR, "en-token.bin") }
  let(:pos_model_path)     { File.join(FIXTURES_DIR, "en-pos-maxent.bin") }

  let(:model)       { OpenNlp::Model::Chunker.new(chunker_model_path) }
  let(:token_model) { OpenNlp::Model::Tokenizer.new(token_model_path) }
  let(:pos_model)   { OpenNlp::Model::POSTagger.new(pos_model_path) }

  describe "initialization" do
    it "should initialize a new chunker" do
      instance = chunker_class.new(model, token_model, pos_model)

      instance.should be_a(chunker_class)
    end

    it "should raise an argument error when no model is supplied" do
      expect { chunker_class.new(nil, nil, nil) }.to raise_error(ArgumentError)
    end
  end

  describe "chunking a string" do
    let(:chunker) { chunker_class.new(model, token_model, pos_model) }

    it "should chunk an empty string" do
      chunker.chunk("").should == []
    end

    it "should chunk a sentence" do
      expected = [%w(The red fox sleeps), %w(soundly)]

      chunker.chunk("The red fox sleeps soundly.").should == expected
    end

    it "should raise an error when not passed a string" do
      expect { chunker.chunk(nil) }.to raise_error(ArgumentError)
    end
  end
end
Binary file
@@ -0,0 +1,107 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+
3
+ <!--
4
+ Licensed to the Apache Software Foundation (ASF) under one
5
+ or more contributor license agreements. See the NOTICE file
6
+ distributed with this work for additional information
7
+ regarding copyright ownership. The ASF licenses this file
8
+ to you under the Apache License, Version 2.0 (the
9
+ "License"); you may not use this file except in compliance
10
+ with the License. You may obtain a copy of the License at
11
+
12
+ http://www.apache.org/licenses/LICENSE-2.0
13
+
14
+ Unless required by applicable law or agreed to in writing,
15
+ software distributed under the License is distributed on an
16
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17
+ KIND, either express or implied. See the License for the
18
+ specific language governing permissions and limitations
19
+ under the License.
20
+ -->
21
+
22
+ <dictionary>
23
+ <entry operation="RIGHT_LEFT_MATCHING">
24
+ <token>"</token>
25
+ </entry>
26
+ <entry operation="RIGHT_LEFT_MATCHING">
27
+ <token>'</token>
28
+ </entry>
29
+ <entry operation="MOVE_LEFT">
30
+ <token>.</token>
31
+ </entry>
32
+ <entry operation="MOVE_LEFT">
33
+ <token>?</token>
34
+ </entry>
35
+ <entry operation="MOVE_LEFT">
36
+ <token>!</token>
37
+ </entry>
38
+ <entry operation="MOVE_LEFT">
39
+ <token>,</token>
40
+ </entry>
41
+ <entry operation="MOVE_LEFT">
42
+ <token>;</token>
43
+ </entry>
44
+ <entry operation="MOVE_LEFT">
45
+ <token>:</token>
46
+ </entry>
47
+ <entry operation="MOVE_RIGHT">
48
+ <token>(</token>
49
+ </entry>
50
+ <entry operation="MOVE_LEFT">
51
+ <token>)</token>
52
+ </entry>
53
+ <entry operation="MOVE_LEFT">
54
+ <token>}</token>
55
+ </entry>
56
+ <entry operation="MOVE_RIGHT">
57
+ <token>{</token>
58
+ </entry>
59
+ <entry operation="MOVE_LEFT">
60
+ <token>]</token>
61
+ </entry>
62
+ <entry operation="MOVE_RIGHT">
63
+ <token>[</token>
64
+ </entry>
65
+ <entry operation="MOVE_RIGHT">
66
+ <token>``</token>
67
+ </entry>
68
+ <entry operation="MOVE_LEFT">
69
+ <token>''</token>
70
+ </entry>
71
+ <entry operation="MOVE_LEFT">
72
+ <token>%</token>
73
+ </entry>
74
+ <entry operation="MOVE_LEFT">
75
+ <token>n't</token>
76
+ </entry>
77
+ <entry operation="MOVE_LEFT">
78
+ <token>'ve</token>
79
+ </entry>
80
+ <entry operation="MOVE_LEFT">
81
+ <token>'d</token>
82
+ </entry>
83
+ <entry operation="MOVE_LEFT">
84
+ <token>'ll</token>
85
+ </entry>
86
+ <entry operation="MOVE_LEFT">
87
+ <token>'s</token>
88
+ </entry>
89
+ <entry operation="MOVE_LEFT">
90
+ <token>'re</token>
91
+ </entry>
92
+ <entry operation="MOVE_LEFT">
93
+ <token>'m</token>
94
+ </entry>
95
+ <entry operation="MOVE_LEFT">
96
+ <token>.org</token>
97
+ </entry>
98
+ <entry operation="MOVE_LEFT">
99
+ <token>.com</token>
100
+ </entry>
101
+ <entry operation="MOVE_LEFT">
102
+ <token>.net</token>
103
+ </entry>
104
+ <entry operation="MOVE_RIGHT">
105
+ <token>#</token>
106
+ </entry>
107
+ </dictionary>
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::Model::Chunker: accepted constructor argument types.
describe OpenNlp::Model::Chunker do
  let(:model_class) { OpenNlp::Model::Chunker }
  let(:model_file_name) { File.join(FIXTURES_DIR, "en-chunker.bin") }

  it "should accept a string filename parameter" do
    loaded = model_class.new(model_file_name)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should accept a java.io.FileInputStream object" do
    stream = java.io.FileInputStream.new(model_file_name)
    loaded = model_class.new(stream)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should raise an argument error otherwise" do
    expect { model_class.new(nil) }.to raise_error(ArgumentError)
  end
end
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::Model::Detokenizer: accepted constructor argument types.
describe OpenNlp::Model::Detokenizer do
  let(:model_class) { OpenNlp::Model::Detokenizer }
  let(:model_file_name) { File.join(FIXTURES_DIR, "en-detokenizer.xml") }

  it "should accept a string filename parameter" do
    loaded = model_class.new(model_file_name)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should accept a java.io.FileInputStream object" do
    stream = java.io.FileInputStream.new(model_file_name)
    loaded = model_class.new(stream)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should raise an argument error otherwise" do
    expect { model_class.new(nil) }.to raise_error(ArgumentError)
  end
end
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::Model::NamedEntityDetector: accepted constructor argument types.
describe OpenNlp::Model::NamedEntityDetector do
  let(:model_class) { OpenNlp::Model::NamedEntityDetector }
  let(:model_file_name) { File.join(FIXTURES_DIR, "en-ner-time.bin") }

  it "should accept a string filename parameter" do
    loaded = model_class.new(model_file_name)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should accept a java.io.FileInputStream object" do
    stream = java.io.FileInputStream.new(model_file_name)
    loaded = model_class.new(stream)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should raise an argument error otherwise" do
    expect { model_class.new(nil) }.to raise_error(ArgumentError)
  end
end
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::Model::POSTagger: accepted constructor argument types.
describe OpenNlp::Model::POSTagger do
  let(:model_class) { OpenNlp::Model::POSTagger }
  let(:model_file_name) { File.join(FIXTURES_DIR, "en-pos-maxent.bin") }

  it "should accept a string filename parameter" do
    loaded = model_class.new(model_file_name)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should accept a java.io.FileInputStream object" do
    stream = java.io.FileInputStream.new(model_file_name)
    loaded = model_class.new(stream)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should raise an argument error otherwise" do
    expect { model_class.new(nil) }.to raise_error(ArgumentError)
  end
end
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::Model::SentenceDetector: accepted constructor argument types.
describe OpenNlp::Model::SentenceDetector do
  let(:model_class) { OpenNlp::Model::SentenceDetector }
  let(:model_file_name) { File.join(FIXTURES_DIR, "en-sent.bin") }

  it "should accept a string filename parameter" do
    loaded = model_class.new(model_file_name)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should accept a java.io.FileInputStream object" do
    stream = java.io.FileInputStream.new(model_file_name)
    loaded = model_class.new(stream)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should raise an argument error otherwise" do
    expect { model_class.new(nil) }.to raise_error(ArgumentError)
  end
end
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::Model::Tokenizer: accepted constructor argument types.
describe OpenNlp::Model::Tokenizer do
  let(:model_class) { OpenNlp::Model::Tokenizer }
  let(:model_file_name) { File.join(FIXTURES_DIR, "en-token.bin") }

  it "should accept a string filename parameter" do
    loaded = model_class.new(model_file_name)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should accept a java.io.FileInputStream object" do
    stream = java.io.FileInputStream.new(model_file_name)
    loaded = model_class.new(stream)

    loaded.should be_a(model_class)
    loaded.j_model.should be_a(model_class.java_class_name)
  end

  it "should raise an argument error otherwise" do
    expect { model_class.new(nil) }.to raise_error(ArgumentError)
  end
end
@@ -0,0 +1,42 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::NamedEntityDetector: construction and span detection.
describe OpenNlp::NamedEntityDetector do
  let(:detector_class) { OpenNlp::NamedEntityDetector }

  let(:model_path) { File.join(FIXTURES_DIR, "en-ner-time.bin") }
  let(:model)      { OpenNlp::Model::NamedEntityDetector.new(model_path) }

  describe "initialization" do
    it "should initialize with a valid model" do
      instance = detector_class.new(model)

      instance.should be_a(detector_class)
    end

    it "should raise an ArgumentError otherwise" do
      expect { detector_class.new(nil) }.to raise_error(ArgumentError)
    end
  end

  describe "detection" do
    let(:ne_detector) { detector_class.new(model) }

    it "should detect nothing in an empty sentence" do
      found = ne_detector.detect([])

      found.should be_a(Array)
      found.length.should == 0
    end

    it "should detect the named entities" do
      found = ne_detector.detect(%w(The time is 10 : 23 am))
      found.should be_a(Array)

      first_span = found[0]
      first_span.should be_a(Java::opennlp.tools.util.Span)
      first_span.getStart.should == 3
      first_span.getEnd.should == 7
    end

    it "should raise an error if anything but an array is passed" do
      [nil, 'str', 111].each do |invalid|
        expect { ne_detector.detect(invalid) }.to raise_error(ArgumentError)
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::POSTagger: construction and tagging of strings and token lists.
describe OpenNlp::POSTagger do
  let(:tagger_class) { OpenNlp::POSTagger }

  let(:model_path) { File.join(FIXTURES_DIR, "en-pos-maxent.bin") }
  let(:model)      { OpenNlp::Model::POSTagger.new(model_path) }

  describe "initialization" do
    it "should initialize with a valid model" do
      instance = tagger_class.new(model)

      instance.should be_a(tagger_class)
      instance.j_instance.should be_a(tagger_class.java_class)
    end

    it "should raise an ArgumentError without a valid model" do
      expect { tagger_class.new(nil) }.to raise_error(ArgumentError)
    end
  end

  describe "pos tagging" do
    let(:pos_tagger) { tagger_class.new(model) }

    it "should tag parts of a provided document" do
      expected = "The/DT quick/JJ brown/JJ fox/NN jumps/NNS over/IN the/DT lazy/JJ dog./NN"

      pos_tagger.tag("The quick brown fox jumps over the lazy dog.").should == expected
    end

    it "should tag provided tokens" do
      tokens = %w(The quick brown fox jumps over the lazy dog .)
      result = pos_tagger.tag(tokens)

      result.to_ary.should == %w(DT JJ JJ NN NNS IN DT JJ NN .)
    end

    it "should raise an ArgumentError for a non-string" do
      expect { pos_tagger.tag(nil) }.to raise_error(ArgumentError)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::SentenceDetector: construction and sentence splitting.
describe OpenNlp::SentenceDetector do
  let(:detector_class) { OpenNlp::SentenceDetector }

  let(:model_path) { File.join(FIXTURES_DIR, "en-sent.bin") }
  let(:model)      { OpenNlp::Model::SentenceDetector.new(model_path) }

  describe "initialization" do
    it "should initialize with a valid model" do
      instance = detector_class.new(model)

      instance.should be_a(detector_class)
      instance.j_instance.should be_a(detector_class.java_class)
    end

    it "should raise an ArgumentError without a valid model" do
      expect { detector_class.new(nil) }.to raise_error(ArgumentError)
    end
  end

  describe "sentence detection" do
    let(:sent_detector) { detector_class.new(model) }

    it "should detect no sentences in an empty string" do
      sent_detector.detect("").should == []
    end

    it "should detect sentences in a string" do
      expected = ["The sky is blue.", "The Grass is green."]

      sent_detector.detect("The sky is blue. The Grass is green.").should == expected
    end

    it "should raise an ArgumentError for a non-string" do
      expect { sent_detector.detect(nil) }.to raise_error(ArgumentError)
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ require 'java'
3
+ require 'open_nlp'
4
+
5
+ FIXTURES_DIR = File.join(File.dirname(__FILE__), "fixtures")
@@ -0,0 +1,36 @@
1
+ require "spec_helper"
2
+
3
# Specs for OpenNlp::Tokenizer: construction and tokenization of plain strings.
describe OpenNlp::Tokenizer do
  let(:tokenizer_class) { OpenNlp::Tokenizer }

  let(:model_path) { File.join(FIXTURES_DIR, "en-token.bin") }
  let(:model)      { OpenNlp::Model::Tokenizer.new(model_path) }

  describe "initialization" do
    it "should initialize a new tokenizer" do
      instance = tokenizer_class.new(model)

      instance.should be_a(tokenizer_class)
    end

    it "should raise an argument error when no model is supplied" do
      expect { tokenizer_class.new(nil) }.to raise_error(ArgumentError)
    end
  end

  describe "tokenize a string" do
    let(:tokenizer) { tokenizer_class.new(model) }

    it "should tokenize an empty string" do
      tokenizer.tokenize("").should == []
    end

    it "should tokenize a sentence" do
      expected = %w(The red fox sleeps soundly .)

      tokenizer.tokenize("The red fox sleeps soundly.").should == expected
    end

    it "should raise an error when not passed a string" do
      expect { tokenizer.tokenize(nil) }.to raise_error(ArgumentError)
    end
  end
end
metadata ADDED
@@ -0,0 +1,105 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: open_nlp
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: java
7
+ authors:
8
+ - Hck
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-21 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: JRuby tools wrapper for Apache OpenNLP
15
+ email:
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - .gitignore
21
+ - Gemfile
22
+ - LICENSE.txt
23
+ - README.md
24
+ - Rakefile
25
+ - lib/open_nlp.rb
26
+ - lib/open_nlp/chunker.rb
27
+ - lib/open_nlp/model.rb
28
+ - lib/open_nlp/model/chunker.rb
29
+ - lib/open_nlp/model/detokenizer.rb
30
+ - lib/open_nlp/model/named_entity_detector.rb
31
+ - lib/open_nlp/model/pos_tagger.rb
32
+ - lib/open_nlp/model/sentence_detector.rb
33
+ - lib/open_nlp/model/tokenizer.rb
34
+ - lib/open_nlp/named_entity_detector.rb
35
+ - lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar
36
+ - lib/open_nlp/opennlp-tools-1.5.2-incubating.jar
37
+ - lib/open_nlp/pos_tagger.rb
38
+ - lib/open_nlp/sentence_detector.rb
39
+ - lib/open_nlp/tokenizer.rb
40
+ - lib/open_nlp/tool.rb
41
+ - lib/open_nlp/version.rb
42
+ - open_nlp.gemspec
43
+ - spec/chunker_spec.rb
44
+ - spec/fixtures/en-chunker.bin
45
+ - spec/fixtures/en-detokenizer.xml
46
+ - spec/fixtures/en-ner-time.bin
47
+ - spec/fixtures/en-pos-maxent.bin
48
+ - spec/fixtures/en-sent.bin
49
+ - spec/fixtures/en-token.bin
50
+ - spec/model/chunker_spec.rb
51
+ - spec/model/detokenizer_spec.rb
52
+ - spec/model/named_entity_detector_spec.rb
53
+ - spec/model/pos_tagger_spec.rb
54
+ - spec/model/sentence_detector_spec.rb
55
+ - spec/model/tokenizer_spec.rb
56
+ - spec/named_entity_detector_spec.rb
57
+ - spec/pos_tagger_spec.rb
58
+ - spec/sentence_detector_spec.rb
59
+ - spec/spec_helper.rb
60
+ - spec/tokenizer_spec.rb
61
+ homepage:
62
+ licenses: []
63
+ post_install_message:
64
+ rdoc_options: []
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: !binary |-
72
+ MA==
73
+ none: false
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ! '>='
77
+ - !ruby/object:Gem::Version
78
+ version: !binary |-
79
+ MA==
80
+ none: false
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 1.8.24
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: A JRuby wrapper for the Apache OpenNLP tools library
87
+ test_files:
88
+ - spec/chunker_spec.rb
89
+ - spec/fixtures/en-chunker.bin
90
+ - spec/fixtures/en-detokenizer.xml
91
+ - spec/fixtures/en-ner-time.bin
92
+ - spec/fixtures/en-pos-maxent.bin
93
+ - spec/fixtures/en-sent.bin
94
+ - spec/fixtures/en-token.bin
95
+ - spec/model/chunker_spec.rb
96
+ - spec/model/detokenizer_spec.rb
97
+ - spec/model/named_entity_detector_spec.rb
98
+ - spec/model/pos_tagger_spec.rb
99
+ - spec/model/sentence_detector_spec.rb
100
+ - spec/model/tokenizer_spec.rb
101
+ - spec/named_entity_detector_spec.rb
102
+ - spec/pos_tagger_spec.rb
103
+ - spec/sentence_detector_spec.rb
104
+ - spec/spec_helper.rb
105
+ - spec/tokenizer_spec.rb