open-nlp 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +5 -8
- data/bin/Utils.java +1 -0
- data/bin/utils.jar +0 -0
- data/lib/open-nlp.rb +20 -151
- data/lib/open-nlp/base.rb +49 -0
- data/lib/open-nlp/bindings.rb +133 -0
- data/lib/open-nlp/classes.rb +77 -0
- data/lib/open-nlp/config.rb +2 -2
- data/spec/english_spec.rb +24 -15
- metadata +34 -14
data/README.md
CHANGED
@@ -1,9 +1,4 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
This is an alpha release. Expect things to break and/or change in the near future! Also, keep the following in mind:
|
4
|
-
|
5
|
-
- Currently, this gem is only tested on JRuby, but support for MRI through Rjb is coming very soon.
|
6
|
-
- Currently, the parser and chunker classes are not working.
|
1
|
+
[![Build Status](https://secure.travis-ci.org/louismullie/open-nlp.png)](http://travis-ci.org/louismullie/open-nlp)
|
7
2
|
|
8
3
|
**About**
|
9
4
|
|
@@ -13,9 +8,11 @@ This gem only provides a thin wrapper over the OpenNLP API. If you are looking f
|
|
13
8
|
|
14
9
|
**Installing**
|
15
10
|
|
11
|
+
_Note: If you are running on MRI, this gem will use the Ruby-Java Bridge (Rjb), which currently does not support Java 7. Therefore, if you have installed Java 7, you should set your JAVA_HOME to point to your old Java 6 install before installing Rjb; for example, `export "JAVA_HOME=/usr/lib/jvm/java-6-openjdk/"`._
|
12
|
+
|
16
13
|
First, install the gem: `gem install open-nlp`. Then, individually download the appropriate models from the [open-nlp website](http://opennlp.sourceforge.net/models-1.5/) or just get [all English language models](http://louismullie.com/treat/open-nlp-english.zip) in one package (80 MB).
|
17
14
|
|
18
|
-
Place the contents of the extracted archive inside the /bin/ folder of the
|
15
|
+
Place the contents of the extracted archive inside the /bin/ folder of the open-nlp gem (e.g. [...]/gems/open-nlp-0.x.x/bin/).
|
19
16
|
|
20
17
|
**Configuration**
|
21
18
|
|
@@ -43,7 +40,7 @@ OpenNLP.log_file = 'log.txt'
|
|
43
40
|
# OpenNLP.use(:french) # or :german
|
44
41
|
#
|
45
42
|
# Change a specific model file.
|
46
|
-
#
|
43
|
+
# OpenNLP.set_model('pos.model', 'english-left3words-distsim.tagger')
|
47
44
|
```
|
48
45
|
|
49
46
|
**Using the gem**
|
data/bin/Utils.java
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
import java.util.Arrays;
|
2
|
|
1
3
|
public static String[] tagWithArrayList(POSTagger posTagger, ArrayList[] objectArray) {
|
2
4
|
return posTagger.tag(getStringArray(objectArray));
|
3
5
|
}
|
4
6
|
public static Object[] findWithArrayList(NameFinderME nameFinder, ArrayList[] tokens) {
|
5
7
|
return nameFinder.find(getStringArray(tokens));
|
6
8
|
}
|
7
9
|
public static Object[] chunkWithArrays(ChunkerME chunker, ArrayList[] tokens, ArrayList[] tags) {
|
8
10
|
return chunker.chunk(getStringArray(tokens), getStringArray(tags));
|
9
11
|
}
|
10
12
|
public static String[] getStringArray(ArrayList[] objectArray) {
|
11
13
|
String[] stringArray = Arrays.copyOf(objectArray, objectArray.length, String[].class);
|
12
14
|
}
|
data/bin/utils.jar
ADDED
Binary file
|
data/lib/open-nlp.rb
CHANGED
@@ -1,162 +1,31 @@
|
|
1
1
|
module OpenNLP
|
2
|
-
|
3
|
-
# Library version.
|
4
|
-
VERSION = '0.0.1'
|
5
2
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
# ############################ #
|
10
|
-
# BindIt Configuration Options #
|
11
|
-
# ############################ #
|
12
|
-
|
13
|
-
require 'bind-it'
|
14
|
-
extend BindIt::Binding
|
15
|
-
|
16
|
-
# The path in which to look for JAR files, with
|
17
|
-
# a trailing slash (default is gem's bin folder).
|
18
|
-
self.jar_path = File.dirname(__FILE__) + '/../bin/'
|
19
|
-
|
20
|
-
# Load the JVM with a minimum heap size of 512MB,
|
21
|
-
# and a maximum heap size of 1024MB.
|
22
|
-
self.jvm_args = ['-Xms512M', '-Xmx1024M']
|
23
|
-
|
24
|
-
# Turn logging off by default.
|
25
|
-
self.log_file = nil
|
26
|
-
|
27
|
-
# Default JARs to load.
|
28
|
-
self.default_jars = [
|
29
|
-
'jwnl-1.3.3.jar',
|
30
|
-
'opennlp-tools-1.5.2-incubating.jar',
|
31
|
-
'opennlp-maxent-3.0.2-incubating.jar',
|
32
|
-
'opennlp-uima-1.5.2-incubating.jar'
|
33
|
-
]
|
34
|
-
|
35
|
-
# Default namespace.
|
36
|
-
self.default_namespace = 'opennlp.tools'
|
37
|
-
|
38
|
-
# Default classes.
|
39
|
-
self.default_classes = [
|
40
|
-
['DocumentCategorizerME', 'opennlp.tools.doccat'],
|
41
|
-
['ChunkerME', 'opennlp.tools.chunker'],
|
42
|
-
['DictionaryDetokenizer', 'opennlp.tools.tokenize'],
|
43
|
-
['NameFinderME', 'opennlp.tools.namefind'],
|
44
|
-
['Parse', 'opennlp.tools.parser'],
|
45
|
-
['ParserFactory', 'opennlp.tools.parser'],
|
46
|
-
['POSTaggerME', 'opennlp.tools.postag'],
|
47
|
-
['SentenceDetectorME', 'opennlp.tools.sentdetect'],
|
48
|
-
['SimpleTokenizer', 'opennlp.tools.tokenize'],
|
49
|
-
['TokenizerME', 'opennlp.tools.tokenize']
|
50
|
-
]
|
51
|
-
|
52
|
-
# Redefine the Bind-It class loader to redefine
|
53
|
-
# a new constructor for classes that require a model.
|
54
|
-
def self.load_klass(klass, base, name = nil)
|
55
|
-
super(klass,base,name)
|
56
|
-
requires_model = OpenNLP::Config::RequiresModel
|
57
|
-
return unless requires_model.include?(klass)
|
58
|
-
new_class = Class.new(const_get(klass)) do
|
59
|
-
def initialize(file = nil, *args)
|
60
|
-
klass = OpenNLP.last_name(self.class)
|
61
|
-
if !file && !OpenNLP.has_default_model?(klass)
|
62
|
-
raise 'This class intentionally has no default ' +
|
63
|
-
'model. Please supply a file name as an argument ' +
|
64
|
-
'to the class constructor.'
|
65
|
-
else
|
66
|
-
model = OpenNLP.get_model(klass, file)
|
67
|
-
super(*([model] + args))
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
remove_const(klass)
|
72
|
-
const_set(klass, new_class)
|
73
|
-
end
|
74
|
-
|
75
|
-
# Make the bindings.
|
76
|
-
self.bind
|
77
|
-
|
78
|
-
# Load utility classes.
|
79
|
-
self.load_class('FileInputStream', 'java.io')
|
80
|
-
|
81
|
-
# ############################ #
|
82
|
-
# OpenNLP bindings proper #
|
83
|
-
# ############################ #
|
3
|
+
# Library version.
|
4
|
+
VERSION = '0.1.0'
|
84
5
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
# A hash containing the names of loaded models.
|
89
|
-
attr_accessor :model_files
|
90
|
-
# The folder in which to look for models.
|
91
|
-
attr_accessor :model_path
|
92
|
-
# Store the language currently being used.
|
93
|
-
attr_accessor :language
|
94
|
-
end
|
95
|
-
|
96
|
-
# The loaded models.
|
97
|
-
self.models = {}
|
98
|
-
|
99
|
-
# The names of loaded models.
|
100
|
-
self.model_files = {}
|
101
|
-
|
102
|
-
# The path to the main folder containing the folders
|
103
|
-
# with the individual models inside. By default, this
|
104
|
-
# is the same as the JAR path.
|
105
|
-
self.model_path = self.jar_path
|
6
|
+
# Require Java bindings.
|
7
|
+
require 'open-nlp/bindings'
|
8
|
+
OpenNLP::Bindings.bind
|
106
9
|
|
107
|
-
#
|
108
|
-
|
10
|
+
# Require Ruby wrappers.
|
11
|
+
require 'open-nlp/classes'
|
109
12
|
|
110
|
-
#
|
111
|
-
|
112
|
-
|
13
|
+
# Load a Java class into the OpenNLP
|
14
|
+
# namespace (e.g. OpenNLP::Loaded).
|
15
|
+
def load_class(*args)
|
16
|
+
OpenNLP::Bindings.load_class(*args)
|
113
17
|
end
|
114
18
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
def self.has_default_model?(klass)
|
120
|
-
name = OpenNLP::Config::ClassToName[klass]
|
121
|
-
if !OpenNLP::Config::DefaultModels[name]
|
122
|
-
raise 'No default model files are available ' +
|
123
|
-
"for the class #{klass}. Please supply a model " +
|
124
|
-
'as an argument to the constructor.'
|
125
|
-
end
|
126
|
-
!OpenNLP::Config::DefaultModels[name].empty?
|
19
|
+
# Forwards the handling of missing
|
20
|
+
# constants to the Bindings class.
|
21
|
+
def const_missing(const)
|
22
|
+
OpenNLP::Bindings.const_get(const)
|
127
23
|
end
|
128
24
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
'constructor. Please supply a model file ' +
|
134
|
-
'or call OpenNLP.use(:some_language), to ' +
|
135
|
-
'load the default models for a language.'
|
136
|
-
end
|
137
|
-
OpenNLP.load_model(name, file)
|
138
|
-
model = OpenNLP.models[name]
|
139
|
-
end
|
140
|
-
|
141
|
-
def self.load_model(name, file = nil)
|
142
|
-
if self.models[name] && file ==
|
143
|
-
self.model_files[name]
|
144
|
-
return self.models[name]
|
145
|
-
end
|
146
|
-
models = Config::DefaultModels[name]
|
147
|
-
file ||= models[self.language]
|
148
|
-
path = self.model_path + file
|
149
|
-
stream = FileInputStream.new(path)
|
150
|
-
klass = Config::NameToClass[name]
|
151
|
-
load_class(*klass)
|
152
|
-
klass = const_get(klass[0])
|
153
|
-
model = klass.new(stream)
|
154
|
-
self.model_files[name] = file
|
155
|
-
self.models[name] = model
|
25
|
+
# Forward the handling of missing
|
26
|
+
# methods to the Bindings class.
|
27
|
+
def method_missing(sym, *args, &block)
|
28
|
+
OpenNLP::Bindings.send(sym, *args, &block)
|
156
29
|
end
|
157
30
|
|
158
|
-
|
159
|
-
klass.to_s.split('::')[-1]
|
160
|
-
end
|
161
|
-
|
162
|
-
end
|
31
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Common superclass for the Ruby wrappers declared under OpenNLP.
#
# A wrapper looks up the class with its own unqualified name in
# OpenNLP::Bindings, instantiates it, and forwards every method it
# does not define itself to that instance.
class OpenNLP::Base

  # Build the wrapped instance.
  #
  # file_or_arg - for classes listed in OpenNLP::Config::RequiresModel,
  #               the model file passed to OpenNLP::Bindings.get_model
  #               (may be nil); for every other class, the leading
  #               constructor argument(s).
  # args        - remaining arguments, passed on to the bound class.
  #
  # Raises RuntimeError when a model is required, no file was given
  # and no default model is configured for the class.
  def initialize(file_or_arg=nil, *args)

    @proxy_class = OpenNLP::Bindings.const_get(last_name)

    if requires_model?
      if !file_or_arg && !has_default_model?
        # Trailing spaces keep the concatenated pieces from running
        # together ("...a model asan argument...").
        raise "No default model files are available for " +
        "class #{last_name}. Please supply a model as " +
        "an argument to the constructor."
      end
      @model = OpenNLP::Bindings.get_model(last_name, file_or_arg)
      @proxy_inst = @proxy_class.new(*([@model] + args))
    else
      @proxy_inst = @proxy_class.new(*([*file_or_arg] + args))
    end

  end

  # True when OpenNLP::Config::DefaultModels holds a non-empty entry
  # for this class. A missing entry counts as "no default model"
  # instead of failing with NoMethodError on nil.
  def has_default_model?
    name = OpenNLP::Config::ClassToName[last_name]
    defaults = OpenNLP::Config::DefaultModels[name]
    !(defaults.nil? || defaults.empty?)
  end

  # True when this class is listed in OpenNLP::Config::RequiresModel.
  def requires_model?
    OpenNLP::Config::RequiresModel.include?(last_name)
  end

  # The class name without its namespace (e.g. "TokenizerME").
  def last_name
    self.class.to_s.split('::')[-1]
  end

  # Forward unknown methods to the wrapped instance.
  def method_missing(sym, *args, &block)
    @proxy_inst.send(sym, *args, &block)
  end

  # Keep respond_to? in line with method_missing: report what the
  # wrapped instance itself claims to respond to.
  def respond_to_missing?(sym, include_private = false)
    (@proxy_inst && @proxy_inst.respond_to?(sym, include_private)) || super
  end

  protected

  # Copy tokens into a new bound ArrayList, converting each element
  # with to_s and wrapping it in a bound String.
  def get_list(tokens)
    list = OpenNLP::Bindings::ArrayList.new
    tokens.each do |t|
      list.add(OpenNLP::Bindings::String.new(t.to_s))
    end
    list
  end

end
|
@@ -0,0 +1,133 @@
|
|
1
|
+
# Java bindings for OpenNLP, configured through BindIt, together
# with a cache of the model objects loaded so far.
module OpenNLP::Bindings

  # Require configuration.
  require 'open-nlp/config'

  # ############################ #
  # BindIt Configuration Options #
  # ############################ #

  require 'bind-it'
  extend BindIt::Binding

  # The path in which to look for JAR files, with
  # a trailing slash (default is gem's bin folder).
  self.jar_path = File.dirname(__FILE__) + '/../../bin/'

  # Load the JVM with a minimum heap size of 512MB,
  # and a maximum heap size of 1024MB.
  self.jvm_args = ['-Xms512M', '-Xmx1024M']

  # Turn logging off by default.
  self.log_file = nil

  # Default JARs to load.
  self.default_jars = [
    'jwnl-1.3.3.jar',
    'opennlp-tools-1.5.2-incubating.jar',
    'opennlp-maxent-3.0.2-incubating.jar',
    'opennlp-uima-1.5.2-incubating.jar'
  ]

  # Default namespace.
  self.default_namespace = 'opennlp.tools'

  # Default classes.
  self.default_classes = [
    ['AbstractBottomUpParser', 'opennlp.tools.parser'],
    ['DocumentCategorizerME', 'opennlp.tools.doccat'],
    ['ChunkerME', 'opennlp.tools.chunker'],
    ['DictionaryDetokenizer', 'opennlp.tools.tokenize'],
    ['NameFinderME', 'opennlp.tools.namefind'],
    ['Parser', 'opennlp.tools.parser.chunking'],
    ['Parse', 'opennlp.tools.parser'],
    ['ParserFactory', 'opennlp.tools.parser'],
    ['POSTaggerME', 'opennlp.tools.postag'],
    ['SentenceDetectorME', 'opennlp.tools.sentdetect'],
    ['SimpleTokenizer', 'opennlp.tools.tokenize'],
    ['Span', 'opennlp.tools.util'],
    ['TokenizerME', 'opennlp.tools.tokenize']
  ]

  # Add in Rjb workarounds.
  unless RUBY_PLATFORM =~ /java/
    self.default_jars << 'utils.jar'
    self.default_classes << ['Utils', '']
  end

  # Make the bindings.
  self.bind

  # Load utility classes.
  self.load_class('FileInputStream', 'java.io')
  self.load_class('String', 'java.lang')
  self.load_class('ArrayList', 'java.util')

  # ############################ #
  # OpenNLP bindings proper      #
  # ############################ #

  class <<self
    # A hash containing loaded models.
    attr_accessor :models
    # A hash containing the names of loaded models.
    attr_accessor :model_files
    # The folder in which to look for models.
    attr_accessor :model_path
    # Store the language currently being used.
    attr_accessor :language
  end

  # The loaded models.
  self.models = {}

  # The names of loaded models.
  self.model_files = {}

  # The path to the main folder containing the folders
  # with the individual models inside. By default, this
  # is the same as the JAR path.
  self.model_path = self.jar_path

  # Default the language to English.
  self.language = :english

  # Use a given language for default models.
  def self.use(language)
    self.language = language
  end

  # Return the model object for a wrapper class, loading it if needed.
  #
  # klass - unqualified class name, mapped to a model name through
  #         OpenNLP::Config::ClassToName.
  # file  - model file name relative to model_path; nil selects the
  #         default model for the current language.
  #
  # Raises RuntimeError when neither a language nor a file is set.
  def self.get_model(klass, file=nil)
    name = OpenNLP::Config::ClassToName[klass]
    if !self.language && !file
      raise 'No model file was supplied to the ' +
      'constructor. Please supply a model file ' +
      'or call OpenNLP.use(:some_language), to ' +
      'load the default models for a language.'
    end
    self.load_model(name, file)
    self.models[name]
  end

  # Placeholder: always raises 'Not implemented.'. Any arguments are
  # accepted so that the documented call form reaches this message
  # instead of failing with ArgumentError.
  def self.set_model(*_args)
    raise 'Not implemented.'
  end

  # Load the model registered under name and cache it.
  #
  # name - model name, a key of OpenNLP::Config::DefaultModels and
  #        OpenNLP::Config::NameToClass.
  # file - model file name relative to model_path; nil selects the
  #        default model for the current language.
  #
  # Returns the model object, reusing the cached one when the same
  # file is already loaded under this name.
  def self.load_model(name, file = nil)
    # Resolve the default file name before consulting the cache: a
    # nil file never equals the stored file name, so default models
    # used to be read from disk again on every call.
    file ||= OpenNLP::Config::DefaultModels[name][self.language]
    cached = self.models[name]
    return cached if cached && file == self.model_files[name]

    path = self.model_path + file
    stream = FileInputStream.new(path)
    begin
      klass = OpenNLP::Config::NameToClass[name]
      load_class(*klass) unless const_defined?(klass[0])
      model = const_get(klass[0]).new(stream)
    ensure
      # Release the file handle whether or not loading succeeded.
      stream.close
    end
    self.model_files[name] = file
    self.models[name] = model
  end

end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'open-nlp/base'
|
2
|
+
|
3
|
+
# Wrapper with no methods of its own; everything comes from OpenNLP::Base.
class OpenNLP::SentenceDetectorME < OpenNLP::Base
end
|
4
|
+
|
5
|
+
# Wrapper with no methods of its own; everything comes from OpenNLP::Base.
class OpenNLP::SimpleTokenizer < OpenNLP::Base
end
|
6
|
+
|
7
|
+
# Wrapper with no methods of its own; everything comes from OpenNLP::Base.
class OpenNLP::TokenizerME < OpenNLP::Base
end
|
8
|
+
|
9
|
+
# Wrapper around the bound POSTaggerME class.
class OpenNLP::POSTaggerME < OpenNLP::Base

  # Outside JRuby, tag is routed through Utils.tagWithArrayList;
  # on JRuby no override is defined and the call is forwarded by
  # OpenNLP::Base#method_missing.
  if RUBY_PLATFORM !~ /java/
    # Only the first argument is used; it is passed on unchanged.
    def tag(*args)
      sentence = args[0]
      OpenNLP::Bindings::Utils.tagWithArrayList(@proxy_inst, sentence)
    end
  end

end
|
18
|
+
|
19
|
+
# Wrapper around the bound ChunkerME class, with one chunk
# implementation per Ruby platform.
class OpenNLP::ChunkerME < OpenNLP::Base

  if RUBY_PLATFORM !~ /java/

    # Non-JRuby: delegate to Utils.chunkWithArrays and return each
    # element of its result converted with to_s.
    def chunk(tokens, tags)
      OpenNLP::Bindings::Utils
        .chunkWithArrays(@proxy_inst, tokens,tags)
        .map(&:to_s)
    end

  else

    # JRuby: convert both sequences to Java String arrays, call the
    # wrapped chunker and return its result as a Ruby array.
    def chunk(tokens, tags)
      # Both arguments are converted with to_a only when tokens is
      # not already an Array.
      unless tokens.is_a?(Array)
        tokens, tags = tokens.to_a, tags.to_a
      end
      java_tokens = tokens.to_java(:String)
      java_tags = tags.to_java(:String)
      @proxy_inst.chunk(java_tokens, java_tags).to_a
    end

  end

end
|
43
|
+
|
44
|
+
# Wrapper around the bound Parser class that accepts raw text.
class OpenNLP::Parser < OpenNLP::Base

  # Parse a piece of text.
  #
  # The text is tokenized with a TokenizerME using its default model.
  # A Parse object spanning the whole text is built, one child Parse
  # is inserted per token span, and the result is handed to the
  # wrapped parser.
  #
  # text - the text to parse.
  #
  # Returns whatever the wrapped parser's parse method returns.
  def parse(text)

    tokenizer = OpenNLP::TokenizerME.new
    full_span = OpenNLP::Bindings::Span.new(0, text.size)

    # NOTE(review): "INC" and "TK" are presumably OpenNLP's node type
    # labels for incomplete and token nodes - confirm against the
    # AbstractBottomUpParser constants.
    parse_obj = OpenNLP::Bindings::Parse.new(
      text, full_span, "INC", 1, 0)

    tokens = tokenizer.tokenize_pos(text)

    tokens.each_with_index do |tok,i|
      start, stop = tok.get_start, tok.get_end
      span = OpenNLP::Bindings::Span.new(start, stop)
      parse = OpenNLP::Bindings::Parse.new(text, span, "TK", 0, i)
      parse_obj.insert(parse)
    end

    @proxy_inst.parse(parse_obj)

  end

end
|
69
|
+
|
70
|
+
# Wrapper around the bound NameFinderME class.
class OpenNLP::NameFinderME < OpenNLP::Base

  # Outside JRuby, find is routed through Utils.findWithArrayList;
  # on JRuby no override is defined and the call is forwarded by
  # OpenNLP::Base#method_missing.
  if RUBY_PLATFORM !~ /java/
    # Only the first argument is used; it is passed on unchanged.
    def find(*args)
      sentence = args[0]
      OpenNLP::Bindings::Utils.findWithArrayList(@proxy_inst, sentence)
    end
  end

end
|
data/lib/open-nlp/config.rb
CHANGED
@@ -17,7 +17,7 @@ module OpenNLP::Config
|
|
17
17
|
'DocumentCategorizerME' => :categorizer,
|
18
18
|
'NameFinderME' => :name_finder,
|
19
19
|
'POSTaggerME' => :pos_tagger,
|
20
|
-
'
|
20
|
+
'Parser' => :parser,
|
21
21
|
'SentenceDetectorME' => :sentence_detector,
|
22
22
|
'TokenizerME' => :tokenizer,
|
23
23
|
}
|
@@ -48,7 +48,7 @@ module OpenNLP::Config
|
|
48
48
|
# Classes that require a model as first argument to constructor.
|
49
49
|
RequiresModel = [
|
50
50
|
'SentenceDetectorME', 'NameFinderME', 'DictionaryDetokenizer',
|
51
|
-
'TokenizerME', 'ChunkerME', 'POSTaggerME'
|
51
|
+
'TokenizerME', 'ChunkerME', 'POSTaggerME', 'Parser'
|
52
52
|
]
|
53
53
|
|
54
54
|
|
data/spec/english_spec.rb
CHANGED
@@ -3,7 +3,6 @@ require_relative 'spec_helper'
|
|
3
3
|
|
4
4
|
describe OpenNLP do
|
5
5
|
|
6
|
-
# Failing spec #1
|
7
6
|
context "the maximum entropy chunker is run after tokenization and POS tagging" do
|
8
7
|
it "should find the accurate chunks" do
|
9
8
|
|
@@ -15,28 +14,38 @@ describe OpenNLP do
|
|
15
14
|
tokens = tokenizer.tokenize(sent)
|
16
15
|
tags = tagger.tag(tokens)
|
17
16
|
|
18
|
-
chunks = chunker.chunk(tokens
|
19
|
-
|
20
|
-
|
17
|
+
chunks = chunker.chunk(tokens, tags)
|
18
|
+
|
19
|
+
chunks.to_a.should eql %w[B-NP I-NP B-PP B-NP I-NP B-VP I-VP B-PP B-NP I-NP O]
|
21
20
|
tokens.to_a.should eql %w[The death of the poet was kept from his poems .]
|
22
|
-
tags.should eql [
|
21
|
+
tags.to_a.should eql %w[DT NN IN DT NN VBD VBN IN PRP$ NNS .]
|
23
22
|
|
24
23
|
end
|
25
24
|
end
|
26
25
|
|
27
|
-
# Failing spec #2
|
28
26
|
context "the maximum entropy parser is run after tokenization" do
|
29
27
|
it "parses the text accurately" do
|
28
|
+
|
30
29
|
sent = "The death of the poet was kept from his poems."
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
30
|
+
parser = OpenNLP::Parser.new
|
31
|
+
parse = parser.parse(sent)
|
32
|
+
|
33
|
+
parse.get_text.should eql sent
|
34
|
+
|
35
|
+
parse.get_span.get_start.should eql 0
|
36
|
+
parse.get_span.get_end.should eql 46
|
37
|
+
parse.get_span.get_type.should eql nil # ?
|
38
|
+
parse.get_child_count.should eql 1
|
39
|
+
|
40
|
+
child = parse.get_children[0]
|
41
|
+
|
42
|
+
child.text.should eql "The death of the poet was kept from his poems."
|
43
|
+
child.get_child_count.should eql 3
|
44
|
+
child.get_head_index.should eql 5
|
45
|
+
|
46
|
+
child.get_head.get_child_count.should eql 1
|
47
|
+
child.get_type.should eql "S"
|
48
|
+
|
40
49
|
end
|
41
50
|
end
|
42
51
|
|
metadata
CHANGED
@@ -1,32 +1,48 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: open-nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Louis Mullie
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bind-it
|
16
|
-
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
17
18
|
requirements:
|
18
19
|
- - ! '>='
|
19
20
|
- !ruby/object:Gem::Version
|
20
21
|
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
21
25
|
none: false
|
22
|
-
requirement: !ruby/object:Gem::Requirement
|
23
26
|
requirements:
|
24
27
|
- - ! '>='
|
25
28
|
- !ruby/object:Gem::Version
|
26
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
27
33
|
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
28
39
|
prerelease: false
|
29
|
-
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
30
46
|
description: ! ' Ruby bindings to the OpenNLP tools, a Java machine learning toolkit
|
31
47
|
for natural language processing (NLP). '
|
32
48
|
email:
|
@@ -39,8 +55,13 @@ files:
|
|
39
55
|
- bin/opennlp-maxent-3.0.2-incubating.jar
|
40
56
|
- bin/opennlp-tools-1.5.2-incubating.jar
|
41
57
|
- bin/opennlp-uima-1.5.2-incubating.jar
|
42
|
-
-
|
58
|
+
- bin/utils.jar
|
59
|
+
- bin/Utils.java
|
60
|
+
- lib/open-nlp/base.rb
|
61
|
+
- lib/open-nlp/bindings.rb
|
62
|
+
- lib/open-nlp/classes.rb
|
43
63
|
- lib/open-nlp/config.rb
|
64
|
+
- lib/open-nlp.rb
|
44
65
|
- spec/english_spec.rb
|
45
66
|
- spec/sample.txt
|
46
67
|
- spec/spec_helper.rb
|
@@ -48,27 +69,26 @@ files:
|
|
48
69
|
- LICENSE
|
49
70
|
homepage: https://github.com/louismullie/open-nlp
|
50
71
|
licenses: []
|
51
|
-
post_install_message:
|
72
|
+
post_install_message:
|
52
73
|
rdoc_options: []
|
53
74
|
require_paths:
|
54
75
|
- lib
|
55
76
|
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
none: false
|
56
78
|
requirements:
|
57
79
|
- - ! '>='
|
58
80
|
- !ruby/object:Gem::Version
|
59
81
|
version: '0'
|
60
|
-
none: false
|
61
82
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
62
84
|
requirements:
|
63
85
|
- - ! '>='
|
64
86
|
- !ruby/object:Gem::Version
|
65
87
|
version: '0'
|
66
|
-
none: false
|
67
88
|
requirements: []
|
68
|
-
rubyforge_project:
|
89
|
+
rubyforge_project:
|
69
90
|
rubygems_version: 1.8.24
|
70
|
-
signing_key:
|
91
|
+
signing_key:
|
71
92
|
specification_version: 3
|
72
93
|
summary: Ruby bindings to the OpenNLP Java toolkit.
|
73
94
|
test_files: []
|
74
|
-
...
|