open_nlp 0.0.2-java → 0.0.3-java
- data/.rspec +1 -0
- data/README.md +28 -4
- data/lib/open_nlp/chunker.rb +4 -0
- data/lib/open_nlp/java_class.rb +17 -0
- data/lib/open_nlp/model/chunker.rb +1 -1
- data/lib/open_nlp/model/detokenizer.rb +1 -1
- data/lib/open_nlp/model/named_entity_detector.rb +1 -1
- data/lib/open_nlp/model/parser.rb +3 -0
- data/lib/open_nlp/model/pos_tagger.rb +1 -1
- data/lib/open_nlp/model/sentence_detector.rb +1 -1
- data/lib/open_nlp/model/tokenizer.rb +1 -1
- data/lib/open_nlp/model.rb +3 -11
- data/lib/open_nlp/parser/parse.rb +54 -0
- data/lib/open_nlp/parser.rb +101 -0
- data/lib/open_nlp/tokenizer.rb +5 -0
- data/lib/open_nlp/tool.rb +2 -10
- data/lib/open_nlp/utils/span.rb +15 -0
- data/lib/open_nlp/version.rb +1 -1
- data/lib/open_nlp.rb +7 -1
- data/open_nlp.gemspec +1 -0
- data/spec/chunker_spec.rb +8 -0
- data/spec/fixtures/en-parser-chunking.bin +0 -0
- data/spec/model/chunker_spec.rb +2 -2
- data/spec/model/detokenizer_spec.rb +2 -2
- data/spec/model/named_entity_detector_spec.rb +2 -2
- data/spec/model/pos_tagger_spec.rb +2 -2
- data/spec/model/sentence_detector_spec.rb +2 -2
- data/spec/model/tokenizer_spec.rb +2 -2
- data/spec/parser/parse_spec.rb +106 -0
- data/spec/parser_spec.rb +39 -0
- metadata +15 -3
data/.rspec
ADDED
@@ -0,0 +1 @@
+--colour
data/README.md
CHANGED
@@ -6,6 +6,7 @@ A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute co
* part-of-speech tagging
* named entity extraction
* chunks detection
+* parsing

## Installation

@@ -29,17 +30,20 @@ To use open_nlp classes, you need to require it in your sources

Then you can create instances of open_nlp classes and use it for your nlp tasks

-
+### Sentence detection
+
sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
sentence_detector.detect('The red fox sleeps soundly.')

-
+### Tokenize
+
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
tokenizer = OpenNlp::Tokenizer.new(token_model)
tokenizer.tokenize('The red fox sleeps soundly.')

-
+### Part-of-speech tagging
+
pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
pos_tagger = OpenNlp::POSTagger.new(pos_model)

@@ -49,11 +53,31 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
# to tag array of tokens call OpenNlp::POSTagger#tag with Array argument
pos_tagger.tag(%w|The red fox sleeps soundly .|)

-
+### Chunks detection
+
+# chunker also needs tokenizer and pos-tagger models
+# because it uses tokenizing and pos-tagging inside chunk task
chunk_model = OpenNlp::Model::Chunker.new(File.join("nlp_models/en-chunker.bin"))
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
+chunker.chunk('The red fox sleeps soundly.')
+
+### Parsing
+
+# parser also needs tokenizer model because it uses tokenizer inside parse task
+parse_model = OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin"))
+token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
+parser = OpenNlp::Parser.new(parse_model, token_model)
+
+# the result will be an instance of OpenNlp::Parser::Parse
+parse_info = parser.parse('The red fox sleeps soundly.')
+
+# you can get tree bank string by calling
+parse_info.tree_bank_string
+
+# you can get code tree structure of parse result by calling
+parse_info.code_tree

## Contributing

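Note: the Parsing example added to the README builds the parser model with the spec constant FIXTURES_DIR, which only exists inside the test suite. A minimal end-to-end sketch of the same workflow, assuming the pre-trained model files sit in a local nlp_models/ directory (that path is an assumption, not part of this diff):

    require 'open_nlp'

    # model paths are assumed; point them at your downloaded OpenNLP .bin files
    parse_model = OpenNlp::Model::Parser.new("nlp_models/en-parser-chunking.bin")
    token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")

    parser     = OpenNlp::Parser.new(parse_model, token_model)
    parse_info = parser.parse('The red fox sleeps soundly.')  # => OpenNlp::Parser::Parse

    parse_info.tree_bank_string  # bracketed constituency tree as a string
    parse_info.code_tree         # nested hashes with :type, :parent_type, :token, :children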
data/lib/open_nlp/chunker.rb
CHANGED
data/lib/open_nlp/model.rb
CHANGED
@@ -1,5 +1,7 @@
module OpenNlp
  class Model
+    include JavaClass
+
    attr_reader :j_model

    def initialize(model)
@@ -12,17 +14,7 @@ module OpenNlp
        raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
      end

-      @j_model = self.class.
-    end
-
-    class << self
-      def java_class_name=(value)
-        @java_class = value
-      end
-
-      def java_class_name
-        @java_class
-      end
+      @j_model = self.class.java_class.new(model_stream)
    end
  end
end
data/lib/open_nlp/parser/parse.rb
ADDED
@@ -0,0 +1,54 @@
+module OpenNlp
+  class Parser::Parse
+    include JavaClass
+
+    attr_reader :j_instance
+
+    self.java_class = Java::opennlp.tools.parser.Parse
+
+    def initialize(java_instance)
+      raise ArgumentError, "java_instance must be an instance of #{self.class.java_class.name}" unless java_instance.is_a?(self.class.java_class)
+
+      @j_instance = java_instance
+    end
+
+    def tree_bank_string
+      span = j_instance.getSpan
+      text = j_instance.getText
+      type = j_instance.getType
+      start = span.getStart
+
+      res = ''
+
+      res << "(#{type} " unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+
+      j_instance.getChildren.each do |c|
+        s = c.span
+        res << text[start..s.getStart-1] if start < s.getStart
+
+        subtree = self.class.new(c).tree_bank_string
+        res << subtree if subtree
+        start = s.getEnd
+      end
+
+      res << text[start..span.getEnd-1] if start < span.getEnd
+
+      res << ")" unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+
+      res
+    end
+
+    def code_tree
+      kids = j_instance.getChildren
+
+      kids.inject([]) do |acc,kid|
+        data = {type: kid.getType, parent_type: self.j_instance.getType, token: kid.toString}
+        subtree = self.class.new(kid).code_tree
+        data[:children] = subtree unless subtree.empty?
+        acc << data
+
+        acc
+      end
+    end
+  end
+end
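The two public methods mirror each other: tree_bank_string renders the parse as a bracketed Penn-treebank-style string, while code_tree returns the same structure as nested hashes. The specs added later in this release pin down the expected output for the sample sentence; for instance (parser here is an OpenNlp::Parser instance, built as in the README sketch earlier):

    parse = parser.parse('The red fox sleeps soundly .')
    parse.tree_bank_string
    # => "(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))"
    parse.code_tree.first[:type]         # => "S"
    parse.code_tree.first[:parent_type]  # => "TOP"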
data/lib/open_nlp/parser.rb
ADDED
@@ -0,0 +1,101 @@
+module OpenNlp
+  class Parser < Tool
+    def initialize(model, token_model)
+      raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
+      raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
+
+      @j_instance = Java::opennlp.tools.parser.ParserFactory.create(model.j_model)
+
+      @tokenizer = Tokenizer.new(token_model)
+    end
+
+    def parse(text)
+      raise ArgumentError, "str must be a String" unless text.is_a?(String)
+      return {} if text.empty?
+
+      parse_obj = Java::opennlp.tools.parser.Parse.new(
+        text.to_java(:String),
+        Java::opennlp.tools.util.Span.new(0, text.size),
+        Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
+        1.to_java(:Double), # probability ?
+        0.to_java(:Integer) # the token index of the head of this parse
+      )
+
+      tokens = @tokenizer.tokenize(text)
+
+      tokens.each_with_index do |tok, i|
+        start = get_token_offset text, tokens, i
+
+        parse_obj.insert Java::opennlp.tools.parser.Parse.new(
+          text.to_java(:String),
+          Java::opennlp.tools.util.Span.new(start, start + tok.size),
+          Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE.to_java(:String),
+          0.to_java(:Double),
+          i.to_java(:Integer)
+        )
+      end
+
+      #code_tree @j_instance.parse(parse_obj)
+      Parser::Parse.new(@j_instance.parse(parse_obj))
+    end
+
+    private
+    def get_token_offset(text, tokens, index)
+      offset = 0
+
+      for i in (1..index) do
+        offset = text.index tokens[i], offset + tokens[i - 1].size
+      end if index > 0
+
+      offset
+    end
+
+    #def build_tree(parse_obj)
+    #  span = parse_obj.getSpan
+    #  start = span.getStart
+    #  text = parse_obj.getText
+    #  type = parse_obj.getType
+    #
+    #  res = {}
+    #  res[:type] = type unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+    #
+    #  children = parse_obj.getChildren.inject([]) do |acc,c|
+    #    s = c.span
+    #
+    #    h = {}
+    #
+    #    if start < s.getStart
+    #      token = text[start..s.getStart-1]
+    #      h[:token] = token unless token.strip.empty?
+    #    end
+    #
+    #    subtree = build_tree(c)
+    #    h[:children] = subtree unless subtree.empty?
+    #
+    #    start = s.getEnd
+    #
+    #    acc << h
+    #    acc
+    #  end
+    #
+    #  res[:token] = text[start..span.getEnd-1] if start < span.getEnd
+    #
+    #  res[:children] = children unless children.empty?
+    #
+    #  res
+    #end
+
+    def code_tree(parse_obj)
+      kids = parse_obj.getChildren
+
+      kids.inject([]) do |acc,kid|
+        data = {type: kid.getType, parent_type: parse_obj.getType, token: kid.toString}
+        subtree = code_tree(kid)
+        data[:children] = subtree unless subtree.empty?
+        acc << data
+
+        acc
+      end
+    end
+  end
+end
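The private get_token_offset helper recovers each token's character offset in the original string by searching forward from the end of the previous token. A small worked example of the logic as written (the values are illustrative only):

    text   = "The red fox"
    tokens = %w[The red fox]

    # index 0: the `if index > 0` guard fails, offset stays 0       -> "The" starts at 0
    # index 2: i = 1 -> offset = text.index("red", 0 + 3) = 4
    #          i = 2 -> offset = text.index("fox", 4 + 3) = 8       -> "fox" starts at 8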
data/lib/open_nlp/tokenizer.rb
CHANGED
data/lib/open_nlp/tool.rb
CHANGED
@@ -1,20 +1,12 @@
module OpenNlp
  class Tool
+    include JavaClass
+
    attr_reader :j_instance

    def initialize(model)
      raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
      @j_instance = self.class.java_class.new(model.j_model)
    end
-
-    class << self
-      def java_class=(value)
-        @java_class = value
-      end
-
-      def java_class
-        @java_class
-      end
-    end
  end
end
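The class-level accessor previously duplicated in Model and Tool now lives in the new OpenNlp::JavaClass mixin (data/lib/open_nlp/java_class.rb, +17 lines, not expanded on this page). Its contents are not shown in the diff; judging from the call sites (`self.java_class = ...` at class level and `self.class.java_class` in instances), a minimal mixin along these lines would fit — this is a sketch, not the actual file:

    module OpenNlp
      # Hypothetical reconstruction: gives the including class a `java_class`
      # class-level accessor, matching how Model, Tool, Parser::Parse and
      # Utils::Span use it elsewhere in this diff.
      module JavaClass
        def self.included(base)
          base.extend(ClassMethods)
        end

        module ClassMethods
          attr_accessor :java_class
        end
      end
    end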
data/lib/open_nlp/utils/span.rb
ADDED
@@ -0,0 +1,15 @@
+module OpenNlp
+  module Utils
+    class Span
+      include JavaClass
+
+      self.java_class = Java::opennlp.tools.util.Span
+
+      attr_reader :j_instance
+
+      def initialize(start_offset, end_offset)
+        @j_instance = self.class.java_class.new(start_offset, end_offset)
+      end
+    end
+  end
+end
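Utils::Span is a thin Ruby wrapper over the OpenNLP span class; usage is simply:

    span = OpenNlp::Utils::Span.new(0, 7)
    span.j_instance  # => the underlying Java::opennlp.tools.util.Span instance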
data/lib/open_nlp/version.rb
CHANGED
data/lib/open_nlp.rb
CHANGED
@@ -4,17 +4,23 @@ require 'java'
require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'

+require 'open_nlp/java_class'
+
require 'open_nlp/model'
+
require 'open_nlp/model/chunker'
require 'open_nlp/model/detokenizer'
require 'open_nlp/model/named_entity_detector'
require 'open_nlp/model/pos_tagger'
require 'open_nlp/model/sentence_detector'
require 'open_nlp/model/tokenizer'
+require 'open_nlp/model/parser'

require 'open_nlp/tool'
require 'open_nlp/named_entity_detector'
require 'open_nlp/pos_tagger'
require 'open_nlp/sentence_detector'
require 'open_nlp/tokenizer'
-require 'open_nlp/chunker'
+require 'open_nlp/chunker'
+require 'open_nlp/parser'
+require 'open_nlp/parser/parse'
data/open_nlp.gemspec
CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |gem|
  gem.authors = ["Hck"]
  gem.description = %q{JRuby tools wrapper for Apache OpenNLP}
  gem.summary = %q{A JRuby wrapper for the Apache OpenNLP tools library}
+  gem.homepage = "http://github.com/hck/open_nlp"

  gem.files = `git ls-files`.split($/)
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
data/spec/chunker_spec.rb
CHANGED
@@ -16,6 +16,14 @@ describe OpenNlp::Chunker do
    it "should raise an argument error when no model is supplied" do
      lambda { subject.new(nil, nil, nil) }.should raise_error(ArgumentError)
    end
+
+    it "should raise an argument error when no token_model is supplied" do
+      lambda { subject.new(model, nil, nil) }.should raise_error(ArgumentError)
+    end
+
+    it "should raise an argument error when no pos_model is supplied" do
+      lambda { subject.new(model, token_model, nil) }.should raise_error(ArgumentError)
+    end
  end

  describe "chunking a string" do
data/spec/fixtures/en-parser-chunking.bin
ADDED
Binary file
data/spec/model/chunker_spec.rb
CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Chunker do
  it "should accept a string filename parameter" do
    chunker_model = subject.new(model_file_name)
    chunker_model.should be_a(subject)
-    chunker_model.j_model.should be_a(subject.
+    chunker_model.j_model.should be_a(subject.java_class)
  end

  it "should accept a java.io.FileInputStream object" do
    file_input_stream = java.io.FileInputStream.new(model_file_name)
    chunker_model = subject.new(file_input_stream)
    chunker_model.should be_a(subject)
-    chunker_model.j_model.should be_a(subject.
+    chunker_model.j_model.should be_a(subject.java_class)
  end

  it "should raise an argument error otherwise" do
data/spec/model/detokenizer_spec.rb
CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Detokenizer do
  it "should accept a string filename parameter" do
    model = subject.new(model_file_name)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should accept a java.io.FileInputStream object" do
    file_input_stream = java.io.FileInputStream.new(model_file_name)
    model = subject.new(file_input_stream)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should raise an argument error otherwise" do
data/spec/model/named_entity_detector_spec.rb
CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::NamedEntityDetector do
  it "should accept a string filename parameter" do
    model = subject.new(model_file_name)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should accept a java.io.FileInputStream object" do
    file_input_stream = java.io.FileInputStream.new(model_file_name)
    model = subject.new(file_input_stream)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should raise an argument error otherwise" do
data/spec/model/pos_tagger_spec.rb
CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::POSTagger do
  it "should accept a string filename parameter" do
    model = subject.new(model_file_name)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should accept a java.io.FileInputStream object" do
    file_input_stream = java.io.FileInputStream.new(model_file_name)
    model = subject.new(file_input_stream)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should raise an argument error otherwise" do
data/spec/model/sentence_detector_spec.rb
CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::SentenceDetector do
  it "should accept a string filename parameter" do
    model = subject.new(model_file_name)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should accept a java.io.FileInputStream object" do
    file_input_stream = java.io.FileInputStream.new(model_file_name)
    model = subject.new(file_input_stream)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should raise an argument error otherwise" do
data/spec/model/tokenizer_spec.rb
CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Tokenizer do
  it "should accept a string filename parameter" do
    model = subject.new(model_file_name)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should accept a java.io.FileInputStream object" do
    file_input_stream = java.io.FileInputStream.new(model_file_name)
    model = subject.new(file_input_stream)
    model.should be_a(subject)
-    model.j_model.should be_a(subject.
+    model.j_model.should be_a(subject.java_class)
  end

  it "should raise an argument error otherwise" do
data/spec/parser/parse_spec.rb
ADDED
@@ -0,0 +1,106 @@
+require "spec_helper"
+
+describe OpenNlp::Parser::Parse do
+  subject { OpenNlp::Parser::Parse }
+  let(:text) { 'The red fox sleeps soundly .' }
+  let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
+  let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+
+  describe "initialization" do
+    it "should initialize a new parse object" do
+      j_parse = Java::opennlp.tools.parser.Parse.new(
+        text.to_java(:String),
+        Java::opennlp.tools.util.Span.new(0, text.size),
+        Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
+        1.to_java(:Double),
+        0.to_java(:Integer)
+      )
+
+      subject.new(j_parse).should be_a(subject)
+    end
+
+    it "should raise an argument error when no model is supplied" do
+      lambda { subject.new(nil) }.should raise_error(ArgumentError)
+    end
+  end
+
+  describe "#tree_bank_string" do
+    it "returns proper string value for parsed text" do
+      parser = OpenNlp::Parser.new(model, token_model)
+      expected = parser.parse(text).tree_bank_string
+      expected.should == "(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))"
+    end
+  end
+
+  describe "#code_tree" do
+    it "returns proper structure for parsed text" do
+      parser = OpenNlp::Parser.new(model, token_model)
+      parser.parse(text).code_tree.should == [
+        {
+          :type => "S",
+          :parent_type => "TOP",
+          :token => "The red fox sleeps soundly .",
+          :children => [
+            {
+              :type => "NP",
+              :parent_type => "S",
+              :token => "The red fox",
+              :children => [
+                {
+                  :type => "DT",
+                  :parent_type => "NP",
+                  :token => "The",
+                  :children => [{:type => "TK", :parent_type => "DT", :token => "The"}]
+                },
+                {
+                  :type => "JJ",
+                  :parent_type => "NP",
+                  :token => "red",
+                  :children => [{:type => "TK", :parent_type => "JJ", :token => "red"}]
+                },
+                {
+                  :type => "NN",
+                  :parent_type => "NP",
+                  :token => "fox",
+                  :children => [{:type => "TK", :parent_type => "NN", :token => "fox"}]
+                }
+              ]
+            },
+            {
+              :type => "VP",
+              :parent_type => "S",
+              :token => "sleeps soundly",
+              :children => [
+                {
+                  :type => "VBZ",
+                  :parent_type => "VP",
+                  :token => "sleeps",
+                  :children => [{:type => "TK", :parent_type => "VBZ", :token => "sleeps"}]
+                },
+                {
+                  :type => "ADVP",
+                  :parent_type => "VP",
+                  :token => "soundly",
+                  :children => [
+                    {
+                      :type => "RB",
+                      :parent_type => "ADVP",
+                      :token => "soundly",
+                      :children => [{:type => "TK", :parent_type => "RB", :token => "soundly"}]
+                    }
+                  ]
+                }
+              ]
+            },
+            {
+              :type => ".",
+              :parent_type => "S",
+              :token => ".",
+              :children => [{:type => "TK", :parent_type => ".", :token => "."}]
+            }
+          ]
+        }
+      ]
+    end
+  end
+end
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,39 @@
+require "spec_helper"
+
+describe OpenNlp::Parser do
+  subject { OpenNlp::Parser }
+  let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
+  let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+
+  describe "initialization" do
+    it "should initialize a new parser" do
+      parser = subject.new(model, token_model)
+      parser.should be_a(subject)
+    end
+
+    it "should raise an argument error when no model is supplied" do
+      lambda { subject.new(nil, nil) }.should raise_error(ArgumentError)
+    end
+
+    it "should raise an argument error when no token_model is supplied" do
+      lambda { subject.new(model, nil) }.should raise_error(ArgumentError)
+    end
+  end
+
+  describe "parsing a string" do
+    let(:parser) { subject.new(model, token_model) }
+
+    it "should parse an empty string" do
+      parser.parse("").should == {}
+    end
+
+    it "should parse a sentence" do
+      res = parser.parse("The red fox sleeps soundly .")
+      res.class.should == OpenNlp::Parser::Parse
+    end
+
+    it "should raise an error when not passed a string" do
+      lambda { parser.parse(nil) }.should raise_error(ArgumentError)
+    end
+  end
+end
metadata
CHANGED
@@ -2,14 +2,14 @@
name: open_nlp
version: !ruby/object:Gem::Version
  prerelease:
-  version: 0.0.
+  version: 0.0.3
platform: java
authors:
- Hck
autorequire:
bindir: bin
cert_chain: []
-date: 2012-09-
+date: 2012-09-26 00:00:00.000000000 Z
dependencies: []
description: JRuby tools wrapper for Apache OpenNLP
email:
@@ -18,32 +18,39 @@ extensions: []
extra_rdoc_files: []
files:
- .gitignore
+- .rspec
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
- lib/open_nlp.rb
- lib/open_nlp/chunker.rb
+- lib/open_nlp/java_class.rb
- lib/open_nlp/model.rb
- lib/open_nlp/model/chunker.rb
- lib/open_nlp/model/detokenizer.rb
- lib/open_nlp/model/named_entity_detector.rb
+- lib/open_nlp/model/parser.rb
- lib/open_nlp/model/pos_tagger.rb
- lib/open_nlp/model/sentence_detector.rb
- lib/open_nlp/model/tokenizer.rb
- lib/open_nlp/named_entity_detector.rb
- lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar
- lib/open_nlp/opennlp-tools-1.5.2-incubating.jar
+- lib/open_nlp/parser.rb
+- lib/open_nlp/parser/parse.rb
- lib/open_nlp/pos_tagger.rb
- lib/open_nlp/sentence_detector.rb
- lib/open_nlp/tokenizer.rb
- lib/open_nlp/tool.rb
+- lib/open_nlp/utils/span.rb
- lib/open_nlp/version.rb
- open_nlp.gemspec
- spec/chunker_spec.rb
- spec/fixtures/en-chunker.bin
- spec/fixtures/en-detokenizer.xml
- spec/fixtures/en-ner-time.bin
+- spec/fixtures/en-parser-chunking.bin
- spec/fixtures/en-pos-maxent.bin
- spec/fixtures/en-sent.bin
- spec/fixtures/en-token.bin
@@ -54,11 +61,13 @@ files:
- spec/model/sentence_detector_spec.rb
- spec/model/tokenizer_spec.rb
- spec/named_entity_detector_spec.rb
+- spec/parser/parse_spec.rb
+- spec/parser_spec.rb
- spec/pos_tagger_spec.rb
- spec/sentence_detector_spec.rb
- spec/spec_helper.rb
- spec/tokenizer_spec.rb
-homepage:
+homepage: http://github.com/hck/open_nlp
licenses: []
post_install_message:
rdoc_options: []
@@ -89,6 +98,7 @@ test_files:
- spec/fixtures/en-chunker.bin
- spec/fixtures/en-detokenizer.xml
- spec/fixtures/en-ner-time.bin
+- spec/fixtures/en-parser-chunking.bin
- spec/fixtures/en-pos-maxent.bin
- spec/fixtures/en-sent.bin
- spec/fixtures/en-token.bin
@@ -99,6 +109,8 @@ test_files:
- spec/model/sentence_detector_spec.rb
- spec/model/tokenizer_spec.rb
- spec/named_entity_detector_spec.rb
+- spec/parser/parse_spec.rb
+- spec/parser_spec.rb
- spec/pos_tagger_spec.rb
- spec/sentence_detector_spec.rb
- spec/spec_helper.rb