open_nlp 0.0.2-java → 0.0.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/README.md +28 -4
- data/lib/open_nlp/chunker.rb +4 -0
- data/lib/open_nlp/java_class.rb +17 -0
- data/lib/open_nlp/model/chunker.rb +1 -1
- data/lib/open_nlp/model/detokenizer.rb +1 -1
- data/lib/open_nlp/model/named_entity_detector.rb +1 -1
- data/lib/open_nlp/model/parser.rb +3 -0
- data/lib/open_nlp/model/pos_tagger.rb +1 -1
- data/lib/open_nlp/model/sentence_detector.rb +1 -1
- data/lib/open_nlp/model/tokenizer.rb +1 -1
- data/lib/open_nlp/model.rb +3 -11
- data/lib/open_nlp/parser/parse.rb +54 -0
- data/lib/open_nlp/parser.rb +101 -0
- data/lib/open_nlp/tokenizer.rb +5 -0
- data/lib/open_nlp/tool.rb +2 -10
- data/lib/open_nlp/utils/span.rb +15 -0
- data/lib/open_nlp/version.rb +1 -1
- data/lib/open_nlp.rb +7 -1
- data/open_nlp.gemspec +1 -0
- data/spec/chunker_spec.rb +8 -0
- data/spec/fixtures/en-parser-chunking.bin +0 -0
- data/spec/model/chunker_spec.rb +2 -2
- data/spec/model/detokenizer_spec.rb +2 -2
- data/spec/model/named_entity_detector_spec.rb +2 -2
- data/spec/model/pos_tagger_spec.rb +2 -2
- data/spec/model/sentence_detector_spec.rb +2 -2
- data/spec/model/tokenizer_spec.rb +2 -2
- data/spec/parser/parse_spec.rb +106 -0
- data/spec/parser_spec.rb +39 -0
- metadata +15 -3
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/README.md
CHANGED
@@ -6,6 +6,7 @@ A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute co
|
|
6
6
|
* part-of-speech tagging
|
7
7
|
* named entity extraction
|
8
8
|
* chunks detection
|
9
|
+
* parsing
|
9
10
|
|
10
11
|
## Installation
|
11
12
|
|
@@ -29,17 +30,20 @@ To use open_nlp classes, you need to require it in your sources
|
|
29
30
|
|
30
31
|
Then you can create instances of open_nlp classes and use them for your NLP tasks
|
31
32
|
|
32
|
-
|
33
|
+
### Sentence detection
|
34
|
+
|
33
35
|
sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
|
34
36
|
sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
|
35
37
|
sentence_detector.detect('The red fox sleeps soundly.')
|
36
38
|
|
37
|
-
|
39
|
+
### Tokenize
|
40
|
+
|
38
41
|
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
39
42
|
tokenizer = OpenNlp::Tokenizer.new(token_model)
|
40
43
|
tokenizer.tokenize('The red fox sleeps soundly.')
|
41
44
|
|
42
|
-
|
45
|
+
### Part-of-speech tagging
|
46
|
+
|
43
47
|
pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
|
44
48
|
pos_tagger = OpenNlp::POSTagger.new(pos_model)
|
45
49
|
|
@@ -49,11 +53,31 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
49
53
|
# to tag array of tokens call OpenNlp::POSTagger#tag with Array argument
|
50
54
|
pos_tagger.tag(%w|The red fox sleeps soundly .|)
|
51
55
|
|
52
|
-
|
56
|
+
### Chunks detection
|
57
|
+
|
58
|
+
# chunker also needs tokenizer and pos-tagger models
|
59
|
+
# because it uses tokenizing and pos-tagging inside chunk task
|
53
60
|
chunk_model = OpenNlp::Model::Chunker.new(File.join("nlp_models/en-chunker.bin"))
|
54
61
|
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
55
62
|
pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
|
56
63
|
chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
|
64
|
+
chunker.chunk('The red fox sleeps soundly.')
|
65
|
+
|
66
|
+
### Parsing
|
67
|
+
|
68
|
+
# parser also needs tokenizer model because it uses tokenizer inside parse task
|
69
|
+
parse_model = OpenNlp::Model::Parser.new("nlp_models/en-parser-chunking.bin")
|
70
|
+
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
71
|
+
parser = OpenNlp::Parser.new(parse_model, token_model)
|
72
|
+
|
73
|
+
# the result will be an instance of OpenNlp::Parser::Parse
|
74
|
+
parse_info = parser.parse('The red fox sleeps soundly.')
|
75
|
+
|
76
|
+
# you can get the treebank string by calling
|
77
|
+
parse_info.tree_bank_string
|
78
|
+
|
79
|
+
# you can get the code tree structure of the parse result by calling
|
80
|
+
parse_info.code_tree
|
57
81
|
|
58
82
|
## Contributing
|
59
83
|
|
data/lib/open_nlp/chunker.rb
CHANGED
data/lib/open_nlp/model.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module OpenNlp
|
2
2
|
class Model
|
3
|
+
include JavaClass
|
4
|
+
|
3
5
|
attr_reader :j_model
|
4
6
|
|
5
7
|
def initialize(model)
|
@@ -12,17 +14,7 @@ module OpenNlp
|
|
12
14
|
raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
|
13
15
|
end
|
14
16
|
|
15
|
-
@j_model = self.class.
|
16
|
-
end
|
17
|
-
|
18
|
-
class << self
|
19
|
-
def java_class_name=(value)
|
20
|
-
@java_class = value
|
21
|
-
end
|
22
|
-
|
23
|
-
def java_class_name
|
24
|
-
@java_class
|
25
|
-
end
|
17
|
+
@j_model = self.class.java_class.new(model_stream)
|
26
18
|
end
|
27
19
|
end
|
28
20
|
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module OpenNlp
|
2
|
+
class Parser::Parse
|
3
|
+
include JavaClass
|
4
|
+
|
5
|
+
attr_reader :j_instance
|
6
|
+
|
7
|
+
self.java_class = Java::opennlp.tools.parser.Parse
|
8
|
+
|
9
|
+
def initialize(java_instance)
|
10
|
+
raise ArgumentError, "java_instance must be an instance of #{self.class.java_class.name}" unless java_instance.is_a?(self.class.java_class)
|
11
|
+
|
12
|
+
@j_instance = java_instance
|
13
|
+
end
|
14
|
+
|
15
|
+
def tree_bank_string
|
16
|
+
span = j_instance.getSpan
|
17
|
+
text = j_instance.getText
|
18
|
+
type = j_instance.getType
|
19
|
+
start = span.getStart
|
20
|
+
|
21
|
+
res = ''
|
22
|
+
|
23
|
+
res << "(#{type} " unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
24
|
+
|
25
|
+
j_instance.getChildren.each do |c|
|
26
|
+
s = c.span
|
27
|
+
res << text[start..s.getStart-1] if start < s.getStart
|
28
|
+
|
29
|
+
subtree = self.class.new(c).tree_bank_string
|
30
|
+
res << subtree if subtree
|
31
|
+
start = s.getEnd
|
32
|
+
end
|
33
|
+
|
34
|
+
res << text[start..span.getEnd-1] if start < span.getEnd
|
35
|
+
|
36
|
+
res << ")" unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
37
|
+
|
38
|
+
res
|
39
|
+
end
|
40
|
+
|
41
|
+
def code_tree
|
42
|
+
kids = j_instance.getChildren
|
43
|
+
|
44
|
+
kids.inject([]) do |acc,kid|
|
45
|
+
data = {type: kid.getType, parent_type: self.j_instance.getType, token: kid.toString}
|
46
|
+
subtree = self.class.new(kid).code_tree
|
47
|
+
data[:children] = subtree unless subtree.empty?
|
48
|
+
acc << data
|
49
|
+
|
50
|
+
acc
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
module OpenNlp
|
2
|
+
class Parser < Tool
|
3
|
+
def initialize(model, token_model)
|
4
|
+
raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
|
5
|
+
raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
|
6
|
+
|
7
|
+
@j_instance = Java::opennlp.tools.parser.ParserFactory.create(model.j_model)
|
8
|
+
|
9
|
+
@tokenizer = Tokenizer.new(token_model)
|
10
|
+
end
|
11
|
+
|
12
|
+
def parse(text)
|
13
|
+
raise ArgumentError, "str must be a String" unless text.is_a?(String)
|
14
|
+
return {} if text.empty?
|
15
|
+
|
16
|
+
parse_obj = Java::opennlp.tools.parser.Parse.new(
|
17
|
+
text.to_java(:String),
|
18
|
+
Java::opennlp.tools.util.Span.new(0, text.size),
|
19
|
+
Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
|
20
|
+
1.to_java(:Double), # probability ?
|
21
|
+
0.to_java(:Integer) # the token index of the head of this parse
|
22
|
+
)
|
23
|
+
|
24
|
+
tokens = @tokenizer.tokenize(text)
|
25
|
+
|
26
|
+
tokens.each_with_index do |tok, i|
|
27
|
+
start = get_token_offset text, tokens, i
|
28
|
+
|
29
|
+
parse_obj.insert Java::opennlp.tools.parser.Parse.new(
|
30
|
+
text.to_java(:String),
|
31
|
+
Java::opennlp.tools.util.Span.new(start, start + tok.size),
|
32
|
+
Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE.to_java(:String),
|
33
|
+
0.to_java(:Double),
|
34
|
+
i.to_java(:Integer)
|
35
|
+
)
|
36
|
+
end
|
37
|
+
|
38
|
+
#code_tree @j_instance.parse(parse_obj)
|
39
|
+
Parser::Parse.new(@j_instance.parse(parse_obj))
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
def get_token_offset(text, tokens, index)
|
44
|
+
offset = 0
|
45
|
+
|
46
|
+
for i in (1..index) do
|
47
|
+
offset = text.index tokens[i], offset + tokens[i - 1].size
|
48
|
+
end if index > 0
|
49
|
+
|
50
|
+
offset
|
51
|
+
end
|
52
|
+
|
53
|
+
#def build_tree(parse_obj)
|
54
|
+
# span = parse_obj.getSpan
|
55
|
+
# start = span.getStart
|
56
|
+
# text = parse_obj.getText
|
57
|
+
# type = parse_obj.getType
|
58
|
+
#
|
59
|
+
# res = {}
|
60
|
+
# res[:type] = type unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
61
|
+
#
|
62
|
+
# children = parse_obj.getChildren.inject([]) do |acc,c|
|
63
|
+
# s = c.span
|
64
|
+
#
|
65
|
+
# h = {}
|
66
|
+
#
|
67
|
+
# if start < s.getStart
|
68
|
+
# token = text[start..s.getStart-1]
|
69
|
+
# h[:token] = token unless token.strip.empty?
|
70
|
+
# end
|
71
|
+
#
|
72
|
+
# subtree = build_tree(c)
|
73
|
+
# h[:children] = subtree unless subtree.empty?
|
74
|
+
#
|
75
|
+
# start = s.getEnd
|
76
|
+
#
|
77
|
+
# acc << h
|
78
|
+
# acc
|
79
|
+
# end
|
80
|
+
#
|
81
|
+
# res[:token] = text[start..span.getEnd-1] if start < span.getEnd
|
82
|
+
#
|
83
|
+
# res[:children] = children unless children.empty?
|
84
|
+
#
|
85
|
+
# res
|
86
|
+
#end
|
87
|
+
|
88
|
+
def code_tree(parse_obj)
|
89
|
+
kids = parse_obj.getChildren
|
90
|
+
|
91
|
+
kids.inject([]) do |acc,kid|
|
92
|
+
data = {type: kid.getType, parent_type: parse_obj.getType, token: kid.toString}
|
93
|
+
subtree = code_tree(kid)
|
94
|
+
data[:children] = subtree unless subtree.empty?
|
95
|
+
acc << data
|
96
|
+
|
97
|
+
acc
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
data/lib/open_nlp/tokenizer.rb
CHANGED
data/lib/open_nlp/tool.rb
CHANGED
@@ -1,20 +1,12 @@
|
|
1
1
|
module OpenNlp
|
2
2
|
class Tool
|
3
|
+
include JavaClass
|
4
|
+
|
3
5
|
attr_reader :j_instance
|
4
6
|
|
5
7
|
def initialize(model)
|
6
8
|
raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
|
7
9
|
@j_instance = self.class.java_class.new(model.j_model)
|
8
10
|
end
|
9
|
-
|
10
|
-
class << self
|
11
|
-
def java_class=(value)
|
12
|
-
@java_class = value
|
13
|
-
end
|
14
|
-
|
15
|
-
def java_class
|
16
|
-
@java_class
|
17
|
-
end
|
18
|
-
end
|
19
11
|
end
|
20
12
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module OpenNlp
|
2
|
+
module Utils
|
3
|
+
class Span
|
4
|
+
include JavaClass
|
5
|
+
|
6
|
+
self.java_class = Java::opennlp.tools.util.Span
|
7
|
+
|
8
|
+
attr_reader :j_instance
|
9
|
+
|
10
|
+
def initialize(start_offset, end_offset)
|
11
|
+
@j_instance = self.class.java_class.new(start_offset, end_offset)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/open_nlp/version.rb
CHANGED
data/lib/open_nlp.rb
CHANGED
@@ -4,17 +4,23 @@ require 'java'
|
|
4
4
|
require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
|
5
5
|
require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'
|
6
6
|
|
7
|
+
require 'open_nlp/java_class'
|
8
|
+
|
7
9
|
require 'open_nlp/model'
|
10
|
+
|
8
11
|
require 'open_nlp/model/chunker'
|
9
12
|
require 'open_nlp/model/detokenizer'
|
10
13
|
require 'open_nlp/model/named_entity_detector'
|
11
14
|
require 'open_nlp/model/pos_tagger'
|
12
15
|
require 'open_nlp/model/sentence_detector'
|
13
16
|
require 'open_nlp/model/tokenizer'
|
17
|
+
require 'open_nlp/model/parser'
|
14
18
|
|
15
19
|
require 'open_nlp/tool'
|
16
20
|
require 'open_nlp/named_entity_detector'
|
17
21
|
require 'open_nlp/pos_tagger'
|
18
22
|
require 'open_nlp/sentence_detector'
|
19
23
|
require 'open_nlp/tokenizer'
|
20
|
-
require 'open_nlp/chunker'
|
24
|
+
require 'open_nlp/chunker'
|
25
|
+
require 'open_nlp/parser'
|
26
|
+
require 'open_nlp/parser/parse'
|
data/open_nlp.gemspec
CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |gem|
|
|
9
9
|
gem.authors = ["Hck"]
|
10
10
|
gem.description = %q{JRuby tools wrapper for Apache OpenNLP}
|
11
11
|
gem.summary = %q{A JRuby wrapper for the Apache OpenNLP tools library}
|
12
|
+
gem.homepage = "http://github.com/hck/open_nlp"
|
12
13
|
|
13
14
|
gem.files = `git ls-files`.split($/)
|
14
15
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
data/spec/chunker_spec.rb
CHANGED
@@ -16,6 +16,14 @@ describe OpenNlp::Chunker do
|
|
16
16
|
it "should raise an argument error when no model is supplied" do
|
17
17
|
lambda { subject.new(nil, nil, nil) }.should raise_error(ArgumentError)
|
18
18
|
end
|
19
|
+
|
20
|
+
it "should raise an argument error when no token_model is supplied" do
|
21
|
+
lambda { subject.new(model, nil, nil) }.should raise_error(ArgumentError)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should raise an argument error when no pos_model is supplied" do
|
25
|
+
lambda { subject.new(model, token_model, nil) }.should raise_error(ArgumentError)
|
26
|
+
end
|
19
27
|
end
|
20
28
|
|
21
29
|
describe "chunking a string" do
|
Binary file
|
data/spec/model/chunker_spec.rb
CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Chunker do
|
|
7
7
|
it "should accept a string filename parameter" do
|
8
8
|
chunker_model = subject.new(model_file_name)
|
9
9
|
chunker_model.should be_a(subject)
|
10
|
-
chunker_model.j_model.should be_a(subject.
|
10
|
+
chunker_model.j_model.should be_a(subject.java_class)
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should accept a java.io.FileInputStream object" do
|
14
14
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
15
|
chunker_model = subject.new(file_input_stream)
|
16
16
|
chunker_model.should be_a(subject)
|
17
|
-
chunker_model.j_model.should be_a(subject.
|
17
|
+
chunker_model.j_model.should be_a(subject.java_class)
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should raise an argument error otherwise" do
|
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Detokenizer do
|
|
7
7
|
it "should accept a string filename parameter" do
|
8
8
|
model = subject.new(model_file_name)
|
9
9
|
model.should be_a(subject)
|
10
|
-
model.j_model.should be_a(subject.
|
10
|
+
model.j_model.should be_a(subject.java_class)
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should accept a java.io.FileInputStream object" do
|
14
14
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
15
|
model = subject.new(file_input_stream)
|
16
16
|
model.should be_a(subject)
|
17
|
-
model.j_model.should be_a(subject.
|
17
|
+
model.j_model.should be_a(subject.java_class)
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should raise an argument error otherwise" do
|
@@ -7,14 +7,14 @@ describe OpenNlp::Model::NamedEntityDetector do
|
|
7
7
|
it "should accept a string filename parameter" do
|
8
8
|
model = subject.new(model_file_name)
|
9
9
|
model.should be_a(subject)
|
10
|
-
model.j_model.should be_a(subject.
|
10
|
+
model.j_model.should be_a(subject.java_class)
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should accept a java.io.FileInputStream object" do
|
14
14
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
15
|
model = subject.new(file_input_stream)
|
16
16
|
model.should be_a(subject)
|
17
|
-
model.j_model.should be_a(subject.
|
17
|
+
model.j_model.should be_a(subject.java_class)
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should raise an argument error otherwise" do
|
@@ -7,14 +7,14 @@ describe OpenNlp::Model::POSTagger do
|
|
7
7
|
it "should accept a string filename parameter" do
|
8
8
|
model = subject.new(model_file_name)
|
9
9
|
model.should be_a(subject)
|
10
|
-
model.j_model.should be_a(subject.
|
10
|
+
model.j_model.should be_a(subject.java_class)
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should accept a java.io.FileInputStream object" do
|
14
14
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
15
|
model = subject.new(file_input_stream)
|
16
16
|
model.should be_a(subject)
|
17
|
-
model.j_model.should be_a(subject.
|
17
|
+
model.j_model.should be_a(subject.java_class)
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should raise an argument error otherwise" do
|
@@ -7,14 +7,14 @@ describe OpenNlp::Model::SentenceDetector do
|
|
7
7
|
it "should accept a string filename parameter" do
|
8
8
|
model = subject.new(model_file_name)
|
9
9
|
model.should be_a(subject)
|
10
|
-
model.j_model.should be_a(subject.
|
10
|
+
model.j_model.should be_a(subject.java_class)
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should accept a java.io.FileInputStream object" do
|
14
14
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
15
|
model = subject.new(file_input_stream)
|
16
16
|
model.should be_a(subject)
|
17
|
-
model.j_model.should be_a(subject.
|
17
|
+
model.j_model.should be_a(subject.java_class)
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should raise an argument error otherwise" do
|
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Tokenizer do
|
|
7
7
|
it "should accept a string filename parameter" do
|
8
8
|
model = subject.new(model_file_name)
|
9
9
|
model.should be_a(subject)
|
10
|
-
model.j_model.should be_a(subject.
|
10
|
+
model.j_model.should be_a(subject.java_class)
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should accept a java.io.FileInputStream object" do
|
14
14
|
file_input_stream = java.io.FileInputStream.new(model_file_name)
|
15
15
|
model = subject.new(file_input_stream)
|
16
16
|
model.should be_a(subject)
|
17
|
-
model.j_model.should be_a(subject.
|
17
|
+
model.j_model.should be_a(subject.java_class)
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should raise an argument error otherwise" do
|
@@ -0,0 +1,106 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe OpenNlp::Parser::Parse do
|
4
|
+
subject { OpenNlp::Parser::Parse }
|
5
|
+
let(:text) { 'The red fox sleeps soundly .' }
|
6
|
+
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
|
7
|
+
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
|
8
|
+
|
9
|
+
describe "initialization" do
|
10
|
+
it "should initialize a new parse object" do
|
11
|
+
j_parse = Java::opennlp.tools.parser.Parse.new(
|
12
|
+
text.to_java(:String),
|
13
|
+
Java::opennlp.tools.util.Span.new(0, text.size),
|
14
|
+
Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
|
15
|
+
1.to_java(:Double),
|
16
|
+
0.to_java(:Integer)
|
17
|
+
)
|
18
|
+
|
19
|
+
subject.new(j_parse).should be_a(subject)
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should raise an argument error when no model is supplied" do
|
23
|
+
lambda { subject.new(nil) }.should raise_error(ArgumentError)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe "#tree_bank_string" do
|
28
|
+
it "returns proper string value for parsed text" do
|
29
|
+
parser = OpenNlp::Parser.new(model, token_model)
|
30
|
+
expected = parser.parse(text).tree_bank_string
|
31
|
+
expected.should == "(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))"
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
describe "#code_tree" do
|
36
|
+
it "returns proper structure for parsed text" do
|
37
|
+
parser = OpenNlp::Parser.new(model, token_model)
|
38
|
+
parser.parse(text).code_tree.should == [
|
39
|
+
{
|
40
|
+
:type => "S",
|
41
|
+
:parent_type => "TOP",
|
42
|
+
:token => "The red fox sleeps soundly .",
|
43
|
+
:children => [
|
44
|
+
{
|
45
|
+
:type => "NP",
|
46
|
+
:parent_type => "S",
|
47
|
+
:token => "The red fox",
|
48
|
+
:children => [
|
49
|
+
{
|
50
|
+
:type => "DT",
|
51
|
+
:parent_type => "NP",
|
52
|
+
:token => "The",
|
53
|
+
:children => [{:type => "TK", :parent_type => "DT", :token => "The"}]
|
54
|
+
},
|
55
|
+
{
|
56
|
+
:type => "JJ",
|
57
|
+
:parent_type => "NP",
|
58
|
+
:token => "red",
|
59
|
+
:children => [{:type => "TK", :parent_type => "JJ", :token => "red"}]
|
60
|
+
},
|
61
|
+
{
|
62
|
+
:type => "NN",
|
63
|
+
:parent_type => "NP",
|
64
|
+
:token => "fox",
|
65
|
+
:children => [{:type => "TK", :parent_type => "NN", :token => "fox"}]
|
66
|
+
}
|
67
|
+
]
|
68
|
+
},
|
69
|
+
{
|
70
|
+
:type => "VP",
|
71
|
+
:parent_type => "S",
|
72
|
+
:token => "sleeps soundly",
|
73
|
+
:children => [
|
74
|
+
{
|
75
|
+
:type => "VBZ",
|
76
|
+
:parent_type => "VP",
|
77
|
+
:token => "sleeps",
|
78
|
+
:children => [{:type => "TK", :parent_type => "VBZ", :token => "sleeps"}]
|
79
|
+
},
|
80
|
+
{
|
81
|
+
:type => "ADVP",
|
82
|
+
:parent_type => "VP",
|
83
|
+
:token => "soundly",
|
84
|
+
:children => [
|
85
|
+
{
|
86
|
+
:type => "RB",
|
87
|
+
:parent_type => "ADVP",
|
88
|
+
:token => "soundly",
|
89
|
+
:children => [{:type => "TK", :parent_type => "RB", :token => "soundly"}]
|
90
|
+
}
|
91
|
+
]
|
92
|
+
}
|
93
|
+
]
|
94
|
+
},
|
95
|
+
{
|
96
|
+
:type => ".",
|
97
|
+
:parent_type => "S",
|
98
|
+
:token => ".",
|
99
|
+
:children => [{:type => "TK", :parent_type => ".", :token => "."}]
|
100
|
+
}
|
101
|
+
]
|
102
|
+
}
|
103
|
+
]
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe OpenNlp::Parser do
|
4
|
+
subject { OpenNlp::Parser }
|
5
|
+
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
|
6
|
+
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
|
7
|
+
|
8
|
+
describe "initialization" do
|
9
|
+
it "should initialize a new parser" do
|
10
|
+
parser = subject.new(model, token_model)
|
11
|
+
parser.should be_a(subject)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should raise an argument error when no model is supplied" do
|
15
|
+
lambda { subject.new(nil, nil) }.should raise_error(ArgumentError)
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should raise an argument error when no token_model is supplied" do
|
19
|
+
lambda { subject.new(model, nil) }.should raise_error(ArgumentError)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "parsing a string" do
|
24
|
+
let(:parser) { subject.new(model, token_model) }
|
25
|
+
|
26
|
+
it "should parse an empty string" do
|
27
|
+
parser.parse("").should == {}
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should parse a sentence" do
|
31
|
+
res = parser.parse("The red fox sleeps soundly .")
|
32
|
+
res.class.should == OpenNlp::Parser::Parse
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should raise an error when not passed a string" do
|
36
|
+
lambda { parser.parse(nil) }.should raise_error(ArgumentError)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: open_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.
|
5
|
+
version: 0.0.3
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Hck
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-26 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: JRuby tools wrapper for Apache OpenNLP
|
15
15
|
email:
|
@@ -18,32 +18,39 @@ extensions: []
|
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
20
|
- .gitignore
|
21
|
+
- .rspec
|
21
22
|
- Gemfile
|
22
23
|
- LICENSE.txt
|
23
24
|
- README.md
|
24
25
|
- Rakefile
|
25
26
|
- lib/open_nlp.rb
|
26
27
|
- lib/open_nlp/chunker.rb
|
28
|
+
- lib/open_nlp/java_class.rb
|
27
29
|
- lib/open_nlp/model.rb
|
28
30
|
- lib/open_nlp/model/chunker.rb
|
29
31
|
- lib/open_nlp/model/detokenizer.rb
|
30
32
|
- lib/open_nlp/model/named_entity_detector.rb
|
33
|
+
- lib/open_nlp/model/parser.rb
|
31
34
|
- lib/open_nlp/model/pos_tagger.rb
|
32
35
|
- lib/open_nlp/model/sentence_detector.rb
|
33
36
|
- lib/open_nlp/model/tokenizer.rb
|
34
37
|
- lib/open_nlp/named_entity_detector.rb
|
35
38
|
- lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar
|
36
39
|
- lib/open_nlp/opennlp-tools-1.5.2-incubating.jar
|
40
|
+
- lib/open_nlp/parser.rb
|
41
|
+
- lib/open_nlp/parser/parse.rb
|
37
42
|
- lib/open_nlp/pos_tagger.rb
|
38
43
|
- lib/open_nlp/sentence_detector.rb
|
39
44
|
- lib/open_nlp/tokenizer.rb
|
40
45
|
- lib/open_nlp/tool.rb
|
46
|
+
- lib/open_nlp/utils/span.rb
|
41
47
|
- lib/open_nlp/version.rb
|
42
48
|
- open_nlp.gemspec
|
43
49
|
- spec/chunker_spec.rb
|
44
50
|
- spec/fixtures/en-chunker.bin
|
45
51
|
- spec/fixtures/en-detokenizer.xml
|
46
52
|
- spec/fixtures/en-ner-time.bin
|
53
|
+
- spec/fixtures/en-parser-chunking.bin
|
47
54
|
- spec/fixtures/en-pos-maxent.bin
|
48
55
|
- spec/fixtures/en-sent.bin
|
49
56
|
- spec/fixtures/en-token.bin
|
@@ -54,11 +61,13 @@ files:
|
|
54
61
|
- spec/model/sentence_detector_spec.rb
|
55
62
|
- spec/model/tokenizer_spec.rb
|
56
63
|
- spec/named_entity_detector_spec.rb
|
64
|
+
- spec/parser/parse_spec.rb
|
65
|
+
- spec/parser_spec.rb
|
57
66
|
- spec/pos_tagger_spec.rb
|
58
67
|
- spec/sentence_detector_spec.rb
|
59
68
|
- spec/spec_helper.rb
|
60
69
|
- spec/tokenizer_spec.rb
|
61
|
-
homepage:
|
70
|
+
homepage: http://github.com/hck/open_nlp
|
62
71
|
licenses: []
|
63
72
|
post_install_message:
|
64
73
|
rdoc_options: []
|
@@ -89,6 +98,7 @@ test_files:
|
|
89
98
|
- spec/fixtures/en-chunker.bin
|
90
99
|
- spec/fixtures/en-detokenizer.xml
|
91
100
|
- spec/fixtures/en-ner-time.bin
|
101
|
+
- spec/fixtures/en-parser-chunking.bin
|
92
102
|
- spec/fixtures/en-pos-maxent.bin
|
93
103
|
- spec/fixtures/en-sent.bin
|
94
104
|
- spec/fixtures/en-token.bin
|
@@ -99,6 +109,8 @@ test_files:
|
|
99
109
|
- spec/model/sentence_detector_spec.rb
|
100
110
|
- spec/model/tokenizer_spec.rb
|
101
111
|
- spec/named_entity_detector_spec.rb
|
112
|
+
- spec/parser/parse_spec.rb
|
113
|
+
- spec/parser_spec.rb
|
102
114
|
- spec/pos_tagger_spec.rb
|
103
115
|
- spec/sentence_detector_spec.rb
|
104
116
|
- spec/spec_helper.rb
|