open_nlp 0.0.2-java → 0.0.3-java

data/.rspec ADDED
@@ -0,0 +1 @@
+ --colour
data/README.md CHANGED
@@ -6,6 +6,7 @@ A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute co
 * part-of-speech tagging
 * named entity extraction
 * chunks detection
+ * parsing

 ## Installation

@@ -29,17 +30,20 @@ To use open_nlp classes, you need to require it in your sources

 Then you can create instances of open_nlp classes and use it for your nlp tasks

- # sentence detection
+ ### Sentence detection
+
 sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
 sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
 sentence_detector.detect('The red fox sleeps soundly.')

- # tokenize
+ ### Tokenize
+
 token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
 tokenizer = OpenNlp::Tokenizer.new(token_model)
 tokenizer.tokenize('The red fox sleeps soundly.')

- # part-of-speech tagging
+ ### Part-of-speech tagging
+
 pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
 pos_tagger = OpenNlp::POSTagger.new(pos_model)

@@ -49,11 +53,31 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
 # to tag array of tokens call OpenNlp::POSTagger#tag with Array argument
 pos_tagger.tag(%w|The red fox sleeps soundly .|)

- # chunks detection (chunker also needs tokenizer and pos-tagger models because it uses tokenizing and pos-tagging inside chunk task)
+ ### Chunks detection
+
+ # chunker also needs tokenizer and pos-tagger models
+ # because it uses tokenizing and pos-tagging inside chunk task
 chunk_model = OpenNlp::Model::Chunker.new(File.join("nlp_models/en-chunker.bin"))
 token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
 pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
 chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
+ chunker.chunk('The red fox sleeps soundly.')
+
+ ### Parsing
+
+ # parser also needs tokenizer model because it uses tokenizer inside parse task
+ parse_model = OpenNlp::Model::Parser.new(File.join("nlp_models/en-parser-chunking.bin"))
+ token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
+ parser = OpenNlp::Parser.new(parse_model, token_model)
+
+ # the result will be an instance of OpenNlp::Parser::Parse
+ parse_info = parser.parse('The red fox sleeps soundly.')
+
+ # you can get tree bank string by calling
+ parse_info.tree_bank_string
+
+ # you can get code tree structure of parse result by calling
+ parse_info.code_tree

 ## Contributing

@@ -36,5 +36,9 @@ module OpenNlp
 acc
 end
 end
+
+ def get_last_probabilities
+ @j_instance.probs.to_ary
+ end
 end
 end
data/lib/open_nlp/java_class.rb ADDED
@@ -0,0 +1,17 @@
+ module OpenNlp
+ module JavaClass
+ def self.included(base)
+ base.extend(ClassMethods)
+ end
+
+ module ClassMethods
+ def java_class=(value)
+ @java_class = value
+ end
+
+ def java_class
+ @java_class
+ end
+ end
+ end
+ end
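The JavaClass mixin consolidates the per-class java_class_name / java_class accessors that Model and Tool each defined for themselves before this release (see the model.rb and tool.rb hunks below). A minimal sketch of the pattern as the model subclasses below use it, with the README's example model path:

    class OpenNlp::Model::SentenceDetector < OpenNlp::Model
      self.java_class = Java::opennlp.tools.sentdetect.SentenceModel
    end

    # OpenNlp::Model#initialize now calls self.class.java_class.new(model_stream),
    # so j_model wraps an opennlp.tools.sentdetect.SentenceModel instance
    model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
    model.j_model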
data/lib/open_nlp/model/chunker.rb CHANGED
@@ -1,3 +1,3 @@
 class OpenNlp::Model::Chunker < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.chunker.ChunkerModel
+ self.java_class = Java::opennlp.tools.chunker.ChunkerModel
 end
data/lib/open_nlp/model/detokenizer.rb CHANGED
@@ -1,3 +1,3 @@
 class OpenNlp::Model::Detokenizer < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.tokenize.DetokenizationDictionary
+ self.java_class = Java::opennlp.tools.tokenize.DetokenizationDictionary
 end
data/lib/open_nlp/model/named_entity_detector.rb CHANGED
@@ -1,3 +1,3 @@
 class OpenNlp::Model::NamedEntityDetector < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.namefind.TokenNameFinderModel
+ self.java_class = Java::opennlp.tools.namefind.TokenNameFinderModel
 end
data/lib/open_nlp/model/parser.rb ADDED
@@ -0,0 +1,3 @@
+ class OpenNlp::Model::Parser < OpenNlp::Model
+ self.java_class = Java::opennlp.tools.parser.ParserModel
+ end
data/lib/open_nlp/model/pos_tagger.rb CHANGED
@@ -1,3 +1,3 @@
 class OpenNlp::Model::POSTagger < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.postag.POSModel
+ self.java_class = Java::opennlp.tools.postag.POSModel
 end
data/lib/open_nlp/model/sentence_detector.rb CHANGED
@@ -1,3 +1,3 @@
 class OpenNlp::Model::SentenceDetector < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.sentdetect.SentenceModel
+ self.java_class = Java::opennlp.tools.sentdetect.SentenceModel
 end
data/lib/open_nlp/model/tokenizer.rb CHANGED
@@ -1,3 +1,3 @@
 class OpenNlp::Model::Tokenizer < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.tokenize.TokenizerModel
+ self.java_class = Java::opennlp.tools.tokenize.TokenizerModel
 end
data/lib/open_nlp/model.rb CHANGED
@@ -1,5 +1,7 @@
 module OpenNlp
 class Model
+ include JavaClass
+
 attr_reader :j_model

 def initialize(model)
@@ -12,17 +14,7 @@ module OpenNlp
 raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
 end

- @j_model = self.class.java_class_name.new(model_stream)
- end
-
- class << self
- def java_class_name=(value)
- @java_class = value
- end
-
- def java_class_name
- @java_class
- end
+ @j_model = self.class.java_class.new(model_stream)
 end
 end
 end
data/lib/open_nlp/parser/parse.rb ADDED
@@ -0,0 +1,54 @@
+ module OpenNlp
+ class Parser::Parse
+ include JavaClass
+
+ attr_reader :j_instance
+
+ self.java_class = Java::opennlp.tools.parser.Parse
+
+ def initialize(java_instance)
+ raise ArgumentError, "java_instance must be an instance of #{self.class.java_class.name}" unless java_instance.is_a?(self.class.java_class)
+
+ @j_instance = java_instance
+ end
+
+ def tree_bank_string
+ span = j_instance.getSpan
+ text = j_instance.getText
+ type = j_instance.getType
+ start = span.getStart
+
+ res = ''
+
+ res << "(#{type} " unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+
+ j_instance.getChildren.each do |c|
+ s = c.span
+ res << text[start..s.getStart-1] if start < s.getStart
+
+ subtree = self.class.new(c).tree_bank_string
+ res << subtree if subtree
+ start = s.getEnd
+ end
+
+ res << text[start..span.getEnd-1] if start < span.getEnd
+
+ res << ")" unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+
+ res
+ end
+
+ def code_tree
+ kids = j_instance.getChildren
+
+ kids.inject([]) do |acc,kid|
+ data = {type: kid.getType, parent_type: self.j_instance.getType, token: kid.toString}
+ subtree = self.class.new(kid).code_tree
+ data[:children] = subtree unless subtree.empty?
+ acc << data
+
+ acc
+ end
+ end
+ end
+ end
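Given a parser built as in the README above, the new spec (spec/parser/parse_spec.rb below) pins down what these two methods return for the sentence 'The red fox sleeps soundly .'; roughly:

    parse_info = parser.parse('The red fox sleeps soundly .')

    parse_info.tree_bank_string
    # => "(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))"

    parse_info.code_tree
    # => [{:type => "S", :parent_type => "TOP", :token => "The red fox sleeps soundly .",
    #      :children => [...nested NP / VP / . subtrees, down to "TK" token nodes...]}]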
data/lib/open_nlp/parser.rb ADDED
@@ -0,0 +1,101 @@
+ module OpenNlp
+ class Parser < Tool
+ def initialize(model, token_model)
+ raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
+ raise ArgumentError, "token_model must be an OpenNlp::Model::Tokenizer" unless token_model.is_a?(Model::Tokenizer)
+
+ @j_instance = Java::opennlp.tools.parser.ParserFactory.create(model.j_model)
+
+ @tokenizer = Tokenizer.new(token_model)
+ end
+
+ def parse(text)
+ raise ArgumentError, "text must be a String" unless text.is_a?(String)
+ return {} if text.empty?
+
+ parse_obj = Java::opennlp.tools.parser.Parse.new(
+ text.to_java(:String),
+ Java::opennlp.tools.util.Span.new(0, text.size),
+ Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
+ 1.to_java(:Double), # probability ?
+ 0.to_java(:Integer) # the token index of the head of this parse
+ )
+
+ tokens = @tokenizer.tokenize(text)
+
+ tokens.each_with_index do |tok, i|
+ start = get_token_offset text, tokens, i
+
+ parse_obj.insert Java::opennlp.tools.parser.Parse.new(
+ text.to_java(:String),
+ Java::opennlp.tools.util.Span.new(start, start + tok.size),
+ Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE.to_java(:String),
+ 0.to_java(:Double),
+ i.to_java(:Integer)
+ )
+ end
+
+ #code_tree @j_instance.parse(parse_obj)
+ Parser::Parse.new(@j_instance.parse(parse_obj))
+ end
+
+ private
+ def get_token_offset(text, tokens, index)
+ offset = 0
+
+ for i in (1..index) do
+ offset = text.index tokens[i], offset + tokens[i - 1].size
+ end if index > 0
+
+ offset
+ end
+
+ #def build_tree(parse_obj)
+ # span = parse_obj.getSpan
+ # start = span.getStart
+ # text = parse_obj.getText
+ # type = parse_obj.getType
+ #
+ # res = {}
+ # res[:type] = type unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+ #
+ # children = parse_obj.getChildren.inject([]) do |acc,c|
+ # s = c.span
+ #
+ # h = {}
+ #
+ # if start < s.getStart
+ # token = text[start..s.getStart-1]
+ # h[:token] = token unless token.strip.empty?
+ # end
+ #
+ # subtree = build_tree(c)
+ # h[:children] = subtree unless subtree.empty?
+ #
+ # start = s.getEnd
+ #
+ # acc << h
+ # acc
+ # end
+ #
+ # res[:token] = text[start..span.getEnd-1] if start < span.getEnd
+ #
+ # res[:children] = children unless children.empty?
+ #
+ # res
+ #end
+
+ def code_tree(parse_obj)
+ kids = parse_obj.getChildren
+
+ kids.inject([]) do |acc,kid|
+ data = {type: kid.getType, parent_type: parse_obj.getType, token: kid.toString}
+ subtree = code_tree(kid)
+ data[:children] = subtree unless subtree.empty?
+ acc << data
+
+ acc
+ end
+ end
+ end
+ end
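To make the private get_token_offset above concrete: it returns the character offset of tokens[index] within text, scanning forward from the end of the previous token so repeated words resolve to the correct occurrence. A standalone sketch of the same scan in plain Ruby (token_offset is a hypothetical copy of the method, not part of the gem):

    # Character offset of tokens[index] in text, scanning forward
    # from the end of the previous token.
    def token_offset(text, tokens, index)
      offset = 0
      (1..index).each do |i|
        offset = text.index(tokens[i], offset + tokens[i - 1].size)
      end
      offset
    end

    tokens = %w[The red fox sleeps soundly .]
    token_offset('The red fox sleeps soundly .', tokens, 0) # => 0  ("The")
    token_offset('The red fox sleeps soundly .', tokens, 2) # => 8  ("fox")
    token_offset('The red fox sleeps soundly .', tokens, 5) # => 27 (".")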
data/lib/open_nlp/tokenizer.rb CHANGED
@@ -6,5 +6,10 @@ module OpenNlp
 raise ArgumentError, "str must be a String" unless str.is_a?(String)
 @j_instance.tokenize(str).to_ary
 end
+
+ private
+ def get_last_probabilities
+ @j_instance.getTokenProbabilities.to_ary
+ end
 end
 end
data/lib/open_nlp/tool.rb CHANGED
@@ -1,20 +1,12 @@
 module OpenNlp
 class Tool
+ include JavaClass
+
 attr_reader :j_instance

 def initialize(model)
 raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
 @j_instance = self.class.java_class.new(model.j_model)
 end
-
- class << self
- def java_class=(value)
- @java_class = value
- end
-
- def java_class
- @java_class
- end
- end
 end
 end
data/lib/open_nlp/utils/span.rb ADDED
@@ -0,0 +1,15 @@
+ module OpenNlp
+ module Utils
+ class Span
+ include JavaClass
+
+ self.java_class = Java::opennlp.tools.util.Span
+
+ attr_reader :j_instance
+
+ def initialize(start_offset, end_offset)
+ @j_instance = self.class.java_class.new(start_offset, end_offset)
+ end
+ end
+ end
+ end
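A short usage sketch for the new wrapper; note the open_nlp.rb hunk below does not add a require for it, so you would load it yourself:

    require 'open_nlp/utils/span'

    span = OpenNlp::Utils::Span.new(0, 3)
    span.j_instance           # a Java::opennlp.tools.util.Span
    span.j_instance.getStart  # => 0
    span.j_instance.getEnd    # => 3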
data/lib/open_nlp/version.rb CHANGED
@@ -1,3 +1,3 @@
 module OpenNlp
- VERSION = '0.0.2'
+ VERSION = '0.0.3'
 end
data/lib/open_nlp.rb CHANGED
@@ -4,17 +4,23 @@ require 'java'
 require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
 require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'

+ require 'open_nlp/java_class'
+
 require 'open_nlp/model'
+
 require 'open_nlp/model/chunker'
 require 'open_nlp/model/detokenizer'
 require 'open_nlp/model/named_entity_detector'
 require 'open_nlp/model/pos_tagger'
 require 'open_nlp/model/sentence_detector'
 require 'open_nlp/model/tokenizer'
+ require 'open_nlp/model/parser'

 require 'open_nlp/tool'
 require 'open_nlp/named_entity_detector'
 require 'open_nlp/pos_tagger'
 require 'open_nlp/sentence_detector'
 require 'open_nlp/tokenizer'
- require 'open_nlp/chunker'
+ require 'open_nlp/chunker'
+ require 'open_nlp/parser'
+ require 'open_nlp/parser/parse'
data/open_nlp.gemspec CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |gem|
 gem.authors = ["Hck"]
 gem.description = %q{JRuby tools wrapper for Apache OpenNLP}
 gem.summary = %q{A JRuby wrapper for the Apache OpenNLP tools library}
+ gem.homepage = "http://github.com/hck/open_nlp"

 gem.files = `git ls-files`.split($/)
 gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
data/spec/chunker_spec.rb CHANGED
@@ -16,6 +16,14 @@ describe OpenNlp::Chunker do
 it "should raise an argument error when no model is supplied" do
 lambda { subject.new(nil, nil, nil) }.should raise_error(ArgumentError)
 end
+
+ it "should raise an argument error when no token_model is supplied" do
+ lambda { subject.new(model, nil, nil) }.should raise_error(ArgumentError)
+ end
+
+ it "should raise an argument error when no pos_model is supplied" do
+ lambda { subject.new(model, token_model, nil) }.should raise_error(ArgumentError)
+ end
 end

 describe "chunking a string" do
data/spec/model/chunker_spec.rb CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Chunker do
 it "should accept a string filename parameter" do
 chunker_model = subject.new(model_file_name)
 chunker_model.should be_a(subject)
- chunker_model.j_model.should be_a(subject.java_class_name)
+ chunker_model.j_model.should be_a(subject.java_class)
 end

 it "should accept a java.io.FileInputStream object" do
 file_input_stream = java.io.FileInputStream.new(model_file_name)
 chunker_model = subject.new(file_input_stream)
 chunker_model.should be_a(subject)
- chunker_model.j_model.should be_a(subject.java_class_name)
+ chunker_model.j_model.should be_a(subject.java_class)
 end

 it "should raise an argument error otherwise" do
data/spec/model/detokenizer_spec.rb CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Detokenizer do
 it "should accept a string filename parameter" do
 model = subject.new(model_file_name)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should accept a java.io.FileInputStream object" do
 file_input_stream = java.io.FileInputStream.new(model_file_name)
 model = subject.new(file_input_stream)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should raise an argument error otherwise" do
data/spec/model/named_entity_detector_spec.rb CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::NamedEntityDetector do
 it "should accept a string filename parameter" do
 model = subject.new(model_file_name)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should accept a java.io.FileInputStream object" do
 file_input_stream = java.io.FileInputStream.new(model_file_name)
 model = subject.new(file_input_stream)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should raise an argument error otherwise" do
data/spec/model/pos_tagger_spec.rb CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::POSTagger do
 it "should accept a string filename parameter" do
 model = subject.new(model_file_name)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should accept a java.io.FileInputStream object" do
 file_input_stream = java.io.FileInputStream.new(model_file_name)
 model = subject.new(file_input_stream)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should raise an argument error otherwise" do
data/spec/model/sentence_detector_spec.rb CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::SentenceDetector do
 it "should accept a string filename parameter" do
 model = subject.new(model_file_name)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should accept a java.io.FileInputStream object" do
 file_input_stream = java.io.FileInputStream.new(model_file_name)
 model = subject.new(file_input_stream)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should raise an argument error otherwise" do
data/spec/model/tokenizer_spec.rb CHANGED
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Tokenizer do
 it "should accept a string filename parameter" do
 model = subject.new(model_file_name)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should accept a java.io.FileInputStream object" do
 file_input_stream = java.io.FileInputStream.new(model_file_name)
 model = subject.new(file_input_stream)
 model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
 end

 it "should raise an argument error otherwise" do
data/spec/parser/parse_spec.rb ADDED
@@ -0,0 +1,106 @@
+ require "spec_helper"
+
+ describe OpenNlp::Parser::Parse do
+ subject { OpenNlp::Parser::Parse }
+ let(:text) { 'The red fox sleeps soundly .' }
+ let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
+ let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+
+ describe "initialization" do
+ it "should initialize a new parse object" do
+ j_parse = Java::opennlp.tools.parser.Parse.new(
+ text.to_java(:String),
+ Java::opennlp.tools.util.Span.new(0, text.size),
+ Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
+ 1.to_java(:Double),
+ 0.to_java(:Integer)
+ )
+
+ subject.new(j_parse).should be_a(subject)
+ end
+
+ it "should raise an argument error when no model is supplied" do
+ lambda { subject.new(nil) }.should raise_error(ArgumentError)
+ end
+ end
+
+ describe "#tree_bank_string" do
+ it "returns proper string value for parsed text" do
+ parser = OpenNlp::Parser.new(model, token_model)
+ expected = parser.parse(text).tree_bank_string
+ expected.should == "(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))"
+ end
+ end
+
+ describe "#code_tree" do
+ it "returns proper structure for parsed text" do
+ parser = OpenNlp::Parser.new(model, token_model)
+ parser.parse(text).code_tree.should == [
+ {
+ :type => "S",
+ :parent_type => "TOP",
+ :token => "The red fox sleeps soundly .",
+ :children => [
+ {
+ :type => "NP",
+ :parent_type => "S",
+ :token => "The red fox",
+ :children => [
+ {
+ :type => "DT",
+ :parent_type => "NP",
+ :token => "The",
+ :children => [{:type => "TK", :parent_type => "DT", :token => "The"}]
+ },
+ {
+ :type => "JJ",
+ :parent_type => "NP",
+ :token => "red",
+ :children => [{:type => "TK", :parent_type => "JJ", :token => "red"}]
+ },
+ {
+ :type => "NN",
+ :parent_type => "NP",
+ :token => "fox",
+ :children => [{:type => "TK", :parent_type => "NN", :token => "fox"}]
+ }
+ ]
+ },
+ {
+ :type => "VP",
+ :parent_type => "S",
+ :token => "sleeps soundly",
+ :children => [
+ {
+ :type => "VBZ",
+ :parent_type => "VP",
+ :token => "sleeps",
+ :children => [{:type => "TK", :parent_type => "VBZ", :token => "sleeps"}]
+ },
+ {
+ :type => "ADVP",
+ :parent_type => "VP",
+ :token => "soundly",
+ :children => [
+ {
+ :type => "RB",
+ :parent_type => "ADVP",
+ :token => "soundly",
+ :children => [{:type => "TK", :parent_type => "RB", :token => "soundly"}]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ :type => ".",
+ :parent_type => "S",
+ :token => ".",
+ :children => [{:type => "TK", :parent_type => ".", :token => "."}]
+ }
+ ]
+ }
+ ]
+ end
+ end
+ end
data/spec/parser_spec.rb ADDED
@@ -0,0 +1,39 @@
+ require "spec_helper"
+
+ describe OpenNlp::Parser do
+ subject { OpenNlp::Parser }
+ let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
+ let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+
+ describe "initialization" do
+ it "should initialize a new parser" do
+ parser = subject.new(model, token_model)
+ parser.should be_a(subject)
+ end
+
+ it "should raise an argument error when no model is supplied" do
+ lambda { subject.new(nil, nil) }.should raise_error(ArgumentError)
+ end
+
+ it "should raise an argument error when no token_model is supplied" do
+ lambda { subject.new(model, nil) }.should raise_error(ArgumentError)
+ end
+ end
+
+ describe "parsing a string" do
+ let(:parser) { subject.new(model, token_model) }
+
+ it "should parse an empty string" do
+ parser.parse("").should == {}
+ end
+
+ it "should parse a sentence" do
+ res = parser.parse("The red fox sleeps soundly .")
+ res.class.should == OpenNlp::Parser::Parse
+ end
+
+ it "should raise an error when not passed a string" do
+ lambda { parser.parse(nil) }.should raise_error(ArgumentError)
+ end
+ end
+ end
metadata CHANGED
@@ -2,14 +2,14 @@
 name: open_nlp
 version: !ruby/object:Gem::Version
 prerelease:
- version: 0.0.2
+ version: 0.0.3
 platform: java
 authors:
 - Hck
 autorequire:
 bindir: bin
 cert_chain: []
- date: 2012-09-24 00:00:00.000000000 Z
+ date: 2012-09-26 00:00:00.000000000 Z
 dependencies: []
 description: JRuby tools wrapper for Apache OpenNLP
 email:
@@ -18,32 +18,39 @@ extensions: []
 extra_rdoc_files: []
 files:
 - .gitignore
+ - .rspec
 - Gemfile
 - LICENSE.txt
 - README.md
 - Rakefile
 - lib/open_nlp.rb
 - lib/open_nlp/chunker.rb
+ - lib/open_nlp/java_class.rb
 - lib/open_nlp/model.rb
 - lib/open_nlp/model/chunker.rb
 - lib/open_nlp/model/detokenizer.rb
 - lib/open_nlp/model/named_entity_detector.rb
+ - lib/open_nlp/model/parser.rb
 - lib/open_nlp/model/pos_tagger.rb
 - lib/open_nlp/model/sentence_detector.rb
 - lib/open_nlp/model/tokenizer.rb
 - lib/open_nlp/named_entity_detector.rb
 - lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar
 - lib/open_nlp/opennlp-tools-1.5.2-incubating.jar
+ - lib/open_nlp/parser.rb
+ - lib/open_nlp/parser/parse.rb
 - lib/open_nlp/pos_tagger.rb
 - lib/open_nlp/sentence_detector.rb
 - lib/open_nlp/tokenizer.rb
 - lib/open_nlp/tool.rb
+ - lib/open_nlp/utils/span.rb
 - lib/open_nlp/version.rb
 - open_nlp.gemspec
 - spec/chunker_spec.rb
 - spec/fixtures/en-chunker.bin
 - spec/fixtures/en-detokenizer.xml
 - spec/fixtures/en-ner-time.bin
+ - spec/fixtures/en-parser-chunking.bin
 - spec/fixtures/en-pos-maxent.bin
 - spec/fixtures/en-sent.bin
 - spec/fixtures/en-token.bin
@@ -54,11 +61,13 @@ files:
 - spec/model/sentence_detector_spec.rb
 - spec/model/tokenizer_spec.rb
 - spec/named_entity_detector_spec.rb
+ - spec/parser/parse_spec.rb
+ - spec/parser_spec.rb
 - spec/pos_tagger_spec.rb
 - spec/sentence_detector_spec.rb
 - spec/spec_helper.rb
 - spec/tokenizer_spec.rb
- homepage:
+ homepage: http://github.com/hck/open_nlp
 licenses: []
 post_install_message:
 rdoc_options: []
@@ -89,6 +98,7 @@ test_files:
 - spec/fixtures/en-chunker.bin
 - spec/fixtures/en-detokenizer.xml
 - spec/fixtures/en-ner-time.bin
+ - spec/fixtures/en-parser-chunking.bin
 - spec/fixtures/en-pos-maxent.bin
 - spec/fixtures/en-sent.bin
 - spec/fixtures/en-token.bin
@@ -99,6 +109,8 @@ test_files:
 - spec/model/sentence_detector_spec.rb
 - spec/model/tokenizer_spec.rb
 - spec/named_entity_detector_spec.rb
+ - spec/parser/parse_spec.rb
+ - spec/parser_spec.rb
 - spec/pos_tagger_spec.rb
 - spec/sentence_detector_spec.rb
 - spec/spec_helper.rb