open_nlp 0.0.2-java → 0.0.3-java

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
data/.rspec ADDED
@@ -0,0 +1 @@
+ --colour
data/README.md CHANGED
@@ -6,6 +6,7 @@ A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute co
  * part-of-speech tagging
  * named entity extraction
  * chunks detection
+ * parsing
 
  ## Installation
 
@@ -29,17 +30,20 @@ To use open_nlp classes, you need to require it in your sources
 
  Then you can create instances of open_nlp classes and use it for your nlp tasks
 
- # sentence detection
+ ### Sentence detection
+
  sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
  sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
  sentence_detector.detect('The red fox sleeps soundly.')
 
- # tokenize
+ ### Tokenize
+
  token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
  tokenizer = OpenNlp::Tokenizer.new(token_model)
  tokenizer.tokenize('The red fox sleeps soundly.')
 
- # part-of-speech tagging
+ ### Part-of-speech tagging
+
  pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
  pos_tagger = OpenNlp::POSTagger.new(pos_model)
 
@@ -49,11 +53,31 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
  # to tag array of tokens call OpenNlp::POSTagger#tag with Array argument
  pos_tagger.tag(%w|The red fox sleeps soundly .|)
 
- # chunks detection (chunker also needs tokenizer and pos-tagger models because it uses tokenizing and pos-tagging inside chunk task)
+ ### Chunks detection
+
+ # chunker also needs tokenizer and pos-tagger models
+ # because it uses tokenizing and pos-tagging inside chunk task
  chunk_model = OpenNlp::Model::Chunker.new(File.join("nlp_models/en-chunker.bin"))
  token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
  pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
  chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
+ chunker.chunk('The red fox sleeps soundly.')
+
+ ### Parsing
+
+ # parser also needs tokenizer model because it uses tokenizer inside parse task
+ parse_model = OpenNlp::Model::Parser.new("nlp_models/en-parser-chunking.bin")
+ token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
+ parser = OpenNlp::Parser.new(parse_model, token_model)
+
+ # the result will be an instance of OpenNlp::Parser::Parse
+ parse_info = parser.parse('The red fox sleeps soundly.')
+
+ # you can get the tree bank string by calling
+ parse_info.tree_bank_string
+
+ # you can get the code tree structure of the parse result by calling
+ parse_info.code_tree
 
  ## Contributing
 
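The `#code_tree` call shown in the README above returns an array of nested hashes with `:type`, `:parent_type`, `:token` and optional `:children` keys. A minimal sketch (not part of the gem; `print_code_tree` is an illustrative helper, and `parse_info` is the object produced in the README example) for walking that structure:

    # Recursively print the phrase type and covered text of each node.
    def print_code_tree(nodes, depth = 0)
      nodes.each do |node|
        puts "#{'  ' * depth}#{node[:type]}: #{node[:token]}"
        print_code_tree(node[:children], depth + 1) if node[:children]
      end
    end

    print_code_tree(parse_info.code_tree)
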
@@ -36,5 +36,9 @@ module OpenNlp
  acc
  end
  end
+
+ def get_last_probabilities
+ @j_instance.probs.to_ary
+ end
  end
  end
@@ -0,0 +1,17 @@
+ module OpenNlp
+ module JavaClass
+ def self.included(base)
+ base.extend(ClassMethods)
+ end
+
+ module ClassMethods
+ def java_class=(value)
+ @java_class = value
+ end
+
+ def java_class
+ @java_class
+ end
+ end
+ end
+ end
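The `JavaClass` mixin added above replaces the class-level `java_class_name` accessors that `Model` and `Tool` previously defined individually. A minimal sketch of the pattern (`MyModel` is a hypothetical class used only for illustration):

    # Illustration only: a Model subclass picks up the class-level java_class
    # accessor from the JavaClass mixin included by OpenNlp::Model.
    class MyModel < OpenNlp::Model
      self.java_class = Java::opennlp.tools.tokenize.TokenizerModel
    end

    MyModel.java_class # => Java::opennlp.tools.tokenize.TokenizerModel
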
@@ -1,3 +1,3 @@
  class OpenNlp::Model::Chunker < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.chunker.ChunkerModel
+ self.java_class = Java::opennlp.tools.chunker.ChunkerModel
  end
@@ -1,3 +1,3 @@
  class OpenNlp::Model::Detokenizer < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.tokenize.DetokenizationDictionary
+ self.java_class = Java::opennlp.tools.tokenize.DetokenizationDictionary
  end
@@ -1,3 +1,3 @@
  class OpenNlp::Model::NamedEntityDetector < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.namefind.TokenNameFinderModel
+ self.java_class = Java::opennlp.tools.namefind.TokenNameFinderModel
  end
@@ -0,0 +1,3 @@
+ class OpenNlp::Model::Parser < OpenNlp::Model
+ self.java_class = Java::opennlp.tools.parser.ParserModel
+ end
@@ -1,3 +1,3 @@
  class OpenNlp::Model::POSTagger < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.postag.POSModel
+ self.java_class = Java::opennlp.tools.postag.POSModel
  end
@@ -1,3 +1,3 @@
  class OpenNlp::Model::SentenceDetector < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.sentdetect.SentenceModel
+ self.java_class = Java::opennlp.tools.sentdetect.SentenceModel
  end
@@ -1,3 +1,3 @@
  class OpenNlp::Model::Tokenizer < OpenNlp::Model
- self.java_class_name = Java::opennlp.tools.tokenize.TokenizerModel
+ self.java_class = Java::opennlp.tools.tokenize.TokenizerModel
  end
@@ -1,5 +1,7 @@
  module OpenNlp
  class Model
+ include JavaClass
+
  attr_reader :j_model
 
  def initialize(model)
@@ -12,17 +14,7 @@ module OpenNlp
  raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
  end
 
- @j_model = self.class.java_class_name.new(model_stream)
- end
-
- class << self
- def java_class_name=(value)
- @java_class = value
- end
-
- def java_class_name
- @java_class
- end
+ @j_model = self.class.java_class.new(model_stream)
  end
  end
  end
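As the error message above and the model specs further down indicate, `Model#initialize` accepts either a path string or a `java.io.FileInputStream`. A brief sketch, assuming the model file layout used in the README examples:

    # Both forms construct the same underlying Java model object.
    OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
    OpenNlp::Model::Tokenizer.new(java.io.FileInputStream.new("nlp_models/en-token.bin"))
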
@@ -0,0 +1,54 @@
+ module OpenNlp
+ class Parser::Parse
+ include JavaClass
+
+ attr_reader :j_instance
+
+ self.java_class = Java::opennlp.tools.parser.Parse
+
+ def initialize(java_instance)
+ raise ArgumentError, "java_instance must be an instance of #{self.class.java_class.name}" unless java_instance.is_a?(self.class.java_class)
+
+ @j_instance = java_instance
+ end
+
+ def tree_bank_string
+ span = j_instance.getSpan
+ text = j_instance.getText
+ type = j_instance.getType
+ start = span.getStart
+
+ res = ''
+
+ res << "(#{type} " unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+
+ j_instance.getChildren.each do |c|
+ s = c.span
+ res << text[start..s.getStart-1] if start < s.getStart
+
+ subtree = self.class.new(c).tree_bank_string
+ res << subtree if subtree
+ start = s.getEnd
+ end
+
+ res << text[start..span.getEnd-1] if start < span.getEnd
+
+ res << ")" unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+
+ res
+ end
+
+ def code_tree
+ kids = j_instance.getChildren
+
+ kids.inject([]) do |acc,kid|
+ data = {type: kid.getType, parent_type: self.j_instance.getType, token: kid.toString}
+ subtree = self.class.new(kid).code_tree
+ data[:children] = subtree unless subtree.empty?
+ acc << data
+
+ acc
+ end
+ end
+ end
+ end
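`Parse` objects are normally obtained from `OpenNlp::Parser#parse` rather than constructed by hand. A short inspection sketch, with `parse_info` as produced in the README example and the expected values taken from the spec further below:

    top = parse_info.code_tree.first
    top[:type]                           # => "S"
    top[:children].map { |c| c[:type] }  # => ["NP", "VP", "."]
    parse_info.tree_bank_string          # Penn-Treebank-style bracketing of the sentence
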
@@ -0,0 +1,101 @@
+ module OpenNlp
+ class Parser < Tool
+ def initialize(model, token_model)
+ raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
+ raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
+
+ @j_instance = Java::opennlp.tools.parser.ParserFactory.create(model.j_model)
+
+ @tokenizer = Tokenizer.new(token_model)
+ end
+
+ def parse(text)
+ raise ArgumentError, "str must be a String" unless text.is_a?(String)
+ return {} if text.empty?
+
+ parse_obj = Java::opennlp.tools.parser.Parse.new(
+ text.to_java(:String),
+ Java::opennlp.tools.util.Span.new(0, text.size),
+ Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
+ 1.to_java(:Double), # probability ?
+ 0.to_java(:Integer) # the token index of the head of this parse
+ )
+
+ tokens = @tokenizer.tokenize(text)
+
+ tokens.each_with_index do |tok, i|
+ start = get_token_offset text, tokens, i
+
+ parse_obj.insert Java::opennlp.tools.parser.Parse.new(
+ text.to_java(:String),
+ Java::opennlp.tools.util.Span.new(start, start + tok.size),
+ Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE.to_java(:String),
+ 0.to_java(:Double),
+ i.to_java(:Integer)
+ )
+ end
+
+ #code_tree @j_instance.parse(parse_obj)
+ Parser::Parse.new(@j_instance.parse(parse_obj))
+ end
+
+ private
+ def get_token_offset(text, tokens, index)
+ offset = 0
+
+ for i in (1..index) do
+ offset = text.index tokens[i], offset + tokens[i - 1].size
+ end if index > 0
+
+ offset
+ end
+
+ #def build_tree(parse_obj)
+ # span = parse_obj.getSpan
+ # start = span.getStart
+ # text = parse_obj.getText
+ # type = parse_obj.getType
+ #
+ # res = {}
+ # res[:type] = type unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
+ #
+ # children = parse_obj.getChildren.inject([]) do |acc,c|
+ # s = c.span
+ #
+ # h = {}
+ #
+ # if start < s.getStart
+ # token = text[start..s.getStart-1]
+ # h[:token] = token unless token.strip.empty?
+ # end
+ #
+ # subtree = build_tree(c)
+ # h[:children] = subtree unless subtree.empty?
+ #
+ # start = s.getEnd
+ #
+ # acc << h
+ # acc
+ # end
+ #
+ # res[:token] = text[start..span.getEnd-1] if start < span.getEnd
+ #
+ # res[:children] = children unless children.empty?
+ #
+ # res
+ #end
+
+ def code_tree(parse_obj)
+ kids = parse_obj.getChildren
+
+ kids.inject([]) do |acc,kid|
+ data = {type: kid.getType, parent_type: parse_obj.getType, token: kid.toString}
+ subtree = code_tree(kid)
+ data[:children] = subtree unless subtree.empty?
+ acc << data
+
+ acc
+ end
+ end
+ end
+ end
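A minimal usage sketch of the new parser (the model paths are assumptions following the README convention): an empty string returns `{}`, any other string returns an `OpenNlp::Parser::Parse` wrapping the top-level parse.

    parse_model = OpenNlp::Model::Parser.new("nlp_models/en-parser-chunking.bin")
    token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
    parser      = OpenNlp::Parser.new(parse_model, token_model)

    parser.parse("")                             # => {}
    parser.parse("The red fox sleeps soundly.")  # => OpenNlp::Parser::Parse instance
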
@@ -6,5 +6,10 @@ module OpenNlp
  raise ArgumentError, "str must be a String" unless str.is_a?(String)
  @j_instance.tokenize(str).to_ary
  end
+
+ private
+ def get_last_probabilities
+ @j_instance.getTokenProbabilities.to_ary
+ end
  end
  end
data/lib/open_nlp/tool.rb CHANGED
@@ -1,20 +1,12 @@
  module OpenNlp
  class Tool
+ include JavaClass
+
  attr_reader :j_instance
 
  def initialize(model)
  raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
  @j_instance = self.class.java_class.new(model.j_model)
  end
-
- class << self
- def java_class=(value)
- @java_class = value
- end
-
- def java_class
- @java_class
- end
- end
  end
  end
@@ -0,0 +1,15 @@
+ module OpenNlp
+ module Utils
+ class Span
+ include JavaClass
+
+ self.java_class = Java::opennlp.tools.util.Span
+
+ attr_reader :j_instance
+
+ def initialize(start_offset, end_offset)
+ @j_instance = self.class.java_class.new(start_offset, end_offset)
+ end
+ end
+ end
+ end
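`OpenNlp::Utils::Span` is a thin wrapper that only delegates construction to `opennlp.tools.util.Span`. Note that `lib/open_nlp.rb` does not require it in this version, so an explicit `require 'open_nlp/utils/span'` may be needed. A quick sketch:

    span = OpenNlp::Utils::Span.new(0, 3)
    span.j_instance.getStart # => 0
    span.j_instance.getEnd   # => 3
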
@@ -1,3 +1,3 @@
  module OpenNlp
- VERSION = '0.0.2'
+ VERSION = '0.0.3'
  end
data/lib/open_nlp.rb CHANGED
@@ -4,17 +4,23 @@ require 'java'
  require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
  require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'
 
+ require 'open_nlp/java_class'
+
  require 'open_nlp/model'
+
  require 'open_nlp/model/chunker'
  require 'open_nlp/model/detokenizer'
  require 'open_nlp/model/named_entity_detector'
  require 'open_nlp/model/pos_tagger'
  require 'open_nlp/model/sentence_detector'
  require 'open_nlp/model/tokenizer'
+ require 'open_nlp/model/parser'
 
  require 'open_nlp/tool'
  require 'open_nlp/named_entity_detector'
  require 'open_nlp/pos_tagger'
  require 'open_nlp/sentence_detector'
  require 'open_nlp/tokenizer'
- require 'open_nlp/chunker'
+ require 'open_nlp/chunker'
+ require 'open_nlp/parser'
+ require 'open_nlp/parser/parse'
data/open_nlp.gemspec CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |gem|
  gem.authors = ["Hck"]
  gem.description = %q{JRuby tools wrapper for Apache OpenNLP}
  gem.summary = %q{A JRuby wrapper for the Apache OpenNLP tools library}
+ gem.homepage = "http://github.com/hck/open_nlp"
 
  gem.files = `git ls-files`.split($/)
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
data/spec/chunker_spec.rb CHANGED
@@ -16,6 +16,14 @@ describe OpenNlp::Chunker do
  it "should raise an argument error when no model is supplied" do
  lambda { subject.new(nil, nil, nil) }.should raise_error(ArgumentError)
  end
+
+ it "should raise an argument error when no token_model is supplied" do
+ lambda { subject.new(model, nil, nil) }.should raise_error(ArgumentError)
+ end
+
+ it "should raise an argument error when no pos_model is supplied" do
+ lambda { subject.new(model, token_model, nil) }.should raise_error(ArgumentError)
+ end
  end
 
  describe "chunking a string" do
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Chunker do
  it "should accept a string filename parameter" do
  chunker_model = subject.new(model_file_name)
  chunker_model.should be_a(subject)
- chunker_model.j_model.should be_a(subject.java_class_name)
+ chunker_model.j_model.should be_a(subject.java_class)
  end
 
  it "should accept a java.io.FileInputStream object" do
  file_input_stream = java.io.FileInputStream.new(model_file_name)
  chunker_model = subject.new(file_input_stream)
  chunker_model.should be_a(subject)
- chunker_model.j_model.should be_a(subject.java_class_name)
+ chunker_model.j_model.should be_a(subject.java_class)
  end
 
  it "should raise an argument error otherwise" do
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Detokenizer do
  it "should accept a string filename parameter" do
  model = subject.new(model_file_name)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should accept a java.io.FileInputStream object" do
  file_input_stream = java.io.FileInputStream.new(model_file_name)
  model = subject.new(file_input_stream)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should raise an argument error otherwise" do
@@ -7,14 +7,14 @@ describe OpenNlp::Model::NamedEntityDetector do
  it "should accept a string filename parameter" do
  model = subject.new(model_file_name)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should accept a java.io.FileInputStream object" do
  file_input_stream = java.io.FileInputStream.new(model_file_name)
  model = subject.new(file_input_stream)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should raise an argument error otherwise" do
@@ -7,14 +7,14 @@ describe OpenNlp::Model::POSTagger do
  it "should accept a string filename parameter" do
  model = subject.new(model_file_name)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should accept a java.io.FileInputStream object" do
  file_input_stream = java.io.FileInputStream.new(model_file_name)
  model = subject.new(file_input_stream)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should raise an argument error otherwise" do
@@ -7,14 +7,14 @@ describe OpenNlp::Model::SentenceDetector do
  it "should accept a string filename parameter" do
  model = subject.new(model_file_name)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should accept a java.io.FileInputStream object" do
  file_input_stream = java.io.FileInputStream.new(model_file_name)
  model = subject.new(file_input_stream)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should raise an argument error otherwise" do
@@ -7,14 +7,14 @@ describe OpenNlp::Model::Tokenizer do
  it "should accept a string filename parameter" do
  model = subject.new(model_file_name)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should accept a java.io.FileInputStream object" do
  file_input_stream = java.io.FileInputStream.new(model_file_name)
  model = subject.new(file_input_stream)
  model.should be_a(subject)
- model.j_model.should be_a(subject.java_class_name)
+ model.j_model.should be_a(subject.java_class)
  end
 
  it "should raise an argument error otherwise" do
@@ -0,0 +1,106 @@
+ require "spec_helper"
+
+ describe OpenNlp::Parser::Parse do
+ subject { OpenNlp::Parser::Parse }
+ let(:text) { 'The red fox sleeps soundly .' }
+ let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
+ let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+
+ describe "initialization" do
+ it "should initialize a new parse object" do
+ j_parse = Java::opennlp.tools.parser.Parse.new(
+ text.to_java(:String),
+ Java::opennlp.tools.util.Span.new(0, text.size),
+ Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
+ 1.to_java(:Double),
+ 0.to_java(:Integer)
+ )
+
+ subject.new(j_parse).should be_a(subject)
+ end
+
+ it "should raise an argument error when no model is supplied" do
+ lambda { subject.new(nil) }.should raise_error(ArgumentError)
+ end
+ end
+
+ describe "#tree_bank_string" do
+ it "returns proper string value for parsed text" do
+ parser = OpenNlp::Parser.new(model, token_model)
+ expected = parser.parse(text).tree_bank_string
+ expected.should == "(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))"
+ end
+ end
+
+ describe "#code_tree" do
+ it "returns proper structure for parsed text" do
+ parser = OpenNlp::Parser.new(model, token_model)
+ parser.parse(text).code_tree.should == [
+ {
+ :type => "S",
+ :parent_type => "TOP",
+ :token => "The red fox sleeps soundly .",
+ :children => [
+ {
+ :type => "NP",
+ :parent_type => "S",
+ :token => "The red fox",
+ :children => [
+ {
+ :type => "DT",
+ :parent_type => "NP",
+ :token => "The",
+ :children => [{:type => "TK", :parent_type => "DT", :token => "The"}]
+ },
+ {
+ :type => "JJ",
+ :parent_type => "NP",
+ :token => "red",
+ :children => [{:type => "TK", :parent_type => "JJ", :token => "red"}]
+ },
+ {
+ :type => "NN",
+ :parent_type => "NP",
+ :token => "fox",
+ :children => [{:type => "TK", :parent_type => "NN", :token => "fox"}]
+ }
+ ]
+ },
+ {
+ :type => "VP",
+ :parent_type => "S",
+ :token => "sleeps soundly",
+ :children => [
+ {
+ :type => "VBZ",
+ :parent_type => "VP",
+ :token => "sleeps",
+ :children => [{:type => "TK", :parent_type => "VBZ", :token => "sleeps"}]
+ },
+ {
+ :type => "ADVP",
+ :parent_type => "VP",
+ :token => "soundly",
+ :children => [
+ {
+ :type => "RB",
+ :parent_type => "ADVP",
+ :token => "soundly",
+ :children => [{:type => "TK", :parent_type => "RB", :token => "soundly"}]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ :type => ".",
+ :parent_type => "S",
+ :token => ".",
+ :children => [{:type => "TK", :parent_type => ".", :token => "."}]
+ }
+ ]
+ }
+ ]
+ end
+ end
+ end
@@ -0,0 +1,39 @@
+ require "spec_helper"
+
+ describe OpenNlp::Parser do
+ subject { OpenNlp::Parser }
+ let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin")) }
+ let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, "en-token.bin")) }
+
+ describe "initialization" do
+ it "should initialize a new parser" do
+ parser = subject.new(model, token_model)
+ parser.should be_a(subject)
+ end
+
+ it "should raise an argument error when no model is supplied" do
+ lambda { subject.new(nil, nil) }.should raise_error(ArgumentError)
+ end
+
+ it "should raise an argument error when no token_model is supplied" do
+ lambda { subject.new(model, nil) }.should raise_error(ArgumentError)
+ end
+ end
+
+ describe "parsing a string" do
+ let(:parser) { subject.new(model, token_model) }
+
+ it "should parse an empty string" do
+ parser.parse("").should == {}
+ end
+
+ it "should parse a sentence" do
+ res = parser.parse("The red fox sleeps soundly .")
+ res.class.should == OpenNlp::Parser::Parse
+ end
+
+ it "should raise an error when not passed a string" do
+ lambda { parser.parse(nil) }.should raise_error(ArgumentError)
+ end
+ end
+ end
metadata CHANGED
@@ -2,14 +2,14 @@
  name: open_nlp
  version: !ruby/object:Gem::Version
  prerelease:
- version: 0.0.2
+ version: 0.0.3
  platform: java
  authors:
  - Hck
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-09-24 00:00:00.000000000 Z
+ date: 2012-09-26 00:00:00.000000000 Z
  dependencies: []
  description: JRuby tools wrapper for Apache OpenNLP
  email:
@@ -18,32 +18,39 @@ extensions: []
  extra_rdoc_files: []
  files:
  - .gitignore
+ - .rspec
  - Gemfile
  - LICENSE.txt
  - README.md
  - Rakefile
  - lib/open_nlp.rb
  - lib/open_nlp/chunker.rb
+ - lib/open_nlp/java_class.rb
  - lib/open_nlp/model.rb
  - lib/open_nlp/model/chunker.rb
  - lib/open_nlp/model/detokenizer.rb
  - lib/open_nlp/model/named_entity_detector.rb
+ - lib/open_nlp/model/parser.rb
  - lib/open_nlp/model/pos_tagger.rb
  - lib/open_nlp/model/sentence_detector.rb
  - lib/open_nlp/model/tokenizer.rb
  - lib/open_nlp/named_entity_detector.rb
  - lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar
  - lib/open_nlp/opennlp-tools-1.5.2-incubating.jar
+ - lib/open_nlp/parser.rb
+ - lib/open_nlp/parser/parse.rb
  - lib/open_nlp/pos_tagger.rb
  - lib/open_nlp/sentence_detector.rb
  - lib/open_nlp/tokenizer.rb
  - lib/open_nlp/tool.rb
+ - lib/open_nlp/utils/span.rb
  - lib/open_nlp/version.rb
  - open_nlp.gemspec
  - spec/chunker_spec.rb
  - spec/fixtures/en-chunker.bin
  - spec/fixtures/en-detokenizer.xml
  - spec/fixtures/en-ner-time.bin
+ - spec/fixtures/en-parser-chunking.bin
  - spec/fixtures/en-pos-maxent.bin
  - spec/fixtures/en-sent.bin
  - spec/fixtures/en-token.bin
@@ -54,11 +61,13 @@ files:
  - spec/model/sentence_detector_spec.rb
  - spec/model/tokenizer_spec.rb
  - spec/named_entity_detector_spec.rb
+ - spec/parser/parse_spec.rb
+ - spec/parser_spec.rb
  - spec/pos_tagger_spec.rb
  - spec/sentence_detector_spec.rb
  - spec/spec_helper.rb
  - spec/tokenizer_spec.rb
- homepage:
+ homepage: http://github.com/hck/open_nlp
  licenses: []
  post_install_message:
  rdoc_options: []
@@ -89,6 +98,7 @@ test_files:
  - spec/fixtures/en-chunker.bin
  - spec/fixtures/en-detokenizer.xml
  - spec/fixtures/en-ner-time.bin
+ - spec/fixtures/en-parser-chunking.bin
  - spec/fixtures/en-pos-maxent.bin
  - spec/fixtures/en-sent.bin
  - spec/fixtures/en-token.bin
@@ -99,6 +109,8 @@ test_files:
  - spec/model/sentence_detector_spec.rb
  - spec/model/tokenizer_spec.rb
  - spec/named_entity_detector_spec.rb
+ - spec/parser/parse_spec.rb
+ - spec/parser_spec.rb
  - spec/pos_tagger_spec.rb
  - spec/sentence_detector_spec.rb
  - spec/spec_helper.rb