open_nlp 0.0.7-java → 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -2
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile.lock +31 -0
  6. data/README.md +8 -1
  7. data/lib/open_nlp.rb +3 -3
  8. data/lib/open_nlp/categorizer.rb +7 -3
  9. data/lib/open_nlp/chunker.rb +19 -8
  10. data/lib/open_nlp/model.rb +13 -9
  11. data/lib/open_nlp/named_entity_detector.rb +6 -2
  12. data/lib/open_nlp/opennlp-maxent-3.0.3.jar +0 -0
  13. data/lib/open_nlp/opennlp-tools-1.5.3.jar +0 -0
  14. data/lib/open_nlp/parser.rb +43 -33
  15. data/lib/open_nlp/parser/parse.rb +12 -21
  16. data/lib/open_nlp/pos_tagger.rb +5 -2
  17. data/lib/open_nlp/sentence_detector.rb +16 -6
  18. data/lib/open_nlp/tokenizer.rb +8 -3
  19. data/lib/open_nlp/tool.rb +1 -1
  20. data/lib/open_nlp/util.rb +1 -2
  21. data/lib/open_nlp/util/span.rb +5 -5
  22. data/lib/open_nlp/version.rb +1 -1
  23. data/spec/categorizer_spec.rb +24 -22
  24. data/spec/chunker_spec.rb +29 -28
  25. data/spec/model/chunker_spec.rb +12 -15
  26. data/spec/model/detokenizer_spec.rb +11 -14
  27. data/spec/model/named_entity_detector_spec.rb +11 -14
  28. data/spec/model/pos_tagger_spec.rb +12 -15
  29. data/spec/model/sentence_detector_spec.rb +11 -14
  30. data/spec/model/tokenizer_spec.rb +11 -14
  31. data/spec/named_entity_detector_spec.rb +28 -27
  32. data/spec/parser/parse_spec.rb +64 -56
  33. data/spec/parser_spec.rb +26 -21
  34. data/spec/pos_tagger_spec.rb +22 -23
  35. data/spec/sentence_detector_spec.rb +39 -30
  36. data/spec/spec_helper.rb +1 -1
  37. data/spec/tokenizer_spec.rb +26 -22
  38. metadata +16 -17
  39. data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
  40. data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 57b451e151cf3a0ed7d28b9451f48b7ec7a31288
4
+ data.tar.gz: fc5e0b61f0baf6673a4a3dbc05e4c7828d2a935c
5
+ SHA512:
6
+ metadata.gz: 420670f2e006071f47ba5d18f6152427ba49745b2647a7b5e7520acc987575ce02cc0e3cecbd3417f65d7d4c30dfe63740ac9c422a1f43d9a0ed5bb1b6a572a1
7
+ data.tar.gz: 059bf4f29726f027b634d802cb8f4ff1b1cb9b04ef6315a4d5e77d44098e1eed1a651edd97cc7d37a6ed4b91a6a6e03bb571aef45e996919fcfbec481ce5f269
data/.gitignore CHANGED
@@ -3,7 +3,6 @@
3
3
  .bundle
4
4
  .config
5
5
  .yardoc
6
- Gemfile.lock
7
6
  InstalledFiles
8
7
  _yardoc
9
8
  coverage
@@ -15,4 +14,5 @@ spec/reports
15
14
  test/tmp
16
15
  test/version_tmp
17
16
  tmp
18
- .idea
17
+ .idea
18
+ .gems
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.0.5.0
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - jruby-19mode
4
+ - jruby-1.7.20
5
+ - jruby-9.0.5.0
6
+ script: JRUBY_OPTS=-J-Xmx768m bundle exec rspec spec
data/Gemfile.lock ADDED
@@ -0,0 +1,31 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+
5
+ GEM
6
+ remote: https://rubygems.org/
7
+ specs:
8
+ diff-lcs (1.2.5)
9
+ rspec (3.4.0)
10
+ rspec-core (~> 3.4.0)
11
+ rspec-expectations (~> 3.4.0)
12
+ rspec-mocks (~> 3.4.0)
13
+ rspec-core (3.4.4)
14
+ rspec-support (~> 3.4.0)
15
+ rspec-expectations (3.4.0)
16
+ diff-lcs (>= 1.2.0, < 2.0)
17
+ rspec-support (~> 3.4.0)
18
+ rspec-mocks (3.4.1)
19
+ diff-lcs (>= 1.2.0, < 2.0)
20
+ rspec-support (~> 3.4.0)
21
+ rspec-support (3.4.1)
22
+
23
+ PLATFORMS
24
+ java
25
+
26
+ DEPENDENCIES
27
+ open_nlp!
28
+ rspec
29
+
30
+ BUNDLED WITH
31
+ 1.11.2
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # OpenNlp
2
2
 
3
+ [![Build Status](https://travis-ci.org/hck/open_nlp.png?branch=master)](https://travis-ci.org/hck/open_nlp) [![Code Climate](https://codeclimate.com/github/hck/open_nlp.png)](https://codeclimate.com/github/hck/open_nlp)
4
+
3
5
  A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute common natural language processing tasks, such as
4
6
  * sentence detection
5
7
  * tokenize
@@ -35,8 +37,13 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
35
37
 
36
38
  sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
37
39
  sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
40
+
41
+ # get sentences as array of strings
38
42
  sentence_detector.detect('The red fox sleeps soundly.')
39
43
 
44
+ # get array of OpenNLP::Util::Span objects:
45
+ sentence_detector.pos_detect('"The sky is blue. The Grass is green."')
46
+
40
47
  ### Tokenize
41
48
 
42
49
  token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
@@ -92,4 +99,4 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
92
99
  2. Create your feature branch (`git checkout -b my-new-feature`)
93
100
  3. Commit your changes (`git commit -am 'Add some feature'`)
94
101
  4. Push to the branch (`git push origin my-new-feature`)
95
- 5. Create new Pull Request
102
+ 5. Create new Pull Request
data/lib/open_nlp.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  require 'open_nlp/version'
2
2
 
3
3
  require 'java'
4
- require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
5
- require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'
4
+ require 'open_nlp/opennlp-tools-1.5.3.jar'
5
+ require 'open_nlp/opennlp-maxent-3.0.3.jar'
6
6
 
7
7
  require 'open_nlp/java_class'
8
8
 
@@ -28,4 +28,4 @@ require 'open_nlp/sentence_detector'
28
28
  require 'open_nlp/tokenizer'
29
29
  require 'open_nlp/chunker'
30
30
  require 'open_nlp/parser'
31
- require 'open_nlp/parser/parse'
31
+ require 'open_nlp/parser/parse'
data/lib/open_nlp/categorizer.rb CHANGED
@@ -2,11 +2,15 @@ module OpenNlp
2
2
  class Categorizer < Tool
3
3
  self.java_class = Java::opennlp.tools.doccat.DocumentCategorizerME
4
4
 
5
+ # Categorizes a string passed as parameter to one of the categories
6
+ #
7
+ # @param [String] str string to be categorized
8
+ # @return [String] category
5
9
  def categorize(str)
6
- raise ArgumentError, "str must be a String" unless str.is_a?(String)
10
+ fail ArgumentError, 'str param must be a String' unless str.is_a?(String)
7
11
 
8
- outcomes = @j_instance.categorize(str)
9
- @j_instance.getBestCategory(outcomes)
12
+ outcomes = j_instance.categorize(str)
13
+ j_instance.getBestCategory(outcomes)
10
14
  end
11
15
  end
12
16
  end
data/lib/open_nlp/chunker.rb CHANGED
@@ -5,27 +5,38 @@ module OpenNlp
5
5
  def initialize(model, token_model, pos_model)
6
6
  super(model)
7
7
 
8
- raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
9
- raise ArgumentError, "model must be an OpenNlp::POSTagger::Model" unless pos_model.is_a?(Model::POSTagger)
8
+ unless token_model.is_a?(Model::Tokenizer)
9
+ fail ArgumentError, 'token model must be an OpenNlp::Tokenizer::Model'
10
+ end
11
+
12
+ unless pos_model.is_a?(Model::POSTagger)
13
+ fail ArgumentError, 'pos model must be an OpenNlp::POSTagger::Model'
14
+ end
10
15
 
11
16
  @tokenizer = Tokenizer.new(token_model)
12
17
  @pos_tagger = POSTagger.new(pos_model)
13
18
  end
14
19
 
20
+ # Chunks a string into part-of-sentence pieces
21
+ #
22
+ # @param [String] str string to chunk
23
+ # @return [Array] array of chunks with part-of-sentence information
15
24
  def chunk(str)
16
- raise ArgumentError, "str must be a String" unless str.is_a?(String)
25
+ fail ArgumentError, 'str must be a String' unless str.is_a?(String)
17
26
 
18
- tokens = @tokenizer.tokenize(str)
19
- pos_tags = @pos_tagger.tag(tokens).to_ary
27
+ tokens = tokenizer.tokenize(str)
28
+ pos_tags = pos_tagger.tag(tokens).to_ary
20
29
 
21
- chunks = @j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary
30
+ chunks = j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary
22
31
 
23
32
  build_chunks(chunks, tokens, pos_tags)
24
33
  end
25
34
 
26
35
  private
36
+
37
+ attr_reader :tokenizer, :pos_tagger
38
+
27
39
  def build_chunks(chunks, tokens, pos_tags)
28
- # data[i] = [token, pos_tag, chunk_val]
29
40
  data = tokens.zip(pos_tags, chunks)
30
41
 
31
42
  data.inject([]) do |acc, val|
@@ -45,7 +56,7 @@ module OpenNlp
45
56
  end
46
57
 
47
58
  def get_last_probabilities
48
- @j_instance.probs.to_ary
59
+ j_instance.probs.to_ary
49
60
  end
50
61
  end
51
62
  end
data/lib/open_nlp/model.rb CHANGED
@@ -5,16 +5,20 @@ module OpenNlp
5
5
  attr_reader :j_model
6
6
 
7
7
  def initialize(model)
8
- model_stream = case model
9
- when java.io.FileInputStream
10
- model
11
- when String
12
- java.io.FileInputStream.new(model)
13
- else
14
- raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
15
- end
8
+ @j_model = self.class.java_class.new(model_stream(model))
9
+ end
10
+
11
+ private
16
12
 
17
- @j_model = self.class.java_class.new(model_stream)
13
+ def model_stream(model)
14
+ case model
15
+ when java.io.FileInputStream
16
+ model
17
+ when String
18
+ java.io.FileInputStream.new(model)
19
+ else
20
+ fail ArgumentError, 'Model must be either a string or a java.io.FileInputStream'
21
+ end
18
22
  end
19
23
  end
20
24
  end
data/lib/open_nlp/named_entity_detector.rb CHANGED
@@ -2,9 +2,13 @@ module OpenNlp
2
2
  class NamedEntityDetector < Tool
3
3
  self.java_class = Java::opennlp.tools.namefind.NameFinderME
4
4
 
5
+ # Detects names for provided array of tokens
6
+ #
7
+ # @param [Array<String>] tokens tokens to run name detection on
8
+ # @return [Array<Java::opennlp.tools.util.Span>] names detected
5
9
  def detect(tokens)
6
- raise ArgumentError, "tokens must be an instance of Array" unless tokens.is_a?(Array)
7
- @j_instance.find(tokens.to_java(:String)).to_ary
10
+ fail ArgumentError, 'tokens must be an instance of Array' unless tokens.is_a?(Array)
11
+ j_instance.find(tokens.to_java(:String)).to_ary
8
12
  end
9
13
  end
10
14
  end
data/lib/open_nlp/parser.rb CHANGED
@@ -1,52 +1,62 @@
1
1
  module OpenNlp
2
2
  class Parser < Tool
3
- def initialize(model, token_model)
4
- raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
5
- raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
3
+ def initialize(parser_model, token_model)
4
+ unless parser_model.is_a?(OpenNlp::Model)
5
+ fail ArgumentError, 'parser_model must be an OpenNlp::Model'
6
+ end
6
7
 
7
- @j_instance = Java::opennlp.tools.parser.ParserFactory.create(model.j_model)
8
+ unless token_model.is_a?(Model::Tokenizer)
9
+ fail ArgumentError, 'token_model must be an OpenNlp::Tokenizer::Model'
10
+ end
8
11
 
12
+ @j_instance = Java::opennlp.tools.parser.ParserFactory.create(parser_model.j_model)
9
13
  @tokenizer = Tokenizer.new(token_model)
10
14
  end
11
15
 
16
+ # Parses text into instance of Parse class
17
+ #
18
+ # @param [String] text text to parse
19
+ # @return [OpenNlp::Parser::Parse]
12
20
  def parse(text)
13
- raise ArgumentError, "str must be a String" unless text.is_a?(String)
14
- return {} if text.empty?
15
-
16
- parse_obj = Java::opennlp.tools.parser.Parse.new(
17
- text.to_java(:String),
18
- Java::opennlp.tools.util.Span.new(0, text.size),
19
- Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
20
- 1.to_java(:Double), # probability ?
21
- 0.to_java(:Integer) # the token index of the head of this parse
22
- )
23
-
24
- tokens = @tokenizer.tokenize(text)
25
-
26
- tokens.each_with_index do |tok, i|
27
- start = get_token_offset text, tokens, i
28
-
29
- parse_obj.insert Java::opennlp.tools.parser.Parse.new(
30
- text.to_java(:String),
31
- Java::opennlp.tools.util.Span.new(start, start + tok.size),
32
- Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE.to_java(:String),
33
- 0.to_java(:Double),
34
- i.to_java(:Integer)
35
- )
36
- end
37
-
38
- Parser::Parse.new(@j_instance.parse(parse_obj))
21
+ raise ArgumentError, 'passed text must be a String' unless text.is_a?(String)
22
+ text.empty? ? {} : parse_tokens(tokenizer.tokenize(text), text)
39
23
  end
40
24
 
41
25
  private
26
+
27
+ attr_reader :tokenizer
28
+
42
29
  def get_token_offset(text, tokens, index)
43
30
  offset = 0
31
+ return offset unless index > 0
44
32
 
45
33
  for i in (1..index) do
46
34
  offset = text.index tokens[i], offset + tokens[i - 1].size
47
- end if index > 0
48
-
35
+ end
49
36
  offset
50
37
  end
38
+
39
+ def build_parse_obj(text, span_start, span_end, type=Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE, probability=1, token_index=0)
40
+ Java::opennlp.tools.parser.Parse.new(
41
+ text.to_java(:String),
42
+ Java::opennlp.tools.util.Span.new(span_start, span_end),
43
+ type.to_java(:String),
44
+ probability.to_java(:Double), # probability ?
45
+ token_index.to_java(:Integer) # the token index of the head of this parse
46
+ )
47
+ end
48
+
49
+ def parse_tokens(tokens, text)
50
+ parse_obj = build_parse_obj(text, 0, text.size)
51
+ parse_type = Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
52
+
53
+ tokens.each_with_index do |tok, i|
54
+ start = get_token_offset(text, tokens, i)
55
+ token_parse = build_parse_obj(text, start, start + tok.size, parse_type, 0, i)
56
+ parse_obj.insert(token_parse)
57
+ end
58
+
59
+ Parser::Parse.new(j_instance.parse(parse_obj))
60
+ end
51
61
  end
52
- end
62
+ end
data/lib/open_nlp/parser/parse.rb CHANGED
@@ -13,27 +13,20 @@ module OpenNlp
13
13
  end
14
14
 
15
15
  def tree_bank_string
16
- span = j_instance.getSpan
17
- text = j_instance.getText
18
- type = j_instance.getType
19
- start = span.getStart
16
+ span, text, type, res = j_instance.getSpan, j_instance.getText, j_instance.getType, ''
17
+ start = span.getStart
20
18
 
21
- res = ''
19
+ res << "(#{type} " if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
22
20
 
23
- res << "(#{type} " unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
24
-
25
- j_instance.getChildren.each do |c|
26
- s = c.span
27
- res << text[start..s.getStart-1] if start < s.getStart
28
-
29
- subtree = self.class.new(c).tree_bank_string
30
- res << subtree if subtree
31
- start = s.getEnd
21
+ j_instance.getChildren.each do |child|
22
+ child_span = child.span
23
+ res << text[start..child_span.getStart-1] if start < child_span.getStart
24
+ res << self.class.new(child).tree_bank_string
25
+ start = child_span.getEnd
32
26
  end
33
27
 
34
28
  res << text[start..span.getEnd-1] if start < span.getEnd
35
-
36
- res << ")" unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
29
+ res << ")" if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
37
30
 
38
31
  res
39
32
  end
@@ -41,14 +34,12 @@ module OpenNlp
41
34
  def code_tree
42
35
  kids = j_instance.getChildren
43
36
 
44
- kids.inject([]) do |acc,kid|
45
- data = {:type => kid.getType, :parent_type => self.j_instance.getType, :token => kid.toString}
37
+ kids.each_with_object([]) do |kid, acc|
38
+ data = { :type => kid.getType, :parent_type => self.j_instance.getType, :token => kid.toString }
46
39
  subtree = self.class.new(kid).code_tree
47
40
  data[:children] = subtree unless subtree.empty?
48
41
  acc << data
49
-
50
- acc
51
42
  end
52
43
  end
53
44
  end
54
- end
45
+ end
data/lib/open_nlp/pos_tagger.rb CHANGED
@@ -3,8 +3,11 @@ module OpenNlp
3
3
  self.java_class = Java::opennlp.tools.postag.POSTaggerME
4
4
 
5
5
  def tag(tokens)
6
- raise ArgumentError, "tokens must be an instance of String or Array" unless (tokens.is_a?(Array) || tokens.is_a?(String))
7
- @j_instance.tag(tokens.to_java(:String))
6
+ unless (tokens.is_a?(Array) || tokens.is_a?(String))
7
+ fail ArgumentError, 'tokens must be an instance of String or Array'
8
+ end
9
+
10
+ j_instance.tag(tokens.to_java(:String))
8
11
  end
9
12
  end
10
13
  end
data/lib/open_nlp/sentence_detector.rb CHANGED
@@ -2,14 +2,24 @@ module OpenNlp
2
2
  class SentenceDetector < Tool
3
3
  self.java_class = Java::opennlp.tools.sentdetect.SentenceDetectorME
4
4
 
5
- def detect(string)
6
- raise ArgumentError, "string must be a String" unless string.is_a?(String)
7
- @j_instance.sentDetect(string).to_ary
5
+ # Detects sentences in a string
6
+ #
7
+ # @param [String] string string to detect sentences in
8
+ # @return [Array<String>] array of detected sentences
9
+ def detect(str)
10
+ fail ArgumentError, 'str must be a String' unless str.is_a?(String)
11
+ j_instance.sentDetect(str).to_ary
8
12
  end
9
13
 
10
- def pos_detect(string)
11
- raise ArgumentError, "string must be a String" unless string.is_a?(String)
12
- @j_instance.sentPosDetect(string).map{|span| OpenNlp::Util::Span.new(span.getStart, span.getEnd)}
14
+ # Detects sentences in a string and returns array of spans
15
+ #
16
+ # @param [String] str
17
+ # @return [Array<OpenNlp::Util::Span>] array of spans for detected sentences
18
+ def pos_detect(str)
19
+ fail ArgumentError, 'str must be a String' unless str.is_a?(String)
20
+ j_instance.sentPosDetect(str).map do |span|
21
+ OpenNlp::Util::Span.new(span.getStart, span.getEnd)
22
+ end
13
23
  end
14
24
  end
15
25
  end
data/lib/open_nlp/tokenizer.rb CHANGED
@@ -2,14 +2,19 @@ module OpenNlp
2
2
  class Tokenizer < Tool
3
3
  self.java_class = Java::opennlp.tools.tokenize.TokenizerME
4
4
 
5
+ # Tokenizes a string
6
+ #
7
+ # @param [String] str string to tokenize
8
+ # @return [Array] array of string tokens
5
9
  def tokenize(str)
6
- raise ArgumentError, "str must be a String" unless str.is_a?(String)
7
- @j_instance.tokenize(str).to_ary
10
+ fail ArgumentError, 'str must be a String' unless str.is_a?(String)
11
+ j_instance.tokenize(str).to_ary
8
12
  end
9
13
 
10
14
  private
15
+
11
16
  def get_last_probabilities
12
- @j_instance.getTokenProbabilities.to_ary
17
+ j_instance.getTokenProbabilities.to_ary
13
18
  end
14
19
  end
15
20
  end