open_nlp 0.0.7-java → 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -2
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile.lock +31 -0
  6. data/README.md +8 -1
  7. data/lib/open_nlp.rb +3 -3
  8. data/lib/open_nlp/categorizer.rb +7 -3
  9. data/lib/open_nlp/chunker.rb +19 -8
  10. data/lib/open_nlp/model.rb +13 -9
  11. data/lib/open_nlp/named_entity_detector.rb +6 -2
  12. data/lib/open_nlp/opennlp-maxent-3.0.3.jar +0 -0
  13. data/lib/open_nlp/opennlp-tools-1.5.3.jar +0 -0
  14. data/lib/open_nlp/parser.rb +43 -33
  15. data/lib/open_nlp/parser/parse.rb +12 -21
  16. data/lib/open_nlp/pos_tagger.rb +5 -2
  17. data/lib/open_nlp/sentence_detector.rb +16 -6
  18. data/lib/open_nlp/tokenizer.rb +8 -3
  19. data/lib/open_nlp/tool.rb +1 -1
  20. data/lib/open_nlp/util.rb +1 -2
  21. data/lib/open_nlp/util/span.rb +5 -5
  22. data/lib/open_nlp/version.rb +1 -1
  23. data/spec/categorizer_spec.rb +24 -22
  24. data/spec/chunker_spec.rb +29 -28
  25. data/spec/model/chunker_spec.rb +12 -15
  26. data/spec/model/detokenizer_spec.rb +11 -14
  27. data/spec/model/named_entity_detector_spec.rb +11 -14
  28. data/spec/model/pos_tagger_spec.rb +12 -15
  29. data/spec/model/sentence_detector_spec.rb +11 -14
  30. data/spec/model/tokenizer_spec.rb +11 -14
  31. data/spec/named_entity_detector_spec.rb +28 -27
  32. data/spec/parser/parse_spec.rb +64 -56
  33. data/spec/parser_spec.rb +26 -21
  34. data/spec/pos_tagger_spec.rb +22 -23
  35. data/spec/sentence_detector_spec.rb +39 -30
  36. data/spec/spec_helper.rb +1 -1
  37. data/spec/tokenizer_spec.rb +26 -22
  38. metadata +16 -17
  39. data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
  40. data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 57b451e151cf3a0ed7d28b9451f48b7ec7a31288
4
+ data.tar.gz: fc5e0b61f0baf6673a4a3dbc05e4c7828d2a935c
5
+ SHA512:
6
+ metadata.gz: 420670f2e006071f47ba5d18f6152427ba49745b2647a7b5e7520acc987575ce02cc0e3cecbd3417f65d7d4c30dfe63740ac9c422a1f43d9a0ed5bb1b6a572a1
7
+ data.tar.gz: 059bf4f29726f027b634d802cb8f4ff1b1cb9b04ef6315a4d5e77d44098e1eed1a651edd97cc7d37a6ed4b91a6a6e03bb571aef45e996919fcfbec481ce5f269
data/.gitignore CHANGED
@@ -3,7 +3,6 @@
3
3
  .bundle
4
4
  .config
5
5
  .yardoc
6
- Gemfile.lock
7
6
  InstalledFiles
8
7
  _yardoc
9
8
  coverage
@@ -15,4 +14,5 @@ spec/reports
15
14
  test/tmp
16
15
  test/version_tmp
17
16
  tmp
18
- .idea
17
+ .idea
18
+ .gems
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.0.5.0
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - jruby-19mode
4
+ - jruby-1.7.20
5
+ - jruby-9.0.5.0
6
+ script: JRUBY_OPTS=-J-Xmx768m bundle exec rspec spec
data/Gemfile.lock ADDED
@@ -0,0 +1,31 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+
5
+ GEM
6
+ remote: https://rubygems.org/
7
+ specs:
8
+ diff-lcs (1.2.5)
9
+ rspec (3.4.0)
10
+ rspec-core (~> 3.4.0)
11
+ rspec-expectations (~> 3.4.0)
12
+ rspec-mocks (~> 3.4.0)
13
+ rspec-core (3.4.4)
14
+ rspec-support (~> 3.4.0)
15
+ rspec-expectations (3.4.0)
16
+ diff-lcs (>= 1.2.0, < 2.0)
17
+ rspec-support (~> 3.4.0)
18
+ rspec-mocks (3.4.1)
19
+ diff-lcs (>= 1.2.0, < 2.0)
20
+ rspec-support (~> 3.4.0)
21
+ rspec-support (3.4.1)
22
+
23
+ PLATFORMS
24
+ java
25
+
26
+ DEPENDENCIES
27
+ open_nlp!
28
+ rspec
29
+
30
+ BUNDLED WITH
31
+ 1.11.2
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # OpenNlp
2
2
 
3
+ [![Build Status](https://travis-ci.org/hck/open_nlp.png?branch=master)](https://travis-ci.org/hck/open_nlp) [![Code Climate](https://codeclimate.com/github/hck/open_nlp.png)](https://codeclimate.com/github/hck/open_nlp)
4
+
3
5
  A JRuby wrapper for the Apache OpenNLP tools library, that allows you to execute common natural language processing tasks, such as
4
6
  * sentence detection
5
7
  * tokenize
@@ -35,8 +37,13 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
35
37
 
36
38
  sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
37
39
  sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
40
+
41
+ # get sentences as array of strings
38
42
  sentence_detector.detect('The red fox sleeps soundly.')
39
43
 
44
+ # get array of OpenNlp::Util::Span objects:
45
+ sentence_detector.pos_detect('"The sky is blue. The Grass is green."')
46
+
40
47
  ### Tokenize
41
48
 
42
49
  token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
@@ -92,4 +99,4 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
92
99
  2. Create your feature branch (`git checkout -b my-new-feature`)
93
100
  3. Commit your changes (`git commit -am 'Add some feature'`)
94
101
  4. Push to the branch (`git push origin my-new-feature`)
95
- 5. Create new Pull Request
102
+ 5. Create new Pull Request
data/lib/open_nlp.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  require 'open_nlp/version'
2
2
 
3
3
  require 'java'
4
- require 'open_nlp/opennlp-tools-1.5.2-incubating.jar'
5
- require 'open_nlp/opennlp-maxent-3.0.2-incubating.jar'
4
+ require 'open_nlp/opennlp-tools-1.5.3.jar'
5
+ require 'open_nlp/opennlp-maxent-3.0.3.jar'
6
6
 
7
7
  require 'open_nlp/java_class'
8
8
 
@@ -28,4 +28,4 @@ require 'open_nlp/sentence_detector'
28
28
  require 'open_nlp/tokenizer'
29
29
  require 'open_nlp/chunker'
30
30
  require 'open_nlp/parser'
31
- require 'open_nlp/parser/parse'
31
+ require 'open_nlp/parser/parse'
@@ -2,11 +2,15 @@ module OpenNlp
2
2
  class Categorizer < Tool
3
3
  self.java_class = Java::opennlp.tools.doccat.DocumentCategorizerME
4
4
 
5
+ # Categorizes a string passed as parameter to one of the categories
6
+ #
7
+ # @param [String] str string to be categorized
8
+ # @return [String] category
5
9
  def categorize(str)
6
- raise ArgumentError, "str must be a String" unless str.is_a?(String)
10
+ fail ArgumentError, 'str param must be a String' unless str.is_a?(String)
7
11
 
8
- outcomes = @j_instance.categorize(str)
9
- @j_instance.getBestCategory(outcomes)
12
+ outcomes = j_instance.categorize(str)
13
+ j_instance.getBestCategory(outcomes)
10
14
  end
11
15
  end
12
16
  end
@@ -5,27 +5,38 @@ module OpenNlp
5
5
  def initialize(model, token_model, pos_model)
6
6
  super(model)
7
7
 
8
- raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
9
- raise ArgumentError, "model must be an OpenNlp::POSTagger::Model" unless pos_model.is_a?(Model::POSTagger)
8
+ unless token_model.is_a?(Model::Tokenizer)
9
+ fail ArgumentError, 'token model must be an OpenNlp::Tokenizer::Model'
10
+ end
11
+
12
+ unless pos_model.is_a?(Model::POSTagger)
13
+ fail ArgumentError, 'pos model must be an OpenNlp::POSTagger::Model'
14
+ end
10
15
 
11
16
  @tokenizer = Tokenizer.new(token_model)
12
17
  @pos_tagger = POSTagger.new(pos_model)
13
18
  end
14
19
 
20
+ # Chunks a string into part-of-sentence pieces
21
+ #
22
+ # @param [String] str string to chunk
23
+ # @return [Array] array of chunks with part-of-sentence information
15
24
  def chunk(str)
16
- raise ArgumentError, "str must be a String" unless str.is_a?(String)
25
+ fail ArgumentError, 'str must be a String' unless str.is_a?(String)
17
26
 
18
- tokens = @tokenizer.tokenize(str)
19
- pos_tags = @pos_tagger.tag(tokens).to_ary
27
+ tokens = tokenizer.tokenize(str)
28
+ pos_tags = pos_tagger.tag(tokens).to_ary
20
29
 
21
- chunks = @j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary
30
+ chunks = j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary
22
31
 
23
32
  build_chunks(chunks, tokens, pos_tags)
24
33
  end
25
34
 
26
35
  private
36
+
37
+ attr_reader :tokenizer, :pos_tagger
38
+
27
39
  def build_chunks(chunks, tokens, pos_tags)
28
- # data[i] = [token, pos_tag, chunk_val]
29
40
  data = tokens.zip(pos_tags, chunks)
30
41
 
31
42
  data.inject([]) do |acc, val|
@@ -45,7 +56,7 @@ module OpenNlp
45
56
  end
46
57
 
47
58
  def get_last_probabilities
48
- @j_instance.probs.to_ary
59
+ j_instance.probs.to_ary
49
60
  end
50
61
  end
51
62
  end
@@ -5,16 +5,20 @@ module OpenNlp
5
5
  attr_reader :j_model
6
6
 
7
7
  def initialize(model)
8
- model_stream = case model
9
- when java.io.FileInputStream
10
- model
11
- when String
12
- java.io.FileInputStream.new(model)
13
- else
14
- raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
15
- end
8
+ @j_model = self.class.java_class.new(model_stream(model))
9
+ end
10
+
11
+ private
16
12
 
17
- @j_model = self.class.java_class.new(model_stream)
13
+ def model_stream(model)
14
+ case model
15
+ when java.io.FileInputStream
16
+ model
17
+ when String
18
+ java.io.FileInputStream.new(model)
19
+ else
20
+ fail ArgumentError, 'Model must be either a string or a java.io.FileInputStream'
21
+ end
18
22
  end
19
23
  end
20
24
  end
@@ -2,9 +2,13 @@ module OpenNlp
2
2
  class NamedEntityDetector < Tool
3
3
  self.java_class = Java::opennlp.tools.namefind.NameFinderME
4
4
 
5
+ # Detects names for provided array of tokens
6
+ #
7
+ # @param [Array<String>] tokens tokens to run name detection on
8
+ # @return [Array<Java::opennlp.tools.util.Span>] names detected
5
9
  def detect(tokens)
6
- raise ArgumentError, "tokens must be an instance of Array" unless tokens.is_a?(Array)
7
- @j_instance.find(tokens.to_java(:String)).to_ary
10
+ fail ArgumentError, 'tokens must be an instance of Array' unless tokens.is_a?(Array)
11
+ j_instance.find(tokens.to_java(:String)).to_ary
8
12
  end
9
13
  end
10
14
  end
@@ -1,52 +1,62 @@
1
1
  module OpenNlp
2
2
  class Parser < Tool
3
- def initialize(model, token_model)
4
- raise ArgumentError, "model must be an OpenNlp::Model" unless model.is_a?(OpenNlp::Model)
5
- raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
3
+ def initialize(parser_model, token_model)
4
+ unless parser_model.is_a?(OpenNlp::Model)
5
+ fail ArgumentError, 'parser_model must be an OpenNlp::Model'
6
+ end
6
7
 
7
- @j_instance = Java::opennlp.tools.parser.ParserFactory.create(model.j_model)
8
+ unless token_model.is_a?(Model::Tokenizer)
9
+ fail ArgumentError, 'token_model must be an OpenNlp::Tokenizer::Model'
10
+ end
8
11
 
12
+ @j_instance = Java::opennlp.tools.parser.ParserFactory.create(parser_model.j_model)
9
13
  @tokenizer = Tokenizer.new(token_model)
10
14
  end
11
15
 
16
+ # Parses text into instance of Parse class
17
+ #
18
+ # @param [String] text text to parse
19
+ # @return [OpenNlp::Parser::Parse]
12
20
  def parse(text)
13
- raise ArgumentError, "str must be a String" unless text.is_a?(String)
14
- return {} if text.empty?
15
-
16
- parse_obj = Java::opennlp.tools.parser.Parse.new(
17
- text.to_java(:String),
18
- Java::opennlp.tools.util.Span.new(0, text.size),
19
- Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
20
- 1.to_java(:Double), # probability ?
21
- 0.to_java(:Integer) # the token index of the head of this parse
22
- )
23
-
24
- tokens = @tokenizer.tokenize(text)
25
-
26
- tokens.each_with_index do |tok, i|
27
- start = get_token_offset text, tokens, i
28
-
29
- parse_obj.insert Java::opennlp.tools.parser.Parse.new(
30
- text.to_java(:String),
31
- Java::opennlp.tools.util.Span.new(start, start + tok.size),
32
- Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE.to_java(:String),
33
- 0.to_java(:Double),
34
- i.to_java(:Integer)
35
- )
36
- end
37
-
38
- Parser::Parse.new(@j_instance.parse(parse_obj))
21
+ raise ArgumentError, 'passed text must be a String' unless text.is_a?(String)
22
+ text.empty? ? {} : parse_tokens(tokenizer.tokenize(text), text)
39
23
  end
40
24
 
41
25
  private
26
+
27
+ attr_reader :tokenizer
28
+
42
29
  def get_token_offset(text, tokens, index)
43
30
  offset = 0
31
+ return offset unless index > 0
44
32
 
45
33
  for i in (1..index) do
46
34
  offset = text.index tokens[i], offset + tokens[i - 1].size
47
- end if index > 0
48
-
35
+ end
49
36
  offset
50
37
  end
38
+
39
+ def build_parse_obj(text, span_start, span_end, type=Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE, probability=1, token_index=0)
40
+ Java::opennlp.tools.parser.Parse.new(
41
+ text.to_java(:String),
42
+ Java::opennlp.tools.util.Span.new(span_start, span_end),
43
+ type.to_java(:String),
44
+ probability.to_java(:Double), # probability ?
45
+ token_index.to_java(:Integer) # the token index of the head of this parse
46
+ )
47
+ end
48
+
49
+ def parse_tokens(tokens, text)
50
+ parse_obj = build_parse_obj(text, 0, text.size)
51
+ parse_type = Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
52
+
53
+ tokens.each_with_index do |tok, i|
54
+ start = get_token_offset(text, tokens, i)
55
+ token_parse = build_parse_obj(text, start, start + tok.size, parse_type, 0, i)
56
+ parse_obj.insert(token_parse)
57
+ end
58
+
59
+ Parser::Parse.new(j_instance.parse(parse_obj))
60
+ end
51
61
  end
52
- end
62
+ end
@@ -13,27 +13,20 @@ module OpenNlp
13
13
  end
14
14
 
15
15
  def tree_bank_string
16
- span = j_instance.getSpan
17
- text = j_instance.getText
18
- type = j_instance.getType
19
- start = span.getStart
16
+ span, text, type, res = j_instance.getSpan, j_instance.getText, j_instance.getType, ''
17
+ start = span.getStart
20
18
 
21
- res = ''
19
+ res << "(#{type} " if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
22
20
 
23
- res << "(#{type} " unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
24
-
25
- j_instance.getChildren.each do |c|
26
- s = c.span
27
- res << text[start..s.getStart-1] if start < s.getStart
28
-
29
- subtree = self.class.new(c).tree_bank_string
30
- res << subtree if subtree
31
- start = s.getEnd
21
+ j_instance.getChildren.each do |child|
22
+ child_span = child.span
23
+ res << text[start..child_span.getStart-1] if start < child_span.getStart
24
+ res << self.class.new(child).tree_bank_string
25
+ start = child_span.getEnd
32
26
  end
33
27
 
34
28
  res << text[start..span.getEnd-1] if start < span.getEnd
35
-
36
- res << ")" unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
29
+ res << ")" if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
37
30
 
38
31
  res
39
32
  end
@@ -41,14 +34,12 @@ module OpenNlp
41
34
  def code_tree
42
35
  kids = j_instance.getChildren
43
36
 
44
- kids.inject([]) do |acc,kid|
45
- data = {:type => kid.getType, :parent_type => self.j_instance.getType, :token => kid.toString}
37
+ kids.each_with_object([]) do |kid, acc|
38
+ data = { :type => kid.getType, :parent_type => self.j_instance.getType, :token => kid.toString }
46
39
  subtree = self.class.new(kid).code_tree
47
40
  data[:children] = subtree unless subtree.empty?
48
41
  acc << data
49
-
50
- acc
51
42
  end
52
43
  end
53
44
  end
54
- end
45
+ end
@@ -3,8 +3,11 @@ module OpenNlp
3
3
  self.java_class = Java::opennlp.tools.postag.POSTaggerME
4
4
 
5
5
  def tag(tokens)
6
- raise ArgumentError, "tokens must be an instance of String or Array" unless (tokens.is_a?(Array) || tokens.is_a?(String))
7
- @j_instance.tag(tokens.to_java(:String))
6
+ unless (tokens.is_a?(Array) || tokens.is_a?(String))
7
+ fail ArgumentError, 'tokens must be an instance of String or Array'
8
+ end
9
+
10
+ j_instance.tag(tokens.to_java(:String))
8
11
  end
9
12
  end
10
13
  end
@@ -2,14 +2,24 @@ module OpenNlp
2
2
  class SentenceDetector < Tool
3
3
  self.java_class = Java::opennlp.tools.sentdetect.SentenceDetectorME
4
4
 
5
- def detect(string)
6
- raise ArgumentError, "string must be a String" unless string.is_a?(String)
7
- @j_instance.sentDetect(string).to_ary
5
+ # Detects sentences in a string
6
+ #
7
+ # @param [String] str string to detect sentences in
8
+ # @return [Array<String>] array of detected sentences
9
+ def detect(str)
10
+ fail ArgumentError, 'str must be a String' unless str.is_a?(String)
11
+ j_instance.sentDetect(str).to_ary
8
12
  end
9
13
 
10
- def pos_detect(string)
11
- raise ArgumentError, "string must be a String" unless string.is_a?(String)
12
- @j_instance.sentPosDetect(string).map{|span| OpenNlp::Util::Span.new(span.getStart, span.getEnd)}
14
+ # Detects sentences in a string and returns array of spans
15
+ #
16
+ # @param [String] str
17
+ # @return [Array<OpenNlp::Util::Span>] array of spans for detected sentences
18
+ def pos_detect(str)
19
+ fail ArgumentError, 'str must be a String' unless str.is_a?(String)
20
+ j_instance.sentPosDetect(str).map do |span|
21
+ OpenNlp::Util::Span.new(span.getStart, span.getEnd)
22
+ end
13
23
  end
14
24
  end
15
25
  end
@@ -2,14 +2,19 @@ module OpenNlp
2
2
  class Tokenizer < Tool
3
3
  self.java_class = Java::opennlp.tools.tokenize.TokenizerME
4
4
 
5
+ # Tokenizes a string
6
+ #
7
+ # @param [String] str string to tokenize
8
+ # @return [Array] array of string tokens
5
9
  def tokenize(str)
6
- raise ArgumentError, "str must be a String" unless str.is_a?(String)
7
- @j_instance.tokenize(str).to_ary
10
+ fail ArgumentError, 'str must be a String' unless str.is_a?(String)
11
+ j_instance.tokenize(str).to_ary
8
12
  end
9
13
 
10
14
  private
15
+
11
16
  def get_last_probabilities
12
- @j_instance.getTokenProbabilities.to_ary
17
+ j_instance.getTokenProbabilities.to_ary
13
18
  end
14
19
  end
15
20
  end