open_nlp 0.0.7-java → 0.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +2 -2
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/Gemfile.lock +31 -0
- data/README.md +8 -1
- data/lib/open_nlp.rb +3 -3
- data/lib/open_nlp/categorizer.rb +7 -3
- data/lib/open_nlp/chunker.rb +19 -8
- data/lib/open_nlp/model.rb +13 -9
- data/lib/open_nlp/named_entity_detector.rb +6 -2
- data/lib/open_nlp/opennlp-maxent-3.0.3.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.3.jar +0 -0
- data/lib/open_nlp/parser.rb +43 -33
- data/lib/open_nlp/parser/parse.rb +12 -21
- data/lib/open_nlp/pos_tagger.rb +5 -2
- data/lib/open_nlp/sentence_detector.rb +16 -6
- data/lib/open_nlp/tokenizer.rb +8 -3
- data/lib/open_nlp/tool.rb +1 -1
- data/lib/open_nlp/util.rb +1 -2
- data/lib/open_nlp/util/span.rb +5 -5
- data/lib/open_nlp/version.rb +1 -1
- data/spec/categorizer_spec.rb +24 -22
- data/spec/chunker_spec.rb +29 -28
- data/spec/model/chunker_spec.rb +12 -15
- data/spec/model/detokenizer_spec.rb +11 -14
- data/spec/model/named_entity_detector_spec.rb +11 -14
- data/spec/model/pos_tagger_spec.rb +12 -15
- data/spec/model/sentence_detector_spec.rb +11 -14
- data/spec/model/tokenizer_spec.rb +11 -14
- data/spec/named_entity_detector_spec.rb +28 -27
- data/spec/parser/parse_spec.rb +64 -56
- data/spec/parser_spec.rb +26 -21
- data/spec/pos_tagger_spec.rb +22 -23
- data/spec/sentence_detector_spec.rb +39 -30
- data/spec/spec_helper.rb +1 -1
- data/spec/tokenizer_spec.rb +26 -22
- metadata +16 -17
- data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 57b451e151cf3a0ed7d28b9451f48b7ec7a31288
|
4
|
+
data.tar.gz: fc5e0b61f0baf6673a4a3dbc05e4c7828d2a935c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 420670f2e006071f47ba5d18f6152427ba49745b2647a7b5e7520acc987575ce02cc0e3cecbd3417f65d7d4c30dfe63740ac9c422a1f43d9a0ed5bb1b6a572a1
|
7
|
+
data.tar.gz: 059bf4f29726f027b634d802cb8f4ff1b1cb9b04ef6315a4d5e77d44098e1eed1a651edd97cc7d37a6ed4b91a6a6e03bb571aef45e996919fcfbec481ce5f269
|
data/.gitignore
CHANGED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
jruby-9.0.5.0
|
data/.travis.yml
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
|
5
|
+
GEM
|
6
|
+
remote: https://rubygems.org/
|
7
|
+
specs:
|
8
|
+
diff-lcs (1.2.5)
|
9
|
+
rspec (3.4.0)
|
10
|
+
rspec-core (~> 3.4.0)
|
11
|
+
rspec-expectations (~> 3.4.0)
|
12
|
+
rspec-mocks (~> 3.4.0)
|
13
|
+
rspec-core (3.4.4)
|
14
|
+
rspec-support (~> 3.4.0)
|
15
|
+
rspec-expectations (3.4.0)
|
16
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
17
|
+
rspec-support (~> 3.4.0)
|
18
|
+
rspec-mocks (3.4.1)
|
19
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
20
|
+
rspec-support (~> 3.4.0)
|
21
|
+
rspec-support (3.4.1)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
java
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
open_nlp!
|
28
|
+
rspec
|
29
|
+
|
30
|
+
BUNDLED WITH
|
31
|
+
1.11.2
|
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# OpenNlp
|
2
2
|
|
3
|
+
[![Build Status](https://travis-ci.org/hck/open_nlp.png?branch=master)](https://travis-ci.org/hck/open_nlp) [![Code Climate](https://codeclimate.com/github/hck/open_nlp.png)](https://codeclimate.com/github/hck/open_nlp)
|
4
|
+
|
3
5
|
A JRuby wrapper for the Apache OpenNLP tools library that allows you to execute common natural language processing tasks, such as
|
4
6
|
* sentence detection
|
5
7
|
* tokenize
|
@@ -35,8 +37,13 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
35
37
|
|
36
38
|
sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
|
37
39
|
sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
|
40
|
+
|
41
|
+
# get sentences as array of strings
|
38
42
|
sentence_detector.detect('The red fox sleeps soundly.')
|
39
43
|
|
44
|
+
# get array of OpenNlp::Util::Span objects:
|
45
|
+
sentence_detector.pos_detect('"The sky is blue. The Grass is green."')
|
46
|
+
|
40
47
|
### Tokenize
|
41
48
|
|
42
49
|
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
@@ -92,4 +99,4 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
92
99
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
93
100
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
94
101
|
4. Push to the branch (`git push origin my-new-feature`)
|
95
|
-
5. Create new Pull Request
|
102
|
+
5. Create new Pull Request
|
data/lib/open_nlp.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'open_nlp/version'
|
2
2
|
|
3
3
|
require 'java'
|
4
|
-
require 'open_nlp/opennlp-tools-1.5.
|
5
|
-
require 'open_nlp/opennlp-maxent-3.0.
|
4
|
+
require 'open_nlp/opennlp-tools-1.5.3.jar'
|
5
|
+
require 'open_nlp/opennlp-maxent-3.0.3.jar'
|
6
6
|
|
7
7
|
require 'open_nlp/java_class'
|
8
8
|
|
@@ -28,4 +28,4 @@ require 'open_nlp/sentence_detector'
|
|
28
28
|
require 'open_nlp/tokenizer'
|
29
29
|
require 'open_nlp/chunker'
|
30
30
|
require 'open_nlp/parser'
|
31
|
-
require 'open_nlp/parser/parse'
|
31
|
+
require 'open_nlp/parser/parse'
|
data/lib/open_nlp/categorizer.rb
CHANGED
@@ -2,11 +2,15 @@ module OpenNlp
|
|
2
2
|
class Categorizer < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.doccat.DocumentCategorizerME
|
4
4
|
|
5
|
+
# Categorizes a string passed as parameter to one of the categories
|
6
|
+
#
|
7
|
+
# @param [String] str string to be categorized
|
8
|
+
# @return [String] category
|
5
9
|
def categorize(str)
|
6
|
-
|
10
|
+
fail ArgumentError, 'str param must be a String' unless str.is_a?(String)
|
7
11
|
|
8
|
-
outcomes =
|
9
|
-
|
12
|
+
outcomes = j_instance.categorize(str)
|
13
|
+
j_instance.getBestCategory(outcomes)
|
10
14
|
end
|
11
15
|
end
|
12
16
|
end
|
data/lib/open_nlp/chunker.rb
CHANGED
@@ -5,27 +5,38 @@ module OpenNlp
|
|
5
5
|
def initialize(model, token_model, pos_model)
|
6
6
|
super(model)
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
unless token_model.is_a?(Model::Tokenizer)
|
9
|
+
fail ArgumentError, 'token model must be an OpenNlp::Tokenizer::Model'
|
10
|
+
end
|
11
|
+
|
12
|
+
unless pos_model.is_a?(Model::POSTagger)
|
13
|
+
fail ArgumentError, 'pos model must be an OpenNlp::POSTagger::Model'
|
14
|
+
end
|
10
15
|
|
11
16
|
@tokenizer = Tokenizer.new(token_model)
|
12
17
|
@pos_tagger = POSTagger.new(pos_model)
|
13
18
|
end
|
14
19
|
|
20
|
+
# Chunks a string into part-of-sentence pieces
|
21
|
+
#
|
22
|
+
# @param [String] str string to chunk
|
23
|
+
# @return [Array] array of chunks with part-of-sentence information
|
15
24
|
def chunk(str)
|
16
|
-
|
25
|
+
fail ArgumentError, 'str must be a String' unless str.is_a?(String)
|
17
26
|
|
18
|
-
tokens =
|
19
|
-
pos_tags =
|
27
|
+
tokens = tokenizer.tokenize(str)
|
28
|
+
pos_tags = pos_tagger.tag(tokens).to_ary
|
20
29
|
|
21
|
-
chunks =
|
30
|
+
chunks = j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary
|
22
31
|
|
23
32
|
build_chunks(chunks, tokens, pos_tags)
|
24
33
|
end
|
25
34
|
|
26
35
|
private
|
36
|
+
|
37
|
+
attr_reader :tokenizer, :pos_tagger
|
38
|
+
|
27
39
|
def build_chunks(chunks, tokens, pos_tags)
|
28
|
-
# data[i] = [token, pos_tag, chunk_val]
|
29
40
|
data = tokens.zip(pos_tags, chunks)
|
30
41
|
|
31
42
|
data.inject([]) do |acc, val|
|
@@ -45,7 +56,7 @@ module OpenNlp
|
|
45
56
|
end
|
46
57
|
|
47
58
|
def get_last_probabilities
|
48
|
-
|
59
|
+
j_instance.probs.to_ary
|
49
60
|
end
|
50
61
|
end
|
51
62
|
end
|
data/lib/open_nlp/model.rb
CHANGED
@@ -5,16 +5,20 @@ module OpenNlp
|
|
5
5
|
attr_reader :j_model
|
6
6
|
|
7
7
|
def initialize(model)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
java.io.FileInputStream.new(model)
|
13
|
-
else
|
14
|
-
raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
|
15
|
-
end
|
8
|
+
@j_model = self.class.java_class.new(model_stream(model))
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
16
12
|
|
17
|
-
|
13
|
+
def model_stream(model)
|
14
|
+
case model
|
15
|
+
when java.io.FileInputStream
|
16
|
+
model
|
17
|
+
when String
|
18
|
+
java.io.FileInputStream.new(model)
|
19
|
+
else
|
20
|
+
fail ArgumentError, 'Model must be either a string or a java.io.FileInputStream'
|
21
|
+
end
|
18
22
|
end
|
19
23
|
end
|
20
24
|
end
|
@@ -2,9 +2,13 @@ module OpenNlp
|
|
2
2
|
class NamedEntityDetector < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.namefind.NameFinderME
|
4
4
|
|
5
|
+
# Detects names for provided array of tokens
|
6
|
+
#
|
7
|
+
# @param [Array<String>] tokens tokens to run name detection on
|
8
|
+
# @return [Array<Java::opennlp.tools.util.Span>] names detected
|
5
9
|
def detect(tokens)
|
6
|
-
|
7
|
-
|
10
|
+
fail ArgumentError, 'tokens must be an instance of Array' unless tokens.is_a?(Array)
|
11
|
+
j_instance.find(tokens.to_java(:String)).to_ary
|
8
12
|
end
|
9
13
|
end
|
10
14
|
end
|
Binary file
|
Binary file
|
data/lib/open_nlp/parser.rb
CHANGED
@@ -1,52 +1,62 @@
|
|
1
1
|
module OpenNlp
|
2
2
|
class Parser < Tool
|
3
|
-
def initialize(
|
4
|
-
|
5
|
-
|
3
|
+
def initialize(parser_model, token_model)
|
4
|
+
unless parser_model.is_a?(OpenNlp::Model)
|
5
|
+
fail ArgumentError, 'parser_model must be an OpenNlp::Model'
|
6
|
+
end
|
6
7
|
|
7
|
-
|
8
|
+
unless token_model.is_a?(Model::Tokenizer)
|
9
|
+
fail ArgumentError, 'token_model must be an OpenNlp::Tokenizer::Model'
|
10
|
+
end
|
8
11
|
|
12
|
+
@j_instance = Java::opennlp.tools.parser.ParserFactory.create(parser_model.j_model)
|
9
13
|
@tokenizer = Tokenizer.new(token_model)
|
10
14
|
end
|
11
15
|
|
16
|
+
# Parses text into an instance of the Parse class
|
17
|
+
#
|
18
|
+
# @param [String] text text to parse
|
19
|
+
# @return [OpenNlp::Parser::Parse]
|
12
20
|
def parse(text)
|
13
|
-
raise ArgumentError,
|
14
|
-
|
15
|
-
|
16
|
-
parse_obj = Java::opennlp.tools.parser.Parse.new(
|
17
|
-
text.to_java(:String),
|
18
|
-
Java::opennlp.tools.util.Span.new(0, text.size),
|
19
|
-
Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
|
20
|
-
1.to_java(:Double), # probability ?
|
21
|
-
0.to_java(:Integer) # the token index of the head of this parse
|
22
|
-
)
|
23
|
-
|
24
|
-
tokens = @tokenizer.tokenize(text)
|
25
|
-
|
26
|
-
tokens.each_with_index do |tok, i|
|
27
|
-
start = get_token_offset text, tokens, i
|
28
|
-
|
29
|
-
parse_obj.insert Java::opennlp.tools.parser.Parse.new(
|
30
|
-
text.to_java(:String),
|
31
|
-
Java::opennlp.tools.util.Span.new(start, start + tok.size),
|
32
|
-
Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE.to_java(:String),
|
33
|
-
0.to_java(:Double),
|
34
|
-
i.to_java(:Integer)
|
35
|
-
)
|
36
|
-
end
|
37
|
-
|
38
|
-
Parser::Parse.new(@j_instance.parse(parse_obj))
|
21
|
+
raise ArgumentError, 'passed text must be a String' unless text.is_a?(String)
|
22
|
+
text.empty? ? {} : parse_tokens(tokenizer.tokenize(text), text)
|
39
23
|
end
|
40
24
|
|
41
25
|
private
|
26
|
+
|
27
|
+
attr_reader :tokenizer
|
28
|
+
|
42
29
|
def get_token_offset(text, tokens, index)
|
43
30
|
offset = 0
|
31
|
+
return offset unless index > 0
|
44
32
|
|
45
33
|
for i in (1..index) do
|
46
34
|
offset = text.index tokens[i], offset + tokens[i - 1].size
|
47
|
-
end
|
48
|
-
|
35
|
+
end
|
49
36
|
offset
|
50
37
|
end
|
38
|
+
|
39
|
+
def build_parse_obj(text, span_start, span_end, type=Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE, probability=1, token_index=0)
|
40
|
+
Java::opennlp.tools.parser.Parse.new(
|
41
|
+
text.to_java(:String),
|
42
|
+
Java::opennlp.tools.util.Span.new(span_start, span_end),
|
43
|
+
type.to_java(:String),
|
44
|
+
probability.to_java(:Double), # probability ?
|
45
|
+
token_index.to_java(:Integer) # the token index of the head of this parse
|
46
|
+
)
|
47
|
+
end
|
48
|
+
|
49
|
+
def parse_tokens(tokens, text)
|
50
|
+
parse_obj = build_parse_obj(text, 0, text.size)
|
51
|
+
parse_type = Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
52
|
+
|
53
|
+
tokens.each_with_index do |tok, i|
|
54
|
+
start = get_token_offset(text, tokens, i)
|
55
|
+
token_parse = build_parse_obj(text, start, start + tok.size, parse_type, 0, i)
|
56
|
+
parse_obj.insert(token_parse)
|
57
|
+
end
|
58
|
+
|
59
|
+
Parser::Parse.new(j_instance.parse(parse_obj))
|
60
|
+
end
|
51
61
|
end
|
52
|
-
end
|
62
|
+
end
|
@@ -13,27 +13,20 @@ module OpenNlp
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def tree_bank_string
|
16
|
-
span = j_instance.getSpan
|
17
|
-
|
18
|
-
type = j_instance.getType
|
19
|
-
start = span.getStart
|
16
|
+
span, text, type, res = j_instance.getSpan, j_instance.getText, j_instance.getType, ''
|
17
|
+
start = span.getStart
|
20
18
|
|
21
|
-
res
|
19
|
+
res << "(#{type} " if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
22
20
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
subtree = self.class.new(c).tree_bank_string
|
30
|
-
res << subtree if subtree
|
31
|
-
start = s.getEnd
|
21
|
+
j_instance.getChildren.each do |child|
|
22
|
+
child_span = child.span
|
23
|
+
res << text[start..child_span.getStart-1] if start < child_span.getStart
|
24
|
+
res << self.class.new(child).tree_bank_string
|
25
|
+
start = child_span.getEnd
|
32
26
|
end
|
33
27
|
|
34
28
|
res << text[start..span.getEnd-1] if start < span.getEnd
|
35
|
-
|
36
|
-
res << ")" unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
29
|
+
res << ")" if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
37
30
|
|
38
31
|
res
|
39
32
|
end
|
@@ -41,14 +34,12 @@ module OpenNlp
|
|
41
34
|
def code_tree
|
42
35
|
kids = j_instance.getChildren
|
43
36
|
|
44
|
-
kids.
|
45
|
-
data
|
37
|
+
kids.each_with_object([]) do |kid, acc|
|
38
|
+
data = { :type => kid.getType, :parent_type => self.j_instance.getType, :token => kid.toString }
|
46
39
|
subtree = self.class.new(kid).code_tree
|
47
40
|
data[:children] = subtree unless subtree.empty?
|
48
41
|
acc << data
|
49
|
-
|
50
|
-
acc
|
51
42
|
end
|
52
43
|
end
|
53
44
|
end
|
54
|
-
end
|
45
|
+
end
|
data/lib/open_nlp/pos_tagger.rb
CHANGED
@@ -3,8 +3,11 @@ module OpenNlp
|
|
3
3
|
self.java_class = Java::opennlp.tools.postag.POSTaggerME
|
4
4
|
|
5
5
|
def tag(tokens)
|
6
|
-
|
7
|
-
|
6
|
+
unless (tokens.is_a?(Array) || tokens.is_a?(String))
|
7
|
+
fail ArgumentError, 'tokens must be an instance of String or Array'
|
8
|
+
end
|
9
|
+
|
10
|
+
j_instance.tag(tokens.to_java(:String))
|
8
11
|
end
|
9
12
|
end
|
10
13
|
end
|
@@ -2,14 +2,24 @@ module OpenNlp
|
|
2
2
|
class SentenceDetector < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.sentdetect.SentenceDetectorME
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
# Detects sentences in a string
|
6
|
+
#
|
7
|
+
# @param [String] str string to detect sentences in
|
8
|
+
# @return [Array<String>] array of detected sentences
|
9
|
+
def detect(str)
|
10
|
+
fail ArgumentError, 'str must be a String' unless str.is_a?(String)
|
11
|
+
j_instance.sentDetect(str).to_ary
|
8
12
|
end
|
9
13
|
|
10
|
-
|
11
|
-
|
12
|
-
|
14
|
+
# Detects sentences in a string and returns array of spans
|
15
|
+
#
|
16
|
+
# @param [String] str
|
17
|
+
# @return [Array<OpenNlp::Util::Span>] array of spans for detected sentences
|
18
|
+
def pos_detect(str)
|
19
|
+
fail ArgumentError, 'str must be a String' unless str.is_a?(String)
|
20
|
+
j_instance.sentPosDetect(str).map do |span|
|
21
|
+
OpenNlp::Util::Span.new(span.getStart, span.getEnd)
|
22
|
+
end
|
13
23
|
end
|
14
24
|
end
|
15
25
|
end
|
data/lib/open_nlp/tokenizer.rb
CHANGED
@@ -2,14 +2,19 @@ module OpenNlp
|
|
2
2
|
class Tokenizer < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.tokenize.TokenizerME
|
4
4
|
|
5
|
+
# Tokenizes a string
|
6
|
+
#
|
7
|
+
# @param [String] str string to tokenize
|
8
|
+
# @return [Array] array of string tokens
|
5
9
|
def tokenize(str)
|
6
|
-
|
7
|
-
|
10
|
+
fail ArgumentError, 'str must be a String' unless str.is_a?(String)
|
11
|
+
j_instance.tokenize(str).to_ary
|
8
12
|
end
|
9
13
|
|
10
14
|
private
|
15
|
+
|
11
16
|
def get_last_probabilities
|
12
|
-
|
17
|
+
j_instance.getTokenProbabilities.to_ary
|
13
18
|
end
|
14
19
|
end
|
15
20
|
end
|