open_nlp 0.0.7-java → 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +2 -2
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/Gemfile.lock +31 -0
- data/README.md +8 -1
- data/lib/open_nlp.rb +3 -3
- data/lib/open_nlp/categorizer.rb +7 -3
- data/lib/open_nlp/chunker.rb +19 -8
- data/lib/open_nlp/model.rb +13 -9
- data/lib/open_nlp/named_entity_detector.rb +6 -2
- data/lib/open_nlp/opennlp-maxent-3.0.3.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.3.jar +0 -0
- data/lib/open_nlp/parser.rb +43 -33
- data/lib/open_nlp/parser/parse.rb +12 -21
- data/lib/open_nlp/pos_tagger.rb +5 -2
- data/lib/open_nlp/sentence_detector.rb +16 -6
- data/lib/open_nlp/tokenizer.rb +8 -3
- data/lib/open_nlp/tool.rb +1 -1
- data/lib/open_nlp/util.rb +1 -2
- data/lib/open_nlp/util/span.rb +5 -5
- data/lib/open_nlp/version.rb +1 -1
- data/spec/categorizer_spec.rb +24 -22
- data/spec/chunker_spec.rb +29 -28
- data/spec/model/chunker_spec.rb +12 -15
- data/spec/model/detokenizer_spec.rb +11 -14
- data/spec/model/named_entity_detector_spec.rb +11 -14
- data/spec/model/pos_tagger_spec.rb +12 -15
- data/spec/model/sentence_detector_spec.rb +11 -14
- data/spec/model/tokenizer_spec.rb +11 -14
- data/spec/named_entity_detector_spec.rb +28 -27
- data/spec/parser/parse_spec.rb +64 -56
- data/spec/parser_spec.rb +26 -21
- data/spec/pos_tagger_spec.rb +22 -23
- data/spec/sentence_detector_spec.rb +39 -30
- data/spec/spec_helper.rb +1 -1
- data/spec/tokenizer_spec.rb +26 -22
- metadata +16 -17
- data/lib/open_nlp/opennlp-maxent-3.0.2-incubating.jar +0 -0
- data/lib/open_nlp/opennlp-tools-1.5.2-incubating.jar +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 57b451e151cf3a0ed7d28b9451f48b7ec7a31288
|
4
|
+
data.tar.gz: fc5e0b61f0baf6673a4a3dbc05e4c7828d2a935c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 420670f2e006071f47ba5d18f6152427ba49745b2647a7b5e7520acc987575ce02cc0e3cecbd3417f65d7d4c30dfe63740ac9c422a1f43d9a0ed5bb1b6a572a1
|
7
|
+
data.tar.gz: 059bf4f29726f027b634d802cb8f4ff1b1cb9b04ef6315a4d5e77d44098e1eed1a651edd97cc7d37a6ed4b91a6a6e03bb571aef45e996919fcfbec481ce5f269
|
data/.gitignore
CHANGED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
jruby-9.0.5.0
|
data/.travis.yml
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
|
5
|
+
GEM
|
6
|
+
remote: https://rubygems.org/
|
7
|
+
specs:
|
8
|
+
diff-lcs (1.2.5)
|
9
|
+
rspec (3.4.0)
|
10
|
+
rspec-core (~> 3.4.0)
|
11
|
+
rspec-expectations (~> 3.4.0)
|
12
|
+
rspec-mocks (~> 3.4.0)
|
13
|
+
rspec-core (3.4.4)
|
14
|
+
rspec-support (~> 3.4.0)
|
15
|
+
rspec-expectations (3.4.0)
|
16
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
17
|
+
rspec-support (~> 3.4.0)
|
18
|
+
rspec-mocks (3.4.1)
|
19
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
20
|
+
rspec-support (~> 3.4.0)
|
21
|
+
rspec-support (3.4.1)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
java
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
open_nlp!
|
28
|
+
rspec
|
29
|
+
|
30
|
+
BUNDLED WITH
|
31
|
+
1.11.2
|
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# OpenNlp
|
2
2
|
|
3
|
+
[Build Status](https://travis-ci.org/hck/open_nlp) [Code Climate](https://codeclimate.com/github/hck/open_nlp)
|
4
|
+
|
3
5
|
A JRuby wrapper for the Apache OpenNLP tools library that allows you to execute common natural language processing tasks, such as
|
4
6
|
* sentence detection
|
5
7
|
* tokenize
|
@@ -35,8 +37,13 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
35
37
|
|
36
38
|
sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
|
37
39
|
sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
|
40
|
+
|
41
|
+
# get sentences as array of strings
|
38
42
|
sentence_detector.detect('The red fox sleeps soundly.')
|
39
43
|
|
44
|
+
# get array of OpenNlp::Util::Span objects:
|
45
|
+
sentence_detector.pos_detect('"The sky is blue. The Grass is green."')
|
46
|
+
|
40
47
|
### Tokenize
|
41
48
|
|
42
49
|
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
@@ -92,4 +99,4 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
92
99
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
93
100
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
94
101
|
4. Push to the branch (`git push origin my-new-feature`)
|
95
|
-
5. Create new Pull Request
|
102
|
+
5. Create new Pull Request
|
data/lib/open_nlp.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'open_nlp/version'
|
2
2
|
|
3
3
|
require 'java'
|
4
|
-
require 'open_nlp/opennlp-tools-1.5.
|
5
|
-
require 'open_nlp/opennlp-maxent-3.0.
|
4
|
+
require 'open_nlp/opennlp-tools-1.5.3.jar'
|
5
|
+
require 'open_nlp/opennlp-maxent-3.0.3.jar'
|
6
6
|
|
7
7
|
require 'open_nlp/java_class'
|
8
8
|
|
@@ -28,4 +28,4 @@ require 'open_nlp/sentence_detector'
|
|
28
28
|
require 'open_nlp/tokenizer'
|
29
29
|
require 'open_nlp/chunker'
|
30
30
|
require 'open_nlp/parser'
|
31
|
-
require 'open_nlp/parser/parse'
|
31
|
+
require 'open_nlp/parser/parse'
|
data/lib/open_nlp/categorizer.rb
CHANGED
@@ -2,11 +2,15 @@ module OpenNlp
|
|
2
2
|
class Categorizer < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.doccat.DocumentCategorizerME
|
4
4
|
|
5
|
+
# Categorizes a string passed as parameter to one of the categories
|
6
|
+
#
|
7
|
+
# @param [String] str string to be categorized
|
8
|
+
# @return [String] category
|
5
9
|
def categorize(str)
|
6
|
-
|
10
|
+
fail ArgumentError, 'str param must be a String' unless str.is_a?(String)
|
7
11
|
|
8
|
-
outcomes =
|
9
|
-
|
12
|
+
outcomes = j_instance.categorize(str)
|
13
|
+
j_instance.getBestCategory(outcomes)
|
10
14
|
end
|
11
15
|
end
|
12
16
|
end
|
data/lib/open_nlp/chunker.rb
CHANGED
@@ -5,27 +5,38 @@ module OpenNlp
|
|
5
5
|
def initialize(model, token_model, pos_model)
|
6
6
|
super(model)
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
unless token_model.is_a?(Model::Tokenizer)
|
9
|
+
fail ArgumentError, 'token model must be an OpenNlp::Tokenizer::Model'
|
10
|
+
end
|
11
|
+
|
12
|
+
unless pos_model.is_a?(Model::POSTagger)
|
13
|
+
fail ArgumentError, 'pos model must be an OpenNlp::POSTagger::Model'
|
14
|
+
end
|
10
15
|
|
11
16
|
@tokenizer = Tokenizer.new(token_model)
|
12
17
|
@pos_tagger = POSTagger.new(pos_model)
|
13
18
|
end
|
14
19
|
|
20
|
+
# Chunks a string into part-of-sentence pieces
|
21
|
+
#
|
22
|
+
# @param [String] str string to chunk
|
23
|
+
# @return [Array] array of chunks with part-of-sentence information
|
15
24
|
def chunk(str)
|
16
|
-
|
25
|
+
fail ArgumentError, 'str must be a String' unless str.is_a?(String)
|
17
26
|
|
18
|
-
tokens =
|
19
|
-
pos_tags =
|
27
|
+
tokens = tokenizer.tokenize(str)
|
28
|
+
pos_tags = pos_tagger.tag(tokens).to_ary
|
20
29
|
|
21
|
-
chunks =
|
30
|
+
chunks = j_instance.chunk(tokens.to_java(:String), pos_tags.to_java(:String)).to_ary
|
22
31
|
|
23
32
|
build_chunks(chunks, tokens, pos_tags)
|
24
33
|
end
|
25
34
|
|
26
35
|
private
|
36
|
+
|
37
|
+
attr_reader :tokenizer, :pos_tagger
|
38
|
+
|
27
39
|
def build_chunks(chunks, tokens, pos_tags)
|
28
|
-
# data[i] = [token, pos_tag, chunk_val]
|
29
40
|
data = tokens.zip(pos_tags, chunks)
|
30
41
|
|
31
42
|
data.inject([]) do |acc, val|
|
@@ -45,7 +56,7 @@ module OpenNlp
|
|
45
56
|
end
|
46
57
|
|
47
58
|
def get_last_probabilities
|
48
|
-
|
59
|
+
j_instance.probs.to_ary
|
49
60
|
end
|
50
61
|
end
|
51
62
|
end
|
data/lib/open_nlp/model.rb
CHANGED
@@ -5,16 +5,20 @@ module OpenNlp
|
|
5
5
|
attr_reader :j_model
|
6
6
|
|
7
7
|
def initialize(model)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
java.io.FileInputStream.new(model)
|
13
|
-
else
|
14
|
-
raise ArgumentError, "Model must be either a string or a java.io.FileInputStream"
|
15
|
-
end
|
8
|
+
@j_model = self.class.java_class.new(model_stream(model))
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
16
12
|
|
17
|
-
|
13
|
+
def model_stream(model)
|
14
|
+
case model
|
15
|
+
when java.io.FileInputStream
|
16
|
+
model
|
17
|
+
when String
|
18
|
+
java.io.FileInputStream.new(model)
|
19
|
+
else
|
20
|
+
fail ArgumentError, 'Model must be either a string or a java.io.FileInputStream'
|
21
|
+
end
|
18
22
|
end
|
19
23
|
end
|
20
24
|
end
|
@@ -2,9 +2,13 @@ module OpenNlp
|
|
2
2
|
class NamedEntityDetector < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.namefind.NameFinderME
|
4
4
|
|
5
|
+
# Detects names for provided array of tokens
|
6
|
+
#
|
7
|
+
# @param [Array<String>] tokens tokens to run name detection on
|
8
|
+
# @return [Array<Java::opennlp.tools.util.Span>] names detected
|
5
9
|
def detect(tokens)
|
6
|
-
|
7
|
-
|
10
|
+
fail ArgumentError, 'tokens must be an instance of Array' unless tokens.is_a?(Array)
|
11
|
+
j_instance.find(tokens.to_java(:String)).to_ary
|
8
12
|
end
|
9
13
|
end
|
10
14
|
end
|
Binary file
|
Binary file
|
data/lib/open_nlp/parser.rb
CHANGED
@@ -1,52 +1,62 @@
|
|
1
1
|
module OpenNlp
|
2
2
|
class Parser < Tool
|
3
|
-
def initialize(
|
4
|
-
|
5
|
-
|
3
|
+
def initialize(parser_model, token_model)
|
4
|
+
unless parser_model.is_a?(OpenNlp::Model)
|
5
|
+
fail ArgumentError, 'parser_model must be an OpenNlp::Model'
|
6
|
+
end
|
6
7
|
|
7
|
-
|
8
|
+
unless token_model.is_a?(Model::Tokenizer)
|
9
|
+
fail ArgumentError, 'token_model must be an OpenNlp::Tokenizer::Model'
|
10
|
+
end
|
8
11
|
|
12
|
+
@j_instance = Java::opennlp.tools.parser.ParserFactory.create(parser_model.j_model)
|
9
13
|
@tokenizer = Tokenizer.new(token_model)
|
10
14
|
end
|
11
15
|
|
16
|
+
# Parses text into instance of Parse class
|
17
|
+
#
|
18
|
+
# @param [String] text text to parse
|
19
|
+
# @return [OpenNlp::Parser::Parse]
|
12
20
|
def parse(text)
|
13
|
-
raise ArgumentError,
|
14
|
-
|
15
|
-
|
16
|
-
parse_obj = Java::opennlp.tools.parser.Parse.new(
|
17
|
-
text.to_java(:String),
|
18
|
-
Java::opennlp.tools.util.Span.new(0, text.size),
|
19
|
-
Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE.to_java(:String),
|
20
|
-
1.to_java(:Double), # probability ?
|
21
|
-
0.to_java(:Integer) # the token index of the head of this parse
|
22
|
-
)
|
23
|
-
|
24
|
-
tokens = @tokenizer.tokenize(text)
|
25
|
-
|
26
|
-
tokens.each_with_index do |tok, i|
|
27
|
-
start = get_token_offset text, tokens, i
|
28
|
-
|
29
|
-
parse_obj.insert Java::opennlp.tools.parser.Parse.new(
|
30
|
-
text.to_java(:String),
|
31
|
-
Java::opennlp.tools.util.Span.new(start, start + tok.size),
|
32
|
-
Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE.to_java(:String),
|
33
|
-
0.to_java(:Double),
|
34
|
-
i.to_java(:Integer)
|
35
|
-
)
|
36
|
-
end
|
37
|
-
|
38
|
-
Parser::Parse.new(@j_instance.parse(parse_obj))
|
21
|
+
raise ArgumentError, 'passed text must be a String' unless text.is_a?(String)
|
22
|
+
text.empty? ? {} : parse_tokens(tokenizer.tokenize(text), text)
|
39
23
|
end
|
40
24
|
|
41
25
|
private
|
26
|
+
|
27
|
+
attr_reader :tokenizer
|
28
|
+
|
42
29
|
def get_token_offset(text, tokens, index)
|
43
30
|
offset = 0
|
31
|
+
return offset unless index > 0
|
44
32
|
|
45
33
|
for i in (1..index) do
|
46
34
|
offset = text.index tokens[i], offset + tokens[i - 1].size
|
47
|
-
end
|
48
|
-
|
35
|
+
end
|
49
36
|
offset
|
50
37
|
end
|
38
|
+
|
39
|
+
def build_parse_obj(text, span_start, span_end, type=Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE, probability=1, token_index=0)
|
40
|
+
Java::opennlp.tools.parser.Parse.new(
|
41
|
+
text.to_java(:String),
|
42
|
+
Java::opennlp.tools.util.Span.new(span_start, span_end),
|
43
|
+
type.to_java(:String),
|
44
|
+
probability.to_java(:Double), # probability ?
|
45
|
+
token_index.to_java(:Integer) # the token index of the head of this parse
|
46
|
+
)
|
47
|
+
end
|
48
|
+
|
49
|
+
def parse_tokens(tokens, text)
|
50
|
+
parse_obj = build_parse_obj(text, 0, text.size)
|
51
|
+
parse_type = Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
52
|
+
|
53
|
+
tokens.each_with_index do |tok, i|
|
54
|
+
start = get_token_offset(text, tokens, i)
|
55
|
+
token_parse = build_parse_obj(text, start, start + tok.size, parse_type, 0, i)
|
56
|
+
parse_obj.insert(token_parse)
|
57
|
+
end
|
58
|
+
|
59
|
+
Parser::Parse.new(j_instance.parse(parse_obj))
|
60
|
+
end
|
51
61
|
end
|
52
|
-
end
|
62
|
+
end
|
@@ -13,27 +13,20 @@ module OpenNlp
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def tree_bank_string
|
16
|
-
span = j_instance.getSpan
|
17
|
-
|
18
|
-
type = j_instance.getType
|
19
|
-
start = span.getStart
|
16
|
+
span, text, type, res = j_instance.getSpan, j_instance.getText, j_instance.getType, ''
|
17
|
+
start = span.getStart
|
20
18
|
|
21
|
-
res
|
19
|
+
res << "(#{type} " if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
22
20
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
subtree = self.class.new(c).tree_bank_string
|
30
|
-
res << subtree if subtree
|
31
|
-
start = s.getEnd
|
21
|
+
j_instance.getChildren.each do |child|
|
22
|
+
child_span = child.span
|
23
|
+
res << text[start..child_span.getStart-1] if start < child_span.getStart
|
24
|
+
res << self.class.new(child).tree_bank_string
|
25
|
+
start = child_span.getEnd
|
32
26
|
end
|
33
27
|
|
34
28
|
res << text[start..span.getEnd-1] if start < span.getEnd
|
35
|
-
|
36
|
-
res << ")" unless type == Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
29
|
+
res << ")" if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
37
30
|
|
38
31
|
res
|
39
32
|
end
|
@@ -41,14 +34,12 @@ module OpenNlp
|
|
41
34
|
def code_tree
|
42
35
|
kids = j_instance.getChildren
|
43
36
|
|
44
|
-
kids.
|
45
|
-
data
|
37
|
+
kids.each_with_object([]) do |kid, acc|
|
38
|
+
data = { :type => kid.getType, :parent_type => self.j_instance.getType, :token => kid.toString }
|
46
39
|
subtree = self.class.new(kid).code_tree
|
47
40
|
data[:children] = subtree unless subtree.empty?
|
48
41
|
acc << data
|
49
|
-
|
50
|
-
acc
|
51
42
|
end
|
52
43
|
end
|
53
44
|
end
|
54
|
-
end
|
45
|
+
end
|
data/lib/open_nlp/pos_tagger.rb
CHANGED
@@ -3,8 +3,11 @@ module OpenNlp
|
|
3
3
|
self.java_class = Java::opennlp.tools.postag.POSTaggerME
|
4
4
|
|
5
5
|
def tag(tokens)
|
6
|
-
|
7
|
-
|
6
|
+
unless (tokens.is_a?(Array) || tokens.is_a?(String))
|
7
|
+
fail ArgumentError, 'tokens must be an instance of String or Array'
|
8
|
+
end
|
9
|
+
|
10
|
+
j_instance.tag(tokens.to_java(:String))
|
8
11
|
end
|
9
12
|
end
|
10
13
|
end
|
@@ -2,14 +2,24 @@ module OpenNlp
|
|
2
2
|
class SentenceDetector < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.sentdetect.SentenceDetectorME
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
# Detects sentences in a string
|
6
|
+
#
|
7
|
+
# @param [String] str string to detect sentences in
|
8
|
+
# @return [Array<String>] array of detected sentences
|
9
|
+
def detect(str)
|
10
|
+
fail ArgumentError, 'str must be a String' unless str.is_a?(String)
|
11
|
+
j_instance.sentDetect(str).to_ary
|
8
12
|
end
|
9
13
|
|
10
|
-
|
11
|
-
|
12
|
-
|
14
|
+
# Detects sentences in a string and returns array of spans
|
15
|
+
#
|
16
|
+
# @param [String] str
|
17
|
+
# @return [Array<OpenNlp::Util::Span>] array of spans for detected sentences
|
18
|
+
def pos_detect(str)
|
19
|
+
fail ArgumentError, 'str must be a String' unless str.is_a?(String)
|
20
|
+
j_instance.sentPosDetect(str).map do |span|
|
21
|
+
OpenNlp::Util::Span.new(span.getStart, span.getEnd)
|
22
|
+
end
|
13
23
|
end
|
14
24
|
end
|
15
25
|
end
|
data/lib/open_nlp/tokenizer.rb
CHANGED
@@ -2,14 +2,19 @@ module OpenNlp
|
|
2
2
|
class Tokenizer < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.tokenize.TokenizerME
|
4
4
|
|
5
|
+
# Tokenizes a string
|
6
|
+
#
|
7
|
+
# @param [String] str string to tokenize
|
8
|
+
# @return [Array] array of string tokens
|
5
9
|
def tokenize(str)
|
6
|
-
|
7
|
-
|
10
|
+
fail ArgumentError, 'str must be a String' unless str.is_a?(String)
|
11
|
+
j_instance.tokenize(str).to_ary
|
8
12
|
end
|
9
13
|
|
10
14
|
private
|
15
|
+
|
11
16
|
def get_last_probabilities
|
12
|
-
|
17
|
+
j_instance.getTokenProbabilities.to_ary
|
13
18
|
end
|
14
19
|
end
|
15
20
|
end
|