open_nlp 0.2.0-java → 0.3.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rspec +1 -0
- data/.rubocop.yml +24 -0
- data/.ruby-version +1 -1
- data/.travis.yml +1 -3
- data/Gemfile +2 -1
- data/Gemfile.lock +32 -13
- data/Rakefile +1 -1
- data/lib/open_nlp/categorizer.rb +2 -2
- data/lib/open_nlp/chunker.rb +21 -20
- data/lib/open_nlp/java_class.rb +2 -8
- data/lib/open_nlp/model.rb +4 -1
- data/lib/open_nlp/model/categorizer.rb +1 -1
- data/lib/open_nlp/model/chunker.rb +1 -1
- data/lib/open_nlp/model/detokenizer.rb +1 -1
- data/lib/open_nlp/model/parser.rb +1 -1
- data/lib/open_nlp/model/pos_tagger.rb +1 -1
- data/lib/open_nlp/model/sentence_detector.rb +1 -1
- data/lib/open_nlp/model/tokenizer.rb +1 -1
- data/lib/open_nlp/named_entity_detector.rb +2 -1
- data/lib/open_nlp/parser.rb +13 -12
- data/lib/open_nlp/parser/parse.rb +28 -7
- data/lib/open_nlp/pos_tagger.rb +6 -3
- data/lib/open_nlp/sentence_detector.rb +5 -3
- data/lib/open_nlp/tokenizer.rb +3 -2
- data/lib/open_nlp/tool.rb +6 -2
- data/lib/open_nlp/util/span.rb +25 -9
- data/lib/open_nlp/version.rb +1 -1
- data/open_nlp.gemspec +10 -11
- data/spec/model/tokenizer_spec.rb +1 -1
- data/spec/named_entity_detector_spec.rb +1 -1
- data/spec/parser/parse_spec.rb +40 -40
- data/spec/parser_spec.rb +2 -2
- data/spec/pos_tagger_spec.rb +2 -2
- data/spec/spec_helper.rb +25 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 611fa39e5357a43ac259dc113b299aede41e5379a1912dcd0bb32120da05356d
|
4
|
+
data.tar.gz: 0ef94231098429a66a11f8d3ab5dc5708156fcc425caa5ab4e29278f5233ab1f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6e5fc1bbbd6059d818bbe3abc9408f497b845dd6e4a314b19a9d47a8e85a3435cdbe8d6c9fd2610f0a643a26974d4b20a0fc995c6660549149f9acaa688d3f52
|
7
|
+
data.tar.gz: c416e14a29512f1935a00289625ea162fac34f8dad4d320a8c8cf940253a6f9ac5c845e07bda12deca699ec5da809bbc733d9e392795e2895faa78e0d826a76c
|
data/.gitignore
CHANGED
data/.rspec
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
Metrics/AbcSize:
|
2
|
+
Max: 41
|
3
|
+
|
4
|
+
Metrics/BlockLength:
|
5
|
+
Exclude:
|
6
|
+
- spec/**/*.rb
|
7
|
+
|
8
|
+
Metrics/LineLength:
|
9
|
+
Max: 153
|
10
|
+
|
11
|
+
Metrics/MethodLength:
|
12
|
+
Max: 15
|
13
|
+
|
14
|
+
Metrics/ParameterLists:
|
15
|
+
Max: 6
|
16
|
+
|
17
|
+
Style/ClassAndModuleChildren:
|
18
|
+
Enabled: false
|
19
|
+
|
20
|
+
Style/ColonMethodCall:
|
21
|
+
Enabled: false
|
22
|
+
|
23
|
+
Style/Documentation:
|
24
|
+
Enabled: false
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
jruby-9.
|
1
|
+
jruby-9.2.4.0
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,24 +1,42 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
+
open_nlp (0.3.0-java)
|
4
5
|
|
5
6
|
GEM
|
6
7
|
remote: https://rubygems.org/
|
7
8
|
specs:
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
9
|
+
ast (2.4.0)
|
10
|
+
diff-lcs (1.3)
|
11
|
+
jaro_winkler (1.5.1-java)
|
12
|
+
parallel (1.12.1)
|
13
|
+
parser (2.5.3.0)
|
14
|
+
ast (~> 2.4.0)
|
15
|
+
powerpack (0.1.2)
|
16
|
+
rainbow (3.0.0)
|
17
|
+
rspec (3.8.0)
|
18
|
+
rspec-core (~> 3.8.0)
|
19
|
+
rspec-expectations (~> 3.8.0)
|
20
|
+
rspec-mocks (~> 3.8.0)
|
21
|
+
rspec-core (3.8.0)
|
22
|
+
rspec-support (~> 3.8.0)
|
23
|
+
rspec-expectations (3.8.2)
|
16
24
|
diff-lcs (>= 1.2.0, < 2.0)
|
17
|
-
rspec-support (~> 3.
|
18
|
-
rspec-mocks (3.
|
25
|
+
rspec-support (~> 3.8.0)
|
26
|
+
rspec-mocks (3.8.0)
|
19
27
|
diff-lcs (>= 1.2.0, < 2.0)
|
20
|
-
rspec-support (~> 3.
|
21
|
-
rspec-support (3.
|
28
|
+
rspec-support (~> 3.8.0)
|
29
|
+
rspec-support (3.8.0)
|
30
|
+
rubocop (0.60.0)
|
31
|
+
jaro_winkler (~> 1.5.1)
|
32
|
+
parallel (~> 1.10)
|
33
|
+
parser (>= 2.5, != 2.5.1.1)
|
34
|
+
powerpack (~> 0.1)
|
35
|
+
rainbow (>= 2.2.2, < 4.0)
|
36
|
+
ruby-progressbar (~> 1.7)
|
37
|
+
unicode-display_width (~> 1.4.0)
|
38
|
+
ruby-progressbar (1.10.0)
|
39
|
+
unicode-display_width (1.4.0)
|
22
40
|
|
23
41
|
PLATFORMS
|
24
42
|
java
|
@@ -26,6 +44,7 @@ PLATFORMS
|
|
26
44
|
DEPENDENCIES
|
27
45
|
open_nlp!
|
28
46
|
rspec
|
47
|
+
rubocop
|
29
48
|
|
30
49
|
BUNDLED WITH
|
31
|
-
1.
|
50
|
+
1.17.1
|
data/Rakefile
CHANGED
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
data/lib/open_nlp/categorizer.rb
CHANGED
@@ -7,10 +7,10 @@ module OpenNlp
|
|
7
7
|
# @param [String] str string to be categorized
|
8
8
|
# @return [String] category
|
9
9
|
def categorize(str)
|
10
|
-
|
10
|
+
raise ArgumentError, 'str param must be a String' unless str.is_a?(String)
|
11
11
|
|
12
12
|
outcomes = j_instance.categorize(str)
|
13
13
|
j_instance.getBestCategory(outcomes)
|
14
14
|
end
|
15
15
|
end
|
16
|
-
end
|
16
|
+
end
|
data/lib/open_nlp/chunker.rb
CHANGED
@@ -2,16 +2,19 @@ module OpenNlp
|
|
2
2
|
class Chunker < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.chunker.ChunkerME
|
4
4
|
|
5
|
+
# Initializes new instance of Chunker
|
6
|
+
#
|
7
|
+
# @param [OpenNlp::Model] model chunker model
|
8
|
+
# @param [Model::Tokenizer] token_model tokenizer model
|
9
|
+
# @param [Model::POSTagger] pos_model part-of-speech tagging model
|
5
10
|
def initialize(model, token_model, pos_model)
|
6
11
|
super(model)
|
7
12
|
|
8
|
-
|
9
|
-
|
10
|
-
end
|
13
|
+
token_model.is_a?(Model::Tokenizer) ||
|
14
|
+
raise(ArgumentError, 'token model must be an OpenNlp::Tokenizer::Model')
|
11
15
|
|
12
|
-
|
13
|
-
|
14
|
-
end
|
16
|
+
pos_model.is_a?(Model::POSTagger) ||
|
17
|
+
raise(ArgumentError, 'pos model must be an OpenNlp::POSTagger::Model')
|
15
18
|
|
16
19
|
@tokenizer = Tokenizer.new(token_model)
|
17
20
|
@pos_tagger = POSTagger.new(pos_model)
|
@@ -22,7 +25,7 @@ module OpenNlp
|
|
22
25
|
# @param [String] str string to chunk
|
23
26
|
# @return [Array] array of chunks with part-of-sentence information
|
24
27
|
def chunk(str)
|
25
|
-
|
28
|
+
raise ArgumentError, 'str must be a String' unless str.is_a?(String)
|
26
29
|
|
27
30
|
tokens = tokenizer.tokenize(str)
|
28
31
|
pos_tags = pos_tagger.tag(tokens).to_ary
|
@@ -39,24 +42,22 @@ module OpenNlp
|
|
39
42
|
def build_chunks(chunks, tokens, pos_tags)
|
40
43
|
data = tokens.zip(pos_tags, chunks)
|
41
44
|
|
42
|
-
data.
|
45
|
+
data.each_with_object([]) do |val, acc|
|
43
46
|
chunk = val[2]
|
44
|
-
acc << [{val[0] => val[1]}] if chunk[0] == 'B' # add token to chunk if it is a start of chunk
|
45
|
-
|
46
|
-
if chunk[0]
|
47
|
-
if acc.last
|
48
|
-
acc.last << {val[0] => val[1]} # add token to chunk if it is a continuation of chunk
|
49
|
-
else
|
50
|
-
acc << [{val[0] => val[1]}] # add token to new chunk if no chunks exists
|
51
|
-
end
|
52
|
-
end
|
47
|
+
acc << [{ val[0] => val[1] }] if chunk[0] == 'B' # add token to chunk if it is a start of chunk
|
48
|
+
|
49
|
+
next if chunk[0] != 'I'
|
53
50
|
|
54
|
-
acc
|
51
|
+
if acc.last
|
52
|
+
acc.last << { val[0] => val[1] } # add token to chunk if it is a continuation of chunk
|
53
|
+
else
|
54
|
+
acc << [{ val[0] => val[1] }] # add token to new chunk if no chunks exists
|
55
|
+
end
|
55
56
|
end
|
56
57
|
end
|
57
58
|
|
58
|
-
def
|
59
|
+
def last_probabilities
|
59
60
|
j_instance.probs.to_ary
|
60
61
|
end
|
61
62
|
end
|
62
|
-
end
|
63
|
+
end
|
data/lib/open_nlp/java_class.rb
CHANGED
data/lib/open_nlp/model.rb
CHANGED
@@ -4,6 +4,9 @@ module OpenNlp
|
|
4
4
|
|
5
5
|
attr_reader :j_model
|
6
6
|
|
7
|
+
# Initializes new instance of Model
|
8
|
+
#
|
9
|
+
# @param [String, java.io.FileInputStream] model
|
7
10
|
def initialize(model)
|
8
11
|
@j_model = self.class.java_class.new(model_stream(model))
|
9
12
|
end
|
@@ -17,7 +20,7 @@ module OpenNlp
|
|
17
20
|
when String
|
18
21
|
java.io.FileInputStream.new(model)
|
19
22
|
else
|
20
|
-
|
23
|
+
raise ArgumentError, 'Model must be either a string or a java.io.FileInputStream'
|
21
24
|
end
|
22
25
|
end
|
23
26
|
end
|
@@ -7,7 +7,8 @@ module OpenNlp
|
|
7
7
|
# @param [Array<String>] tokens tokens to run name detection on
|
8
8
|
# @return [Array<Java::opennlp.tools.util.Span>] names detected
|
9
9
|
def detect(tokens)
|
10
|
-
|
10
|
+
raise ArgumentError, 'tokens must be an instance of Array' unless tokens.is_a?(Array)
|
11
|
+
|
11
12
|
j_instance.find(tokens.to_java(:String)).to_ary
|
12
13
|
end
|
13
14
|
end
|
data/lib/open_nlp/parser.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
module OpenNlp
|
2
2
|
class Parser < Tool
|
3
|
+
# Initializes new instance of Parser
|
4
|
+
#
|
5
|
+
# @param [OpenNlp::Model::Parser] parser_model
|
6
|
+
# @param [OpenNlp::Model::Tokenizer] token_model
|
3
7
|
def initialize(parser_model, token_model)
|
4
|
-
|
5
|
-
|
6
|
-
end
|
8
|
+
parser_model.is_a?(OpenNlp::Model::Parser) ||
|
9
|
+
raise(ArgumentError, 'parser_model must be an OpenNlp::Model')
|
7
10
|
|
8
|
-
|
9
|
-
|
10
|
-
end
|
11
|
+
token_model.is_a?(Model::Tokenizer) ||
|
12
|
+
raise(ArgumentError, 'token_model must be an OpenNlp::Tokenizer::Model')
|
11
13
|
|
12
14
|
@j_instance = Java::opennlp.tools.parser.ParserFactory.create(parser_model.j_model)
|
13
15
|
@tokenizer = Tokenizer.new(token_model)
|
@@ -19,6 +21,7 @@ module OpenNlp
|
|
19
21
|
# @return [OpenNlp::Parser::Parse]
|
20
22
|
def parse(text)
|
21
23
|
raise ArgumentError, 'passed text must be a String' unless text.is_a?(String)
|
24
|
+
|
22
25
|
text.empty? ? {} : parse_tokens(tokenizer.tokenize(text), text)
|
23
26
|
end
|
24
27
|
|
@@ -27,16 +30,14 @@ module OpenNlp
|
|
27
30
|
attr_reader :tokenizer
|
28
31
|
|
29
32
|
def get_token_offset(text, tokens, index)
|
30
|
-
|
31
|
-
return offset unless index > 0
|
33
|
+
return 0 if index.zero?
|
32
34
|
|
33
|
-
|
34
|
-
|
35
|
+
(1..index).inject(0) do |offset, i|
|
36
|
+
text.index(tokens[i], offset + tokens[i - 1].size)
|
35
37
|
end
|
36
|
-
offset
|
37
38
|
end
|
38
39
|
|
39
|
-
def build_parse_obj(text, span_start, span_end, type=Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE, probability=1, token_index=0)
|
40
|
+
def build_parse_obj(text, span_start, span_end, type = Java::opennlp.tools.parser.AbstractBottomUpParser::INC_NODE, probability = 1, token_index = 0)
|
40
41
|
Java::opennlp.tools.parser.Parse.new(
|
41
42
|
text.to_java(:String),
|
42
43
|
Java::opennlp.tools.util.Span.new(span_start, span_end),
|
@@ -6,36 +6,57 @@ module OpenNlp
|
|
6
6
|
|
7
7
|
self.java_class = Java::opennlp.tools.parser.Parse
|
8
8
|
|
9
|
+
# Initializes instance of Parser::Parse
|
10
|
+
#
|
11
|
+
# @param [Java::opennlp.tools.parser.Parse] java_instance
|
9
12
|
def initialize(java_instance)
|
10
|
-
|
13
|
+
java_instance.is_a?(self.class.java_class) ||
|
14
|
+
raise(ArgumentError, "java_instance must be an instance of #{self.class.java_class.name}")
|
11
15
|
|
12
16
|
@j_instance = java_instance
|
13
17
|
end
|
14
18
|
|
19
|
+
# Composes tree bank string, nested string representation of sentence parts, parts-of-speech and words,
|
20
|
+
# for example:
|
21
|
+
# '(TOP (S (NP (DT The) (JJ red) (NN fox)) (VP (VBZ sleeps) (ADVP (RB soundly))) (. .)))'
|
22
|
+
#
|
23
|
+
# @return [String]
|
15
24
|
def tree_bank_string
|
16
|
-
span
|
17
|
-
|
25
|
+
span = j_instance.getSpan
|
26
|
+
text = j_instance.getText
|
27
|
+
type = j_instance.getType
|
28
|
+
res = ''
|
29
|
+
start = span.getStart
|
18
30
|
|
19
31
|
res << "(#{type} " if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
20
32
|
|
21
33
|
j_instance.getChildren.each do |child|
|
22
34
|
child_span = child.span
|
23
|
-
res << text[start..child_span.getStart-1] if start < child_span.getStart
|
35
|
+
res << text[start..child_span.getStart - 1] if start < child_span.getStart
|
24
36
|
res << self.class.new(child).tree_bank_string
|
25
37
|
start = child_span.getEnd
|
26
38
|
end
|
27
39
|
|
28
|
-
res << text[start..span.getEnd-1] if start < span.getEnd
|
29
|
-
res <<
|
40
|
+
res << text[start..span.getEnd - 1] if start < span.getEnd
|
41
|
+
res << ')' if type != Java::opennlp.tools.parser.AbstractBottomUpParser::TOK_NODE
|
30
42
|
|
31
43
|
res
|
32
44
|
end
|
33
45
|
|
46
|
+
# Composes array representation of sentence tree where
|
47
|
+
# each hash has following fields:
|
48
|
+
#
|
49
|
+
# :type => <[String] node type>,
|
50
|
+
# :parent_type => <[String] type of parent node>,
|
51
|
+
# :token => <[String] current token>,
|
52
|
+
# :children => <Array[Hash] array of child nodes hashes>
|
53
|
+
#
|
54
|
+
# @return [Array<Hash>]
|
34
55
|
def code_tree
|
35
56
|
kids = j_instance.getChildren
|
36
57
|
|
37
58
|
kids.each_with_object([]) do |kid, acc|
|
38
|
-
data
|
59
|
+
data = { type: kid.getType, parent_type: j_instance.getType, token: kid.toString }
|
39
60
|
subtree = self.class.new(kid).code_tree
|
40
61
|
data[:children] = subtree unless subtree.empty?
|
41
62
|
acc << data
|
data/lib/open_nlp/pos_tagger.rb
CHANGED
@@ -2,10 +2,13 @@ module OpenNlp
|
|
2
2
|
class POSTagger < Tool
|
3
3
|
self.java_class = Java::opennlp.tools.postag.POSTaggerME
|
4
4
|
|
5
|
+
# Adds tags to tokens passed as argument
|
6
|
+
#
|
7
|
+
# @param [Array<String>, String] tokens tokens to tag
|
8
|
+
# @return [Array<String>, String] array of part-of-speech tags or string with added part-of-speech tags
|
5
9
|
def tag(tokens)
|
6
|
-
|
7
|
-
|
8
|
-
end
|
10
|
+
!tokens.is_a?(Array) && !tokens.is_a?(String) &&
|
11
|
+
raise(ArgumentError, 'tokens must be an instance of String or Array')
|
9
12
|
|
10
13
|
j_instance.tag(tokens.to_java(:String))
|
11
14
|
end
|
@@ -7,7 +7,8 @@ module OpenNlp
|
|
7
7
|
# @param [String] string string to detect sentences in
|
8
8
|
# @return [Array<String>] array of detected sentences
|
9
9
|
def detect(str)
|
10
|
-
|
10
|
+
raise ArgumentError, 'str must be a String' unless str.is_a?(String)
|
11
|
+
|
11
12
|
j_instance.sentDetect(str).to_ary
|
12
13
|
end
|
13
14
|
|
@@ -16,10 +17,11 @@ module OpenNlp
|
|
16
17
|
# @param [String] str
|
17
18
|
# @return [Array<OpenNlp::Util::Span>] array of spans for detected sentences
|
18
19
|
def pos_detect(str)
|
19
|
-
|
20
|
+
raise ArgumentError, 'str must be a String' unless str.is_a?(String)
|
21
|
+
|
20
22
|
j_instance.sentPosDetect(str).map do |span|
|
21
23
|
OpenNlp::Util::Span.new(span.getStart, span.getEnd)
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
25
|
-
end
|
27
|
+
end
|
data/lib/open_nlp/tokenizer.rb
CHANGED
@@ -7,13 +7,14 @@ module OpenNlp
|
|
7
7
|
# @param [String] str string to tokenize
|
8
8
|
# @return [Array] array of string tokens
|
9
9
|
def tokenize(str)
|
10
|
-
|
10
|
+
raise ArgumentError, 'str must be a String' unless str.is_a?(String)
|
11
|
+
|
11
12
|
j_instance.tokenize(str).to_ary
|
12
13
|
end
|
13
14
|
|
14
15
|
private
|
15
16
|
|
16
|
-
def
|
17
|
+
def last_probabilities
|
17
18
|
j_instance.getTokenProbabilities.to_ary
|
18
19
|
end
|
19
20
|
end
|
data/lib/open_nlp/tool.rb
CHANGED
@@ -4,9 +4,13 @@ module OpenNlp
|
|
4
4
|
|
5
5
|
attr_reader :j_instance
|
6
6
|
|
7
|
+
# Initializes instance of Tool
|
8
|
+
#
|
9
|
+
# @param [OpenNlp::Model] model instance of model class to initialize a tool object
|
7
10
|
def initialize(model)
|
8
|
-
|
11
|
+
raise ArgumentError, 'model must be an OpenNlp::Model' unless model.is_a?(OpenNlp::Model)
|
12
|
+
|
9
13
|
@j_instance = self.class.java_class.new(model.j_model)
|
10
14
|
end
|
11
15
|
end
|
12
|
-
end
|
16
|
+
end
|
data/lib/open_nlp/util/span.rb
CHANGED
@@ -5,34 +5,50 @@ class OpenNlp::Util::Span
|
|
5
5
|
|
6
6
|
attr_reader :j_instance
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
8
|
+
# Initializes new instance of Util::Span
|
9
|
+
#
|
10
|
+
# @param [Integer] start start index of the span
|
11
|
+
# @param [Integer] end end index of the span
|
12
|
+
def initialize(start_pos, end_pos)
|
13
|
+
raise ArgumentError, 'start should be an integer' unless start_pos.is_a?(Integer)
|
14
|
+
raise ArgumentError, 'end should be an integer' unless end_pos.is_a?(Integer)
|
15
|
+
|
16
|
+
@j_instance = self.class.java_class.new(start_pos, end_pos)
|
13
17
|
end
|
14
18
|
|
19
|
+
# Returns end index of the span
|
20
|
+
#
|
21
|
+
# @return [Integer]
|
15
22
|
def start
|
16
23
|
j_instance.getStart
|
17
24
|
end
|
18
25
|
|
26
|
+
# Returns end index of the span
|
27
|
+
#
|
28
|
+
# @return [Integer]
|
19
29
|
def end
|
20
30
|
j_instance.getEnd
|
21
31
|
end
|
22
32
|
|
33
|
+
# Returns type of the span
|
34
|
+
#
|
35
|
+
# @return [String]
|
23
36
|
def type
|
24
37
|
j_instance.getType
|
25
38
|
end
|
26
39
|
|
40
|
+
# Returns length of the span
|
41
|
+
#
|
42
|
+
# @return [Integer]
|
27
43
|
def length
|
28
44
|
j_instance.length
|
29
45
|
end
|
30
46
|
|
31
|
-
def ==(
|
32
|
-
return false unless
|
47
|
+
def ==(other)
|
48
|
+
return false unless other.is_a?(self.class)
|
33
49
|
|
34
|
-
[
|
35
|
-
acc
|
50
|
+
%i[start end type].inject(true) do |acc, method|
|
51
|
+
acc && public_send(method) == other.public_send(method)
|
36
52
|
end
|
37
53
|
end
|
38
54
|
end
|
data/lib/open_nlp/version.rb
CHANGED
data/open_nlp.gemspec
CHANGED
@@ -1,20 +1,19 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
3
|
require 'open_nlp/version'
|
5
4
|
|
6
5
|
Gem::Specification.new do |gem|
|
7
|
-
gem.name =
|
6
|
+
gem.name = 'open_nlp'
|
8
7
|
gem.version = OpenNlp::VERSION
|
9
|
-
gem.authors = [
|
10
|
-
gem.description =
|
11
|
-
gem.summary =
|
12
|
-
gem.homepage =
|
8
|
+
gem.authors = ['Hck']
|
9
|
+
gem.description = 'JRuby tools wrapper for Apache OpenNLP'
|
10
|
+
gem.summary = 'A JRuby wrapper for the Apache OpenNLP tools library'
|
11
|
+
gem.homepage = 'http://github.com/hck/open_nlp'
|
13
12
|
|
14
|
-
gem.files = `git ls-files`.split(
|
15
|
-
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
|
+
gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
14
|
+
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
16
15
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
|
-
gem.require_paths = [
|
16
|
+
gem.require_paths = ['lib']
|
18
17
|
|
19
|
-
gem.platform =
|
18
|
+
gem.platform = 'java'
|
20
19
|
end
|
@@ -3,7 +3,7 @@ require 'spec_helper'
|
|
3
3
|
RSpec.describe OpenNlp::NamedEntityDetector do
|
4
4
|
let(:model) { OpenNlp::Model::NamedEntityDetector.new(File.join(FIXTURES_DIR, 'en-ner-time.bin')) }
|
5
5
|
let(:ne_detector) { described_class.new(model) }
|
6
|
-
|
6
|
+
|
7
7
|
describe 'initialization' do
|
8
8
|
it 'initializes with a valid model' do
|
9
9
|
expect(ne_detector.j_instance).to be_a(described_class.java_class)
|
data/spec/parser/parse_spec.rb
CHANGED
@@ -40,66 +40,66 @@ RSpec.describe OpenNlp::Parser::Parse do
|
|
40
40
|
let(:expected_code_tree) do
|
41
41
|
[
|
42
42
|
{
|
43
|
-
:
|
44
|
-
:
|
45
|
-
:
|
46
|
-
:
|
43
|
+
type: 'S',
|
44
|
+
parent_type: 'TOP',
|
45
|
+
token: 'The red fox sleeps soundly .',
|
46
|
+
children: [
|
47
47
|
{
|
48
|
-
:
|
49
|
-
:
|
50
|
-
:
|
51
|
-
:
|
48
|
+
type: 'NP',
|
49
|
+
parent_type: 'S',
|
50
|
+
token: 'The red fox',
|
51
|
+
children: [
|
52
52
|
{
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
53
|
+
type: 'DT',
|
54
|
+
parent_type: 'NP',
|
55
|
+
token: 'The',
|
56
|
+
children: [{ type: 'TK', parent_type: 'DT', token: 'The' }]
|
57
57
|
},
|
58
58
|
{
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
type: 'JJ',
|
60
|
+
parent_type: 'NP',
|
61
|
+
token: 'red',
|
62
|
+
children: [{ type: 'TK', parent_type: 'JJ', token: 'red' }]
|
63
63
|
},
|
64
64
|
{
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
65
|
+
type: 'NN',
|
66
|
+
parent_type: 'NP',
|
67
|
+
token: 'fox',
|
68
|
+
children: [{ type: 'TK', parent_type: 'NN', token: 'fox' }]
|
69
69
|
}
|
70
70
|
]
|
71
71
|
},
|
72
72
|
{
|
73
|
-
:
|
74
|
-
:
|
75
|
-
:
|
76
|
-
:
|
73
|
+
type: 'VP',
|
74
|
+
parent_type: 'S',
|
75
|
+
token: 'sleeps soundly',
|
76
|
+
children: [
|
77
77
|
{
|
78
|
-
:
|
79
|
-
:
|
80
|
-
:
|
81
|
-
:
|
78
|
+
type: 'VBZ',
|
79
|
+
parent_type: 'VP',
|
80
|
+
token: 'sleeps',
|
81
|
+
children: [{ type: 'TK', parent_type: 'VBZ', token: 'sleeps' }]
|
82
82
|
},
|
83
83
|
{
|
84
|
-
:
|
85
|
-
:
|
86
|
-
:
|
87
|
-
:
|
84
|
+
type: 'ADVP',
|
85
|
+
parent_type: 'VP',
|
86
|
+
token: 'soundly',
|
87
|
+
children: [
|
88
88
|
{
|
89
|
-
:
|
90
|
-
:
|
91
|
-
:
|
92
|
-
:
|
89
|
+
type: 'RB',
|
90
|
+
parent_type: 'ADVP',
|
91
|
+
token: 'soundly',
|
92
|
+
children: [{ type: 'TK', parent_type: 'RB', token: 'soundly' }]
|
93
93
|
}
|
94
94
|
]
|
95
95
|
}
|
96
96
|
]
|
97
97
|
},
|
98
98
|
{
|
99
|
-
:
|
100
|
-
:
|
101
|
-
:
|
102
|
-
:
|
99
|
+
type: '.',
|
100
|
+
parent_type: 'S',
|
101
|
+
token: '.',
|
102
|
+
children: [{ type: 'TK', parent_type: '.', token: '.' }]
|
103
103
|
}
|
104
104
|
]
|
105
105
|
}
|
data/spec/parser_spec.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe OpenNlp::Parser do
|
4
|
-
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR,
|
5
|
-
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR,
|
4
|
+
let(:model) { OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, 'en-parser-chunking.bin')) }
|
5
|
+
let(:token_model) { OpenNlp::Model::Tokenizer.new(File.join(FIXTURES_DIR, 'en-token.bin')) }
|
6
6
|
let(:parser) { described_class.new(model, token_model) }
|
7
7
|
|
8
8
|
describe 'initialization' do
|
data/spec/pos_tagger_spec.rb
CHANGED
@@ -21,8 +21,8 @@ RSpec.describe OpenNlp::POSTagger do
|
|
21
21
|
end
|
22
22
|
|
23
23
|
it 'tags provided tokens' do
|
24
|
-
tagged = pos_tagger.tag(%w
|
25
|
-
expect(tagged.to_ary).to eq(%w
|
24
|
+
tagged = pos_tagger.tag(%w[The quick brown fox jumps over the lazy dog .])
|
25
|
+
expect(tagged.to_ary).to eq(%w[DT JJ JJ NN NNS IN DT JJ NN .])
|
26
26
|
end
|
27
27
|
|
28
28
|
it 'raises an ArgumentError when nil is passed as an argument' do
|
data/spec/spec_helper.rb
CHANGED
@@ -3,3 +3,28 @@ require 'java'
|
|
3
3
|
require 'open_nlp'
|
4
4
|
|
5
5
|
FIXTURES_DIR = File.join(File.dirname(__FILE__), 'fixtures')
|
6
|
+
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.expect_with :rspec do |expectations|
|
9
|
+
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
10
|
+
end
|
11
|
+
|
12
|
+
config.mock_with :rspec do |mocks|
|
13
|
+
mocks.verify_partial_doubles = true
|
14
|
+
end
|
15
|
+
|
16
|
+
config.filter_run :focus
|
17
|
+
config.run_all_when_everything_filtered = true
|
18
|
+
|
19
|
+
config.example_status_persistence_file_path = 'spec/examples.txt'
|
20
|
+
|
21
|
+
config.disable_monkey_patching!
|
22
|
+
|
23
|
+
config.warnings = true
|
24
|
+
|
25
|
+
config.profile_examples = 10
|
26
|
+
|
27
|
+
config.order = :random
|
28
|
+
|
29
|
+
Kernel.srand config.seed
|
30
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: open_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Hck
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: JRuby tools wrapper for Apache OpenNLP
|
14
14
|
email:
|
@@ -18,6 +18,7 @@ extra_rdoc_files: []
|
|
18
18
|
files:
|
19
19
|
- ".gitignore"
|
20
20
|
- ".rspec"
|
21
|
+
- ".rubocop.yml"
|
21
22
|
- ".ruby-version"
|
22
23
|
- ".travis.yml"
|
23
24
|
- Gemfile
|
@@ -92,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
92
93
|
version: '0'
|
93
94
|
requirements: []
|
94
95
|
rubyforge_project:
|
95
|
-
rubygems_version: 2.
|
96
|
+
rubygems_version: 2.7.6
|
96
97
|
signing_key:
|
97
98
|
specification_version: 4
|
98
99
|
summary: A JRuby wrapper for the Apache OpenNLP tools library
|