open_nlp 0.0.3-java → 0.0.4-java
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +8 -1
- data/lib/open_nlp/categorizer.rb +12 -0
- data/lib/open_nlp/model/categorizer.rb +3 -0
- data/lib/open_nlp/version.rb +1 -1
- data/lib/open_nlp.rb +2 -0
- data/spec/categorizer_spec.rb +36 -0
- data/spec/fixtures/en-doccat.bin +0 -0
- metadata +7 -2
- data/lib/open_nlp/utils/span.rb +0 -15
data/README.md
CHANGED
@@ -7,6 +7,7 @@ A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute co
|
|
7
7
|
* named entity extraction
|
8
8
|
* chunks detection
|
9
9
|
* parsing
|
10
|
+
* document categorization
|
10
11
|
|
11
12
|
## Installation
|
12
13
|
|
@@ -66,7 +67,7 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
66
67
|
### Parsing
|
67
68
|
|
68
69
|
# parser also needs tokenizer model because it uses tokenizer inside parse task
|
69
|
-
parse_model = OpenNlp::Model::Parser.new(File.join(
|
70
|
+
parse_model = OpenNlp::Model::Parser.new(File.join("nlp_models/en-parser-chunking.bin"))
|
70
71
|
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
71
72
|
parser = OpenNlp::Parser.new(parse_model, token_model)
|
72
73
|
|
@@ -79,6 +80,12 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
79
80
|
# you can get code tree structure of parse result by calling
|
80
81
|
parse_info.code_tree
|
81
82
|
|
83
|
+
### Categorizing
|
84
|
+
|
85
|
+
doccat_model = OpenNlp::Model::Categorizer.new(File.join("nlp_models/en-doccat.bin"))
|
86
|
+
categorizer = OpenNlp::Categorizer.new(doccat_model)
|
87
|
+
categorizer.categorize("Quick brown fox jumps very bad.")
|
88
|
+
|
82
89
|
## Contributing
|
83
90
|
|
84
91
|
1. Fork it
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module OpenNlp
|
2
|
+
class Categorizer < Tool
|
3
|
+
self.java_class = Java::opennlp.tools.doccat.DocumentCategorizerME
|
4
|
+
|
5
|
+
def categorize(str)
|
6
|
+
raise ArgumentError, "str must be a String" unless str.is_a?(String)
|
7
|
+
|
8
|
+
outcomes = @j_instance.categorize(str)
|
9
|
+
@j_instance.getBestCategory(outcomes)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
data/lib/open_nlp/version.rb
CHANGED
data/lib/open_nlp.rb
CHANGED
@@ -9,6 +9,7 @@ require 'open_nlp/java_class'
|
|
9
9
|
require 'open_nlp/model'
|
10
10
|
|
11
11
|
require 'open_nlp/model/chunker'
|
12
|
+
require 'open_nlp/model/categorizer'
|
12
13
|
require 'open_nlp/model/detokenizer'
|
13
14
|
require 'open_nlp/model/named_entity_detector'
|
14
15
|
require 'open_nlp/model/pos_tagger'
|
@@ -17,6 +18,7 @@ require 'open_nlp/model/tokenizer'
|
|
17
18
|
require 'open_nlp/model/parser'
|
18
19
|
|
19
20
|
require 'open_nlp/tool'
|
21
|
+
require 'open_nlp/categorizer'
|
20
22
|
require 'open_nlp/named_entity_detector'
|
21
23
|
require 'open_nlp/pos_tagger'
|
22
24
|
require 'open_nlp/sentence_detector'
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe OpenNlp::Categorizer do
|
4
|
+
subject { OpenNlp::Categorizer }
|
5
|
+
let(:model){ OpenNlp::Model::Categorizer.new(File.join(FIXTURES_DIR, "en-doccat.bin")) }
|
6
|
+
|
7
|
+
describe "initialization" do
|
8
|
+
it "should initialize with a valid model" do
|
9
|
+
categorizer = subject.new(model)
|
10
|
+
categorizer.should be_a(subject)
|
11
|
+
categorizer.j_instance.should be_a(subject.java_class)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should raise an ArgumentError without a valid model" do
|
15
|
+
lambda { subject.new(nil) }.should raise_error(ArgumentError)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
describe "categorizing a string" do
|
20
|
+
let(:categorizer) { subject.new(model) }
|
21
|
+
|
22
|
+
it "should categorize a provided document to positive" do
|
23
|
+
category = categorizer.categorize("The fox is a good worker.")
|
24
|
+
category.should == "Positive"
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should categorize a provided document to negative" do
|
28
|
+
category = categorizer.categorize("Quick brown fox jumps very bad.")
|
29
|
+
category.should == "Negative"
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should raise an ArgumentError for a non-string" do
|
33
|
+
lambda { categorizer.categorize(nil) }.should raise_error(ArgumentError)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
Binary file
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: open_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.3
|
5
|
+
version: 0.0.4
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Hck
|
@@ -24,9 +24,11 @@ files:
|
|
24
24
|
- README.md
|
25
25
|
- Rakefile
|
26
26
|
- lib/open_nlp.rb
|
27
|
+
- lib/open_nlp/categorizer.rb
|
27
28
|
- lib/open_nlp/chunker.rb
|
28
29
|
- lib/open_nlp/java_class.rb
|
29
30
|
- lib/open_nlp/model.rb
|
31
|
+
- lib/open_nlp/model/categorizer.rb
|
30
32
|
- lib/open_nlp/model/chunker.rb
|
31
33
|
- lib/open_nlp/model/detokenizer.rb
|
32
34
|
- lib/open_nlp/model/named_entity_detector.rb
|
@@ -43,12 +45,13 @@ files:
|
|
43
45
|
- lib/open_nlp/sentence_detector.rb
|
44
46
|
- lib/open_nlp/tokenizer.rb
|
45
47
|
- lib/open_nlp/tool.rb
|
46
|
-
- lib/open_nlp/utils/span.rb
|
47
48
|
- lib/open_nlp/version.rb
|
48
49
|
- open_nlp.gemspec
|
50
|
+
- spec/categorizer_spec.rb
|
49
51
|
- spec/chunker_spec.rb
|
50
52
|
- spec/fixtures/en-chunker.bin
|
51
53
|
- spec/fixtures/en-detokenizer.xml
|
54
|
+
- spec/fixtures/en-doccat.bin
|
52
55
|
- spec/fixtures/en-ner-time.bin
|
53
56
|
- spec/fixtures/en-parser-chunking.bin
|
54
57
|
- spec/fixtures/en-pos-maxent.bin
|
@@ -94,9 +97,11 @@ signing_key:
|
|
94
97
|
specification_version: 3
|
95
98
|
summary: A JRuby wrapper for the Apache OpenNLP tools library
|
96
99
|
test_files:
|
100
|
+
- spec/categorizer_spec.rb
|
97
101
|
- spec/chunker_spec.rb
|
98
102
|
- spec/fixtures/en-chunker.bin
|
99
103
|
- spec/fixtures/en-detokenizer.xml
|
104
|
+
- spec/fixtures/en-doccat.bin
|
100
105
|
- spec/fixtures/en-ner-time.bin
|
101
106
|
- spec/fixtures/en-parser-chunking.bin
|
102
107
|
- spec/fixtures/en-pos-maxent.bin
|
data/lib/open_nlp/utils/span.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
module OpenNlp
|
2
|
-
module Utils
|
3
|
-
class Span
|
4
|
-
include JavaClass
|
5
|
-
|
6
|
-
self.java_class = Java::opennlp.tools.util.Span
|
7
|
-
|
8
|
-
attr_reader :j_instance
|
9
|
-
|
10
|
-
def initialize(start_offset, end_offset)
|
11
|
-
@j_instance = self.class.java_class.new(start_offset, end_offset)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|