open_nlp 0.0.3-java → 0.0.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +8 -1
- data/lib/open_nlp/categorizer.rb +12 -0
- data/lib/open_nlp/model/categorizer.rb +3 -0
- data/lib/open_nlp/version.rb +1 -1
- data/lib/open_nlp.rb +2 -0
- data/spec/categorizer_spec.rb +36 -0
- data/spec/fixtures/en-doccat.bin +0 -0
- metadata +7 -2
- data/lib/open_nlp/utils/span.rb +0 -15
data/README.md
CHANGED
@@ -7,6 +7,7 @@ A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute co
|
|
7
7
|
* named entity extraction
|
8
8
|
* chunks detection
|
9
9
|
* parsing
|
10
|
+
* document categorization
|
10
11
|
|
11
12
|
## Installation
|
12
13
|
|
@@ -66,7 +67,7 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
66
67
|
### Parsing
|
67
68
|
|
68
69
|
# parser also needs tokenizer model because it uses tokenizer inside parse task
|
69
|
-
parse_model = OpenNlp::Model::Parser.new(File.join(
|
70
|
+
parse_model = OpenNlp::Model::Parser.new(File.join("nlp_models/en-parser-chunking.bin"))
|
70
71
|
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
71
72
|
parser = OpenNlp::Parser.new(parse_model, token_model)
|
72
73
|
|
@@ -79,6 +80,12 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
|
|
79
80
|
# you can get code tree structure of parse result by calling
|
80
81
|
parse_info.code_tree
|
81
82
|
|
83
|
+
### Categorizing
|
84
|
+
|
85
|
+
doccat_model = OpenNlp::Model::Categorizer.new(File.join("nlp_models/en-doccat.bin"))
|
86
|
+
categorizer = OpenNlp::Categorizer.new(doccat_model)
|
87
|
+
categorizer.categorize("Quick brown fox jumps very bad.")
|
88
|
+
|
82
89
|
## Contributing
|
83
90
|
|
84
91
|
1. Fork it
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module OpenNlp
|
2
|
+
class Categorizer < Tool
|
3
|
+
self.java_class = Java::opennlp.tools.doccat.DocumentCategorizerME
|
4
|
+
|
5
|
+
def categorize(str)
|
6
|
+
raise ArgumentError, "str must be a String" unless str.is_a?(String)
|
7
|
+
|
8
|
+
outcomes = @j_instance.categorize(str)
|
9
|
+
@j_instance.getBestCategory(outcomes)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
data/lib/open_nlp/version.rb
CHANGED
data/lib/open_nlp.rb
CHANGED
@@ -9,6 +9,7 @@ require 'open_nlp/java_class'
|
|
9
9
|
require 'open_nlp/model'
|
10
10
|
|
11
11
|
require 'open_nlp/model/chunker'
|
12
|
+
require 'open_nlp/model/categorizer'
|
12
13
|
require 'open_nlp/model/detokenizer'
|
13
14
|
require 'open_nlp/model/named_entity_detector'
|
14
15
|
require 'open_nlp/model/pos_tagger'
|
@@ -17,6 +18,7 @@ require 'open_nlp/model/tokenizer'
|
|
17
18
|
require 'open_nlp/model/parser'
|
18
19
|
|
19
20
|
require 'open_nlp/tool'
|
21
|
+
require 'open_nlp/categorizer'
|
20
22
|
require 'open_nlp/named_entity_detector'
|
21
23
|
require 'open_nlp/pos_tagger'
|
22
24
|
require 'open_nlp/sentence_detector'
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe OpenNlp::Categorizer do
|
4
|
+
subject { OpenNlp::Categorizer }
|
5
|
+
let(:model){ OpenNlp::Model::Categorizer.new(File.join(FIXTURES_DIR, "en-doccat.bin")) }
|
6
|
+
|
7
|
+
describe "initialization" do
|
8
|
+
it "should initialize with a valid model" do
|
9
|
+
categorizer = subject.new(model)
|
10
|
+
categorizer.should be_a(subject)
|
11
|
+
categorizer.j_instance.should be_a(subject.java_class)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should raise an ArgumentError without a valid model" do
|
15
|
+
lambda { subject.new(nil) }.should raise_error(ArgumentError)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
describe "categorizing a string" do
|
20
|
+
let(:categorizer) { subject.new(model) }
|
21
|
+
|
22
|
+
it "should categorize a provided document to positive" do
|
23
|
+
category = categorizer.categorize("The fox is a good worker.")
|
24
|
+
category.should == "Positive"
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should categorize a provided document to negative" do
|
28
|
+
category = categorizer.categorize("Quick brown fox jumps very bad.")
|
29
|
+
category.should == "Negative"
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should raise an ArgumentError for a non-string" do
|
33
|
+
lambda { categorizer.categorize(nil) }.should raise_error(ArgumentError)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
Binary file
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: open_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.3
|
5
|
+
version: 0.0.4
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Hck
|
@@ -24,9 +24,11 @@ files:
|
|
24
24
|
- README.md
|
25
25
|
- Rakefile
|
26
26
|
- lib/open_nlp.rb
|
27
|
+
- lib/open_nlp/categorizer.rb
|
27
28
|
- lib/open_nlp/chunker.rb
|
28
29
|
- lib/open_nlp/java_class.rb
|
29
30
|
- lib/open_nlp/model.rb
|
31
|
+
- lib/open_nlp/model/categorizer.rb
|
30
32
|
- lib/open_nlp/model/chunker.rb
|
31
33
|
- lib/open_nlp/model/detokenizer.rb
|
32
34
|
- lib/open_nlp/model/named_entity_detector.rb
|
@@ -43,12 +45,13 @@ files:
|
|
43
45
|
- lib/open_nlp/sentence_detector.rb
|
44
46
|
- lib/open_nlp/tokenizer.rb
|
45
47
|
- lib/open_nlp/tool.rb
|
46
|
-
- lib/open_nlp/utils/span.rb
|
47
48
|
- lib/open_nlp/version.rb
|
48
49
|
- open_nlp.gemspec
|
50
|
+
- spec/categorizer_spec.rb
|
49
51
|
- spec/chunker_spec.rb
|
50
52
|
- spec/fixtures/en-chunker.bin
|
51
53
|
- spec/fixtures/en-detokenizer.xml
|
54
|
+
- spec/fixtures/en-doccat.bin
|
52
55
|
- spec/fixtures/en-ner-time.bin
|
53
56
|
- spec/fixtures/en-parser-chunking.bin
|
54
57
|
- spec/fixtures/en-pos-maxent.bin
|
@@ -94,9 +97,11 @@ signing_key:
|
|
94
97
|
specification_version: 3
|
95
98
|
summary: A JRuby wrapper for the Apache OpenNLP tools library
|
96
99
|
test_files:
|
100
|
+
- spec/categorizer_spec.rb
|
97
101
|
- spec/chunker_spec.rb
|
98
102
|
- spec/fixtures/en-chunker.bin
|
99
103
|
- spec/fixtures/en-detokenizer.xml
|
104
|
+
- spec/fixtures/en-doccat.bin
|
100
105
|
- spec/fixtures/en-ner-time.bin
|
101
106
|
- spec/fixtures/en-parser-chunking.bin
|
102
107
|
- spec/fixtures/en-pos-maxent.bin
|
data/lib/open_nlp/utils/span.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
module OpenNlp
|
2
|
-
module Utils
|
3
|
-
class Span
|
4
|
-
include JavaClass
|
5
|
-
|
6
|
-
self.java_class = Java::opennlp.tools.util.Span
|
7
|
-
|
8
|
-
attr_reader :j_instance
|
9
|
-
|
10
|
-
def initialize(start_offset, end_offset)
|
11
|
-
@j_instance = self.class.java_class.new(start_offset, end_offset)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|