open_nlp 0.0.3-java → 0.0.4-java

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -7,6 +7,7 @@ A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute co
7
7
  * named entity extraction
8
8
  * chunks detection
9
9
  * parsing
10
+ * document categorization
10
11
 
11
12
  ## Installation
12
13
 
@@ -66,7 +67,7 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
66
67
  ### Parsing
67
68
 
68
69
  # parser also needs tokenizer model because it uses tokenizer inside parse task
69
- parse_model = OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin"))
70
+ parse_model = OpenNlp::Model::Parser.new("nlp_models/en-parser-chunking.bin")
70
71
  token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
71
72
  parser = OpenNlp::Parser.new(parse_model, token_model)
72
73
 
@@ -79,6 +80,12 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
79
80
  # you can get code tree structure of parse result by calling
80
81
  parse_info.code_tree
81
82
 
83
+ ### Categorizing
84
+
85
+ doccat_model = OpenNlp::Model::Categorizer.new("nlp_models/en-doccat.bin")
86
+ categorizer = OpenNlp::Categorizer.new(doccat_model)
87
+ categorizer.categorize("Quick brown fox jumps very bad.")
88
+
82
89
  ## Contributing
83
90
 
84
91
  1. Fork it
@@ -0,0 +1,12 @@
1
+ module OpenNlp
2
+ class Categorizer < Tool
3
+ self.java_class = Java::opennlp.tools.doccat.DocumentCategorizerME
4
+
5
+ def categorize(str)
6
+ raise ArgumentError, "str must be a String" unless str.is_a?(String)
7
+
8
+ outcomes = @j_instance.categorize(str)
9
+ @j_instance.getBestCategory(outcomes)
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,3 @@
1
+ class OpenNlp::Model::Categorizer < OpenNlp::Model
2
+ self.java_class = Java::opennlp.tools.doccat.DoccatModel
3
+ end
@@ -1,3 +1,3 @@
1
1
  module OpenNlp
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
data/lib/open_nlp.rb CHANGED
@@ -9,6 +9,7 @@ require 'open_nlp/java_class'
9
9
  require 'open_nlp/model'
10
10
 
11
11
  require 'open_nlp/model/chunker'
12
+ require 'open_nlp/model/categorizer'
12
13
  require 'open_nlp/model/detokenizer'
13
14
  require 'open_nlp/model/named_entity_detector'
14
15
  require 'open_nlp/model/pos_tagger'
@@ -17,6 +18,7 @@ require 'open_nlp/model/tokenizer'
17
18
  require 'open_nlp/model/parser'
18
19
 
19
20
  require 'open_nlp/tool'
21
+ require 'open_nlp/categorizer'
20
22
  require 'open_nlp/named_entity_detector'
21
23
  require 'open_nlp/pos_tagger'
22
24
  require 'open_nlp/sentence_detector'
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
+ describe OpenNlp::Categorizer do
4
+ subject { OpenNlp::Categorizer }
5
+ let(:model){ OpenNlp::Model::Categorizer.new(File.join(FIXTURES_DIR, "en-doccat.bin")) }
6
+
7
+ describe "initialization" do
8
+ it "should initialize with a valid model" do
9
+ categorizer = subject.new(model)
10
+ categorizer.should be_a(subject)
11
+ categorizer.j_instance.should be_a(subject.java_class)
12
+ end
13
+
14
+ it "should raise an ArgumentError without a valid model" do
15
+ lambda { subject.new(nil) }.should raise_error(ArgumentError)
16
+ end
17
+ end
18
+
19
+ describe "categorizing a string" do
20
+ let(:categorizer) { subject.new(model) }
21
+
22
+ it "should categorize a provided document to positive" do
23
+ category = categorizer.categorize("The fox is a good worker.")
24
+ category.should == "Positive"
25
+ end
26
+
27
+ it "should categorize a provided document to negative" do
28
+ category = categorizer.categorize("Quick brown fox jumps very bad.")
29
+ category.should == "Negative"
30
+ end
31
+
32
+ it "should raise an ArgumentError for a non-string" do
33
+ lambda { categorizer.categorize(nil) }.should raise_error(ArgumentError)
34
+ end
35
+ end
36
+ end
Binary file
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: open_nlp
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.3
5
+ version: 0.0.4
6
6
  platform: java
7
7
  authors:
8
8
  - Hck
@@ -24,9 +24,11 @@ files:
24
24
  - README.md
25
25
  - Rakefile
26
26
  - lib/open_nlp.rb
27
+ - lib/open_nlp/categorizer.rb
27
28
  - lib/open_nlp/chunker.rb
28
29
  - lib/open_nlp/java_class.rb
29
30
  - lib/open_nlp/model.rb
31
+ - lib/open_nlp/model/categorizer.rb
30
32
  - lib/open_nlp/model/chunker.rb
31
33
  - lib/open_nlp/model/detokenizer.rb
32
34
  - lib/open_nlp/model/named_entity_detector.rb
@@ -43,12 +45,13 @@ files:
43
45
  - lib/open_nlp/sentence_detector.rb
44
46
  - lib/open_nlp/tokenizer.rb
45
47
  - lib/open_nlp/tool.rb
46
- - lib/open_nlp/utils/span.rb
47
48
  - lib/open_nlp/version.rb
48
49
  - open_nlp.gemspec
50
+ - spec/categorizer_spec.rb
49
51
  - spec/chunker_spec.rb
50
52
  - spec/fixtures/en-chunker.bin
51
53
  - spec/fixtures/en-detokenizer.xml
54
+ - spec/fixtures/en-doccat.bin
52
55
  - spec/fixtures/en-ner-time.bin
53
56
  - spec/fixtures/en-parser-chunking.bin
54
57
  - spec/fixtures/en-pos-maxent.bin
@@ -94,9 +97,11 @@ signing_key:
94
97
  specification_version: 3
95
98
  summary: A JRuby wrapper for the Apache OpenNLP tools library
96
99
  test_files:
100
+ - spec/categorizer_spec.rb
97
101
  - spec/chunker_spec.rb
98
102
  - spec/fixtures/en-chunker.bin
99
103
  - spec/fixtures/en-detokenizer.xml
104
+ - spec/fixtures/en-doccat.bin
100
105
  - spec/fixtures/en-ner-time.bin
101
106
  - spec/fixtures/en-parser-chunking.bin
102
107
  - spec/fixtures/en-pos-maxent.bin
@@ -1,15 +0,0 @@
1
- module OpenNlp
2
- module Utils
3
- class Span
4
- include JavaClass
5
-
6
- self.java_class = Java::opennlp.tools.util.Span
7
-
8
- attr_reader :j_instance
9
-
10
- def initialize(start_offset, end_offset)
11
- @j_instance = self.class.java_class.new(start_offset, end_offset)
12
- end
13
- end
14
- end
15
- end