open_nlp 0.0.3-java → 0.0.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -7,6 +7,7 @@ A JRuby wrapper for the Apache OpenNLP tools library, that allows you execute co
7
7
  * named entity extraction
8
8
  * chunks detection
9
9
  * parsing
10
+ * document categorization
10
11
 
11
12
  ## Installation
12
13
 
@@ -66,7 +67,7 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
66
67
  ### Parsing
67
68
 
68
69
  # parser also needs tokenizer model because it uses tokenizer inside parse task
69
- parse_model = OpenNlp::Model::Parser.new(File.join(FIXTURES_DIR, "en-parser-chunking.bin"))
70
+ parse_model = OpenNlp::Model::Parser.new("nlp_models/en-parser-chunking.bin")
70
71
  token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
71
72
  parser = OpenNlp::Parser.new(parse_model, token_model)
72
73
 
@@ -79,6 +80,12 @@ Then you can create instances of open_nlp classes and use it for your nlp tasks
79
80
  # you can get code tree structure of parse result by calling
80
81
  parse_info.code_tree
81
82
 
83
+ ### Categorizing
84
+
85
+ doccat_model = OpenNlp::Model::Categorizer.new("nlp_models/en-doccat.bin")
86
+ categorizer = OpenNlp::Categorizer.new(doccat_model)
87
+ categorizer.categorize("Quick brown fox jumps very bad.")
88
+
82
89
  ## Contributing
83
90
 
84
91
  1. Fork it
@@ -0,0 +1,12 @@
1
+ module OpenNlp
2
+ class Categorizer < Tool
3
+ self.java_class = Java::opennlp.tools.doccat.DocumentCategorizerME
4
+
5
+ def categorize(str)
6
+ raise ArgumentError, "str must be a String" unless str.is_a?(String)
7
+
8
+ outcomes = @j_instance.categorize(str)
9
+ @j_instance.getBestCategory(outcomes)
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,3 @@
1
+ class OpenNlp::Model::Categorizer < OpenNlp::Model
2
+ self.java_class = Java::opennlp.tools.doccat.DoccatModel
3
+ end
@@ -1,3 +1,3 @@
1
1
  module OpenNlp
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
data/lib/open_nlp.rb CHANGED
@@ -9,6 +9,7 @@ require 'open_nlp/java_class'
9
9
  require 'open_nlp/model'
10
10
 
11
11
  require 'open_nlp/model/chunker'
12
+ require 'open_nlp/model/categorizer'
12
13
  require 'open_nlp/model/detokenizer'
13
14
  require 'open_nlp/model/named_entity_detector'
14
15
  require 'open_nlp/model/pos_tagger'
@@ -17,6 +18,7 @@ require 'open_nlp/model/tokenizer'
17
18
  require 'open_nlp/model/parser'
18
19
 
19
20
  require 'open_nlp/tool'
21
+ require 'open_nlp/categorizer'
20
22
  require 'open_nlp/named_entity_detector'
21
23
  require 'open_nlp/pos_tagger'
22
24
  require 'open_nlp/sentence_detector'
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
+ describe OpenNlp::Categorizer do
4
+ subject { OpenNlp::Categorizer }
5
+ let(:model){ OpenNlp::Model::Categorizer.new(File.join(FIXTURES_DIR, "en-doccat.bin")) }
6
+
7
+ describe "initialization" do
8
+ it "should initialize with a valid model" do
9
+ categorizer = subject.new(model)
10
+ categorizer.should be_a(subject)
11
+ categorizer.j_instance.should be_a(subject.java_class)
12
+ end
13
+
14
+ it "should raise an ArgumentError without a valid model" do
15
+ lambda { subject.new(nil) }.should raise_error(ArgumentError)
16
+ end
17
+ end
18
+
19
+ describe "categorizing a string" do
20
+ let(:categorizer) { subject.new(model) }
21
+
22
+ it "should categorize a provided document to positive" do
23
+ category = categorizer.categorize("The fox is a good worker.")
24
+ category.should == "Positive"
25
+ end
26
+
27
+ it "should categorize a provided document to negative" do
28
+ category = categorizer.categorize("Quick brown fox jumps very bad.")
29
+ category.should == "Negative"
30
+ end
31
+
32
+ it "should raise an ArgumentError for a non-string" do
33
+ lambda { categorizer.categorize(nil) }.should raise_error(ArgumentError)
34
+ end
35
+ end
36
+ end
Binary file
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: open_nlp
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.3
5
+ version: 0.0.4
6
6
  platform: java
7
7
  authors:
8
8
  - Hck
@@ -24,9 +24,11 @@ files:
24
24
  - README.md
25
25
  - Rakefile
26
26
  - lib/open_nlp.rb
27
+ - lib/open_nlp/categorizer.rb
27
28
  - lib/open_nlp/chunker.rb
28
29
  - lib/open_nlp/java_class.rb
29
30
  - lib/open_nlp/model.rb
31
+ - lib/open_nlp/model/categorizer.rb
30
32
  - lib/open_nlp/model/chunker.rb
31
33
  - lib/open_nlp/model/detokenizer.rb
32
34
  - lib/open_nlp/model/named_entity_detector.rb
@@ -43,12 +45,13 @@ files:
43
45
  - lib/open_nlp/sentence_detector.rb
44
46
  - lib/open_nlp/tokenizer.rb
45
47
  - lib/open_nlp/tool.rb
46
- - lib/open_nlp/utils/span.rb
47
48
  - lib/open_nlp/version.rb
48
49
  - open_nlp.gemspec
50
+ - spec/categorizer_spec.rb
49
51
  - spec/chunker_spec.rb
50
52
  - spec/fixtures/en-chunker.bin
51
53
  - spec/fixtures/en-detokenizer.xml
54
+ - spec/fixtures/en-doccat.bin
52
55
  - spec/fixtures/en-ner-time.bin
53
56
  - spec/fixtures/en-parser-chunking.bin
54
57
  - spec/fixtures/en-pos-maxent.bin
@@ -94,9 +97,11 @@ signing_key:
94
97
  specification_version: 3
95
98
  summary: A JRuby wrapper for the Apache OpenNLP tools library
96
99
  test_files:
100
+ - spec/categorizer_spec.rb
97
101
  - spec/chunker_spec.rb
98
102
  - spec/fixtures/en-chunker.bin
99
103
  - spec/fixtures/en-detokenizer.xml
104
+ - spec/fixtures/en-doccat.bin
100
105
  - spec/fixtures/en-ner-time.bin
101
106
  - spec/fixtures/en-parser-chunking.bin
102
107
  - spec/fixtures/en-pos-maxent.bin
@@ -1,15 +0,0 @@
1
- module OpenNlp
2
- module Utils
3
- class Span
4
- include JavaClass
5
-
6
- self.java_class = Java::opennlp.tools.util.Span
7
-
8
- attr_reader :j_instance
9
-
10
- def initialize(start_offset, end_offset)
11
- @j_instance = self.class.java_class.new(start_offset, end_offset)
12
- end
13
- end
14
- end
15
- end