open_nlp 0.0.1-java → 0.0.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +38 -3
- data/lib/open_nlp/chunker.rb +2 -4
- data/lib/open_nlp/version.rb +1 -1
- data/spec/chunker_spec.rb +1 -1
- metadata +2 -2
data/README.md
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
# OpenNlp
|
2
2
|
|
3
|
-
|
3
|
+
A JRuby wrapper for the Apache OpenNLP tools library that allows you to execute common natural language processing tasks, such as:
|
4
|
+
* sentence detection
|
5
|
+
* tokenization
|
6
|
+
* part-of-speech tagging
|
7
|
+
* named entity extraction
|
8
|
+
* chunk detection
|
4
9
|
|
5
10
|
## Installation
|
6
11
|
|
@@ -18,7 +23,37 @@ Or install it yourself as:
|
|
18
23
|
|
19
24
|
## Usage
|
20
25
|
|
21
|
-
|
26
|
+
To use the open_nlp classes, you need to require the library in your sources:
|
27
|
+
|
28
|
+
require 'open_nlp'
|
29
|
+
|
30
|
+
Then you can create instances of the open_nlp classes and use them for your NLP tasks:
|
31
|
+
|
32
|
+
# sentence detection
|
33
|
+
sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
|
34
|
+
sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
|
35
|
+
sentence_detector.detect('The red fox sleeps soundly.')
|
36
|
+
|
37
|
+
# tokenization
|
38
|
+
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
39
|
+
tokenizer = OpenNlp::Tokenizer.new(token_model)
|
40
|
+
tokenizer.tokenize('The red fox sleeps soundly.')
|
41
|
+
|
42
|
+
# part-of-speech tagging
|
43
|
+
pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
|
44
|
+
pos_tagger = OpenNlp::POSTagger.new(pos_model)
|
45
|
+
|
46
|
+
# to tag a string, call OpenNlp::POSTagger#tag with a String argument
|
47
|
+
pos_tagger.tag('The red fox sleeps soundly.')
|
48
|
+
|
49
|
+
# to tag an array of tokens, call OpenNlp::POSTagger#tag with an Array argument
|
50
|
+
pos_tagger.tag(%w|The red fox sleeps soundly .|)
|
51
|
+
|
52
|
+
# chunk detection (the chunker also needs the tokenizer and POS tagger models because it performs tokenizing and POS tagging internally as part of the chunking task)
|
53
|
+
chunk_model = OpenNlp::Model::Chunker.new(File.join("nlp_models/en-chunker.bin"))
|
54
|
+
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
55
|
+
pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
|
56
|
+
chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
|
22
57
|
|
23
58
|
## Contributing
|
24
59
|
|
@@ -26,4 +61,4 @@ TODO: Write usage instructions here
|
|
26
61
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
62
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
63
|
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
-
5. Create new Pull Request
|
64
|
+
5. Create new Pull Request
|
data/lib/open_nlp/chunker.rb
CHANGED
@@ -3,7 +3,6 @@ module OpenNlp
|
|
3
3
|
self.java_class = Java::opennlp.tools.chunker.ChunkerME
|
4
4
|
|
5
5
|
def initialize(model, token_model, pos_model)
|
6
|
-
#raise ArgumentError, "model must be an OpenNlp::Chunker::Model" unless model.is_a?(Chunker::Model)
|
7
6
|
super(model)
|
8
7
|
|
9
8
|
raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
|
@@ -11,7 +10,6 @@ module OpenNlp
|
|
11
10
|
|
12
11
|
@tokenizer = Tokenizer.new(token_model)
|
13
12
|
@pos_tagger = POSTagger.new(pos_model)
|
14
|
-
#@j_instance = self.java_class.new(model.j_model)
|
15
13
|
end
|
16
14
|
|
17
15
|
def chunk(str)
|
@@ -32,8 +30,8 @@ module OpenNlp
|
|
32
30
|
|
33
31
|
data.inject([]) do |acc, val|
|
34
32
|
chunk = val[2]
|
35
|
-
acc << [val[0]] if chunk[0] == 'B'
|
36
|
-
acc.last << val[0] if chunk[0] == 'I'
|
33
|
+
acc << [{val[0] => val[1]}] if chunk[0] == 'B'
|
34
|
+
acc.last << {val[0] => val[1]} if chunk[0] == 'I'
|
37
35
|
|
38
36
|
acc
|
39
37
|
end
|
data/lib/open_nlp/version.rb
CHANGED
data/spec/chunker_spec.rb
CHANGED
@@ -28,7 +28,7 @@ describe OpenNlp::Chunker do
|
|
28
28
|
|
29
29
|
it "should chunk a sentence" do
|
30
30
|
chunks = chunker.chunk("The red fox sleeps soundly.")
|
31
|
-
chunks.should == [["The", "red", "fox", "sleeps"], ["soundly"]]
|
31
|
+
chunks.should == [[{"The"=>"DT"}, {"red"=>"JJ"}, {"fox"=>"NN"}, {"sleeps"=>"NNS"}], [{"soundly"=>"RB"}]]
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should raise an error when not passed a string" do
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: open_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.1
|
5
|
+
version: 0.0.2
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Hck
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-24 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: JRuby tools wrapper for Apache OpenNLP
|
15
15
|
email:
|