open_nlp 0.0.1-java → 0.0.2-java
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +38 -3
- data/lib/open_nlp/chunker.rb +2 -4
- data/lib/open_nlp/version.rb +1 -1
- data/spec/chunker_spec.rb +1 -1
- metadata +2 -2
data/README.md
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
# OpenNlp
|
2
2
|
|
3
|
-
|
3
|
+
A JRuby wrapper for the Apache OpenNLP tools library that allows you to execute common natural language processing tasks, such as
|
4
|
+
* sentence detection
|
5
|
+
* tokenization
|
6
|
+
* part-of-speech tagging
|
7
|
+
* named entity extraction
|
8
|
+
* chunk detection
|
4
9
|
|
5
10
|
## Installation
|
6
11
|
|
@@ -18,7 +23,37 @@ Or install it yourself as:
|
|
18
23
|
|
19
24
|
## Usage
|
20
25
|
|
21
|
-
|
26
|
+
To use the open_nlp classes, require the library in your source files:
|
27
|
+
|
28
|
+
require 'open_nlp'
|
29
|
+
|
30
|
+
Then you can create instances of the open_nlp classes and use them for your NLP tasks:
|
31
|
+
|
32
|
+
# sentence detection
|
33
|
+
sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
|
34
|
+
sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
|
35
|
+
sentence_detector.detect('The red fox sleeps soundly.')
|
36
|
+
|
37
|
+
# tokenize
|
38
|
+
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
39
|
+
tokenizer = OpenNlp::Tokenizer.new(token_model)
|
40
|
+
tokenizer.tokenize('The red fox sleeps soundly.')
|
41
|
+
|
42
|
+
# part-of-speech tagging
|
43
|
+
pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
|
44
|
+
pos_tagger = OpenNlp::POSTagger.new(pos_model)
|
45
|
+
|
46
|
+
# to tag a string, call OpenNlp::POSTagger#tag with a String argument
|
47
|
+
pos_tagger.tag('The red fox sleeps soundly.')
|
48
|
+
|
49
|
+
# to tag an array of tokens, call OpenNlp::POSTagger#tag with an Array argument
|
50
|
+
pos_tagger.tag(%w|The red fox sleeps soundly .|)
|
51
|
+
|
52
|
+
# chunk detection (the chunker also needs the tokenizer and POS tagger models because it tokenizes and POS-tags the input internally before chunking)
|
53
|
+
chunk_model = OpenNlp::Model::Chunker.new(File.join("nlp_models/en-chunker.bin"))
|
54
|
+
token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
|
55
|
+
pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
|
56
|
+
chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
|
22
57
|
|
23
58
|
## Contributing
|
24
59
|
|
@@ -26,4 +61,4 @@ TODO: Write usage instructions here
|
|
26
61
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
62
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
63
|
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
-
5. Create new Pull Request
|
64
|
+
5. Create new Pull Request
|
data/lib/open_nlp/chunker.rb
CHANGED
@@ -3,7 +3,6 @@ module OpenNlp
|
|
3
3
|
self.java_class = Java::opennlp.tools.chunker.ChunkerME
|
4
4
|
|
5
5
|
def initialize(model, token_model, pos_model)
|
6
|
-
#raise ArgumentError, "model must be an OpenNlp::Chunker::Model" unless model.is_a?(Chunker::Model)
|
7
6
|
super(model)
|
8
7
|
|
9
8
|
raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
|
@@ -11,7 +10,6 @@ module OpenNlp
|
|
11
10
|
|
12
11
|
@tokenizer = Tokenizer.new(token_model)
|
13
12
|
@pos_tagger = POSTagger.new(pos_model)
|
14
|
-
#@j_instance = self.java_class.new(model.j_model)
|
15
13
|
end
|
16
14
|
|
17
15
|
def chunk(str)
|
@@ -32,8 +30,8 @@ module OpenNlp
|
|
32
30
|
|
33
31
|
data.inject([]) do |acc, val|
|
34
32
|
chunk = val[2]
|
35
|
-
acc << [val[0]] if chunk[0] == 'B'
|
36
|
-
acc.last << val[0] if chunk[0] == 'I'
|
33
|
+
acc << [{val[0] => val[1]}] if chunk[0] == 'B'
|
34
|
+
acc.last << {val[0] => val[1]} if chunk[0] == 'I'
|
37
35
|
|
38
36
|
acc
|
39
37
|
end
|
data/lib/open_nlp/version.rb
CHANGED
data/spec/chunker_spec.rb
CHANGED
@@ -28,7 +28,7 @@ describe OpenNlp::Chunker do
|
|
28
28
|
|
29
29
|
it "should chunk a sentence" do
|
30
30
|
chunks = chunker.chunk("The red fox sleeps soundly.")
|
31
|
-
chunks.should == [["The", "red", "fox", "sleeps"], ["soundly"]]
|
31
|
+
chunks.should == [[{"The"=>"DT"}, {"red"=>"JJ"}, {"fox"=>"NN"}, {"sleeps"=>"NNS"}], [{"soundly"=>"RB"}]]
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should raise an error when not passed a string" do
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: open_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.1
|
5
|
+
version: 0.0.2
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Hck
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-09-
|
12
|
+
date: 2012-09-24 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: JRuby tools wrapper for Apache OpenNLP
|
15
15
|
email:
|