open_nlp 0.0.1-java → 0.0.2-java

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,6 +1,11 @@
1
1
  # OpenNlp
2
2
 
3
- TODO: Write a gem description
3
+ A JRuby wrapper for the Apache OpenNLP tools library that allows you to execute common natural language processing tasks, such as
4
+ * sentence detection
5
+ * tokenize
6
+ * part-of-speech tagging
7
+ * named entity extraction
8
+ * chunks detection
4
9
 
5
10
  ## Installation
6
11
 
@@ -18,7 +23,37 @@ Or install it yourself as:
18
23
 
19
24
  ## Usage
20
25
 
21
- TODO: Write usage instructions here
26
+ To use the open_nlp classes, you need to require the gem in your source files
27
+
28
+ require 'open_nlp'
29
+
30
+ Then you can create instances of the open_nlp classes and use them for your NLP tasks
31
+
32
+ # sentence detection
33
+ sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
34
+ sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
35
+ sentence_detector.detect('The red fox sleeps soundly.')
36
+
37
+ # tokenize
38
+ token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
39
+ tokenizer = OpenNlp::Tokenizer.new(token_model)
40
+ tokenizer.tokenize('The red fox sleeps soundly.')
41
+
42
+ # part-of-speech tagging
43
+ pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
44
+ pos_tagger = OpenNlp::POSTagger.new(pos_model)
45
+
46
+ # to tag a string, call OpenNlp::POSTagger#tag with a String argument
47
+ pos_tagger.tag('The red fox sleeps soundly.')
48
+
49
+ # to tag an array of tokens, call OpenNlp::POSTagger#tag with an Array argument
50
+ pos_tagger.tag(%w|The red fox sleeps soundly .|)
51
+
52
+ # chunk detection (the chunker also needs tokenizer and POS-tagger models because it tokenizes and POS-tags the text internally as part of chunking)
53
+ chunk_model = OpenNlp::Model::Chunker.new(File.join("nlp_models/en-chunker.bin"))
54
+ token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
55
+ pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
56
+ chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
22
57
 
23
58
  ## Contributing
24
59
 
@@ -26,4 +61,4 @@ TODO: Write usage instructions here
26
61
  2. Create your feature branch (`git checkout -b my-new-feature`)
27
62
  3. Commit your changes (`git commit -am 'Add some feature'`)
28
63
  4. Push to the branch (`git push origin my-new-feature`)
29
- 5. Create new Pull Request
64
+ 5. Create new Pull Request
@@ -3,7 +3,6 @@ module OpenNlp
3
3
  self.java_class = Java::opennlp.tools.chunker.ChunkerME
4
4
 
5
5
  def initialize(model, token_model, pos_model)
6
- #raise ArgumentError, "model must be an OpenNlp::Chunker::Model" unless model.is_a?(Chunker::Model)
7
6
  super(model)
8
7
 
9
8
  raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
@@ -11,7 +10,6 @@ module OpenNlp
11
10
 
12
11
  @tokenizer = Tokenizer.new(token_model)
13
12
  @pos_tagger = POSTagger.new(pos_model)
14
- #@j_instance = self.java_class.new(model.j_model)
15
13
  end
16
14
 
17
15
  def chunk(str)
@@ -32,8 +30,8 @@ module OpenNlp
32
30
 
33
31
  data.inject([]) do |acc, val|
34
32
  chunk = val[2]
35
- acc << [val[0]] if chunk[0] == 'B'
36
- acc.last << val[0] if chunk[0] == 'I'
33
+ acc << [{val[0] => val[1]}] if chunk[0] == 'B'
34
+ acc.last << {val[0] => val[1]} if chunk[0] == 'I'
37
35
 
38
36
  acc
39
37
  end
@@ -1,3 +1,3 @@
1
1
  module OpenNlp
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
@@ -28,7 +28,7 @@ describe OpenNlp::Chunker do
28
28
 
29
29
  it "should chunk a sentence" do
30
30
  chunks = chunker.chunk("The red fox sleeps soundly.")
31
- chunks.should == [["The", "red", "fox", "sleeps"], ["soundly"]]
31
+ chunks.should == [[{"The"=>"DT"}, {"red"=>"JJ"}, {"fox"=>"NN"}, {"sleeps"=>"NNS"}], [{"soundly"=>"RB"}]]
32
32
  end
33
33
 
34
34
  it "should raise an error when not passed a string" do
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: open_nlp
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.1
5
+ version: 0.0.2
6
6
  platform: java
7
7
  authors:
8
8
  - Hck
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-21 00:00:00.000000000 Z
12
+ date: 2012-09-24 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: JRuby tools wrapper for Apache OpenNLP
15
15
  email: