open_nlp 0.0.1-java → 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,6 +1,11 @@
1
1
  # OpenNlp
2
2
 
3
- TODO: Write a gem description
3
+ A JRuby wrapper for the Apache OpenNLP tools library that allows you to execute common natural language processing tasks, such as:
4
+ * sentence detection
5
+ * tokenization
6
+ * part-of-speech tagging
7
+ * named entity extraction
8
+ * chunk detection
4
9
 
5
10
  ## Installation
6
11
 
@@ -18,7 +23,37 @@ Or install it yourself as:
18
23
 
19
24
  ## Usage
20
25
 
21
- TODO: Write usage instructions here
26
+ To use the open_nlp classes, you need to require the gem in your source:
27
+
28
+ require 'open_nlp'
29
+
30
+ Then you can create instances of the open_nlp classes and use them for your NLP tasks:
31
+
32
+ # sentence detection
33
+ sentence_detect_model = OpenNlp::Model::SentenceDetector.new("nlp_models/en-sent.bin")
34
+ sentence_detector = OpenNlp::SentenceDetector.new(sentence_detect_model)
35
+ sentence_detector.detect('The red fox sleeps soundly.')
36
+
37
+ # tokenize
38
+ token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
39
+ tokenizer = OpenNlp::Tokenizer.new(token_model)
40
+ tokenizer.tokenize('The red fox sleeps soundly.')
41
+
42
+ # part-of-speech tagging
43
+ pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
44
+ pos_tagger = OpenNlp::POSTagger.new(pos_model)
45
+
46
+ # to tag a string, call OpenNlp::POSTagger#tag with a String argument
47
+ pos_tagger.tag('The red fox sleeps soundly.')
48
+
49
+ # to tag an array of tokens, call OpenNlp::POSTagger#tag with an Array argument
50
+ pos_tagger.tag(%w|The red fox sleeps soundly .|)
51
+
52
+ # chunk detection (the chunker also needs the tokenizer and POS-tagger models because it performs tokenizing and POS tagging inside the chunk task)
53
+ chunk_model = OpenNlp::Model::Chunker.new(File.join("nlp_models/en-chunker.bin"))
54
+ token_model = OpenNlp::Model::Tokenizer.new("nlp_models/en-token.bin")
55
+ pos_model = OpenNlp::Model::POSTagger.new(File.join("nlp_models/en-pos-maxent.bin"))
56
+ chunker = OpenNlp::Chunker.new(chunk_model, token_model, pos_model)
22
57
 
23
58
  ## Contributing
24
59
 
@@ -26,4 +61,4 @@ TODO: Write usage instructions here
26
61
  2. Create your feature branch (`git checkout -b my-new-feature`)
27
62
  3. Commit your changes (`git commit -am 'Add some feature'`)
28
63
  4. Push to the branch (`git push origin my-new-feature`)
29
- 5. Create new Pull Request
64
+ 5. Create new Pull Request
@@ -3,7 +3,6 @@ module OpenNlp
3
3
  self.java_class = Java::opennlp.tools.chunker.ChunkerME
4
4
 
5
5
  def initialize(model, token_model, pos_model)
6
- #raise ArgumentError, "model must be an OpenNlp::Chunker::Model" unless model.is_a?(Chunker::Model)
7
6
  super(model)
8
7
 
9
8
  raise ArgumentError, "model must be an OpenNlp::Tokenizer::Model" unless token_model.is_a?(Model::Tokenizer)
@@ -11,7 +10,6 @@ module OpenNlp
11
10
 
12
11
  @tokenizer = Tokenizer.new(token_model)
13
12
  @pos_tagger = POSTagger.new(pos_model)
14
- #@j_instance = self.java_class.new(model.j_model)
15
13
  end
16
14
 
17
15
  def chunk(str)
@@ -32,8 +30,8 @@ module OpenNlp
32
30
 
33
31
  data.inject([]) do |acc, val|
34
32
  chunk = val[2]
35
- acc << [val[0]] if chunk[0] == 'B'
36
- acc.last << val[0] if chunk[0] == 'I'
33
+ acc << [{val[0] => val[1]}] if chunk[0] == 'B'
34
+ acc.last << {val[0] => val[1]} if chunk[0] == 'I'
37
35
 
38
36
  acc
39
37
  end
@@ -1,3 +1,3 @@
1
1
  module OpenNlp
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
@@ -28,7 +28,7 @@ describe OpenNlp::Chunker do
28
28
 
29
29
  it "should chunk a sentence" do
30
30
  chunks = chunker.chunk("The red fox sleeps soundly.")
31
- chunks.should == [["The", "red", "fox", "sleeps"], ["soundly"]]
31
+ chunks.should == [[{"The"=>"DT"}, {"red"=>"JJ"}, {"fox"=>"NN"}, {"sleeps"=>"NNS"}], [{"soundly"=>"RB"}]]
32
32
  end
33
33
 
34
34
  it "should raise an error when not passed a string" do
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: open_nlp
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.1
5
+ version: 0.0.2
6
6
  platform: java
7
7
  authors:
8
8
  - Hck
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-21 00:00:00.000000000 Z
12
+ date: 2012-09-24 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: JRuby tools wrapper for Apache OpenNLP
15
15
  email: