stanford-core-nlp 0.1 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -1,3 +1,3 @@
1
- Treat - Text Retrieval and Annotation Toolkit
1
+ Ruby bindings for the Stanford CoreNLP package
2
2
 
3
- See the wiki for more information at https://github.com/louismullie/treat/wiki/.
3
+ See the wiki for more information at https://github.com/louismullie/stanford-core-nlp/wiki/.
@@ -1,65 +1,82 @@
1
1
  module StanfordCoreNLP
2
2
 
3
- VERSION = '0.1'
3
+ VERSION = '0.1.1'
4
4
  require 'stanford-core-nlp/jar-loader.rb'
5
-
5
+ require 'stanford-core-nlp/java-wrapper.rb'
6
+
6
7
  class << self
7
8
  # The path in which to look for the Stanford JAR files.
8
9
  # This is passed to JarLoader.
9
10
  attr_accessor :jar_path
10
11
  # The flags for starting the JVM.
11
12
  # Parser and named entity recognizer are very memory consuming.
12
- attr_accessor :jvm_flags
13
+ attr_accessor :jvm_args
14
+ # A file to redirect JVM output to.
15
+ attr_accessor :log_file
16
+ # The model files. Use #set_model to modify these.
17
+ attr_accessor :model_files
13
18
  end
14
19
 
20
+ # The default JAR path is the gem's bin folder.
15
21
  self.jar_path = File.dirname(__FILE__) + '/../bin/'
16
- self.jvm_flags = ['-Xms512M', '-Xmx1024M']
17
-
18
- # Return the default properties (English models with
19
- # tokenizer and sentence splitter).
20
- def self.default_properties
21
- {
22
- 'annotators' => 'tokenize, ssplit',
23
- 'pos.model' => self.jar_path + 'taggers/english-left3words-distsim.tagger',
24
- 'ner.model.3class' => self.jar_path + 'classifiers/all.3class.distsim.crf.ser.gz',
25
- 'ner.model.7class' => self.jar_path + 'classifiers/muc.7class.distsim.crf.ser.gz',
26
- 'ner.model.MISCclass' => self.jar_path + 'classifiers/conll.4class.distsim.crf.ser.gz',
27
- 'parser.model' => self.jar_path + 'grammar/englishPCFG.ser.gz',
28
- 'dcoref.demonym' => self.jar_path + 'dcoref/demonyms.txt',
29
- 'dcoref.animate' => self.jar_path + 'dcoref/animate.unigrams.txt',
30
- 'dcoref.female' => self.jar_path + 'dcoref/female.unigrams.txt',
31
- 'dcoref.inanimate' => self.jar_path + 'dcoref/inanimate.unigrams.txt',
32
- 'dcoref.male' => self.jar_path + 'dcoref/male.unigrams.txt',
33
- 'dcoref.neutral' => self.jar_path + 'dcoref/neutral.unigrams.txt',
34
- 'dcoref.plural' => self.jar_path + 'dcoref/plural.unigrams.txt',
35
- 'dcoref.singular' => self.jar_path + 'dcoref/singular.unigrams.txt',
36
- 'dcoref.states' => self.jar_path + 'dcoref/state-abbreviations.txt',
37
- 'dcoref.countries' => self.jar_path + 'dcoref/unknown.txt', # Fix - can somebody provide this file?
38
- 'dcoref.states.provinces' => self.jar_path + 'dcoref/unknown.txt', # Fix - can somebody provide this file?
39
- 'dcoref.extra.gender' => self.jar_path + 'dcoref/namegender.combine.txt'
40
- }
22
+ # Load the JVM with a minimum heap size of 512MB and a
23
+ # maximum heap size of 1024MB.
24
+ self.jvm_args = ['-Xms512M', '-Xmx1024M']
25
+ # Turn logging off by default.
26
+ self.log_file = nil
27
+
28
+ # Default model files.
29
+ self.model_files = {
30
+ 'pos.model' => 'taggers/english-left3words-distsim.tagger',
31
+ 'ner.model.3class' => 'classifiers/all.3class.distsim.crf.ser.gz',
32
+ 'ner.model.7class' => 'classifiers/muc.7class.distsim.crf.ser.gz',
33
+ 'ner.model.MISCclass' => 'classifiers/conll.4class.distsim.crf.ser.gz',
34
+ 'parser.model' => 'grammar/englishPCFG.ser.gz',
35
+ 'dcoref.demonym' => 'dcoref/demonyms.txt',
36
+ 'dcoref.animate' => 'dcoref/animate.unigrams.txt',
37
+ 'dcoref.female' => 'dcoref/female.unigrams.txt',
38
+ 'dcoref.inanimate' => 'dcoref/inanimate.unigrams.txt',
39
+ 'dcoref.male' => 'dcoref/male.unigrams.txt',
40
+ 'dcoref.neutral' => 'dcoref/neutral.unigrams.txt',
41
+ 'dcoref.plural' => 'dcoref/plural.unigrams.txt',
42
+ 'dcoref.singular' => 'dcoref/singular.unigrams.txt',
43
+ 'dcoref.states' => 'dcoref/state-abbreviations.txt',
44
+ 'dcoref.countries' => 'dcoref/unknown.txt', # Fix - can somebody provide this file?
45
+ 'dcoref.states.provinces' => 'dcoref/unknown.txt', # Fix - can somebody provide this file?
46
+ 'dcoref.extra.gender' => 'dcoref/namegender.combine.txt'
47
+ }
48
+
49
+ # Set a model file.
50
+ def self.set_model(name, file)
51
+ self.model_files[name] = file
41
52
  end
42
-
53
+
43
54
  # Load a StanfordCoreNLP pipeline that runs the given annotators,
44
55
  # using the configured JVM arguments and model files.
45
- def self.load(properties)
46
- self.load_jars(jvm_flags, self.jar_path)
56
+ def self.load(*annotators)
57
+ self.load_jars(self.jvm_args, self.jar_path, self.log_file)
47
58
  self.create_classes
48
- properties = default_properties.merge(properties)
59
+ # Prepend the JAR path to the model files.
60
+ properties = {}
61
+ self.model_files.each { |k,v| properties[k] = self.jar_path + v }
62
+ properties['annotators'] =
63
+ annotators.map { |x| x.to_s }.join(', ')
49
64
  CoreNLP.new(get_properties(properties))
50
65
  end
51
66
 
52
67
  # Load the jars.
53
- def self.load_jars(jvm_flags, jar_path)
54
- JarLoader.jvm_flags = jvm_flags
68
+ def self.load_jars(jvm_args, jar_path, log_file)
69
+ JarLoader.jvm_args = jvm_args
55
70
  JarLoader.jar_path = jar_path
71
+ JarLoader.log(log_file) if log_file
56
72
  JarLoader.load('joda-time.jar')
57
73
  JarLoader.load('xom.jar')
58
74
  JarLoader.load('stanford-corenlp.jar')
59
75
  JarLoader.load('bridge.jar')
60
76
  end
61
77
 
62
- # Create the Ruby classes for core classes.
78
+ # Create the Ruby classes corresponding to the StanfordNLP
79
+ # core classes.
63
80
  def self.create_classes
64
81
  const_set(:CoreNLP, Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP'))
65
82
  const_set(:Annotation, Rjb::import('edu.stanford.nlp.pipeline.Annotation'))
@@ -76,31 +93,5 @@ module StanfordCoreNLP
76
93
  end
77
94
  props
78
95
  end
79
-
80
- Rjb::Rjb_JavaProxy.class_eval do
81
-
82
- # Get an annotation using the annotation bridge.
83
- def get(annotation)
84
- base_class = (annotation.to_s.split('_')[0] == 'coref') ?
85
- 'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
86
- 'edu.stanford.nlp.ling.CoreAnnotations$'
87
- anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
88
- url = "#{base_class}#{anno_class}Annotation"
89
- AnnotationBridge.getAnnotation(self, url)
90
- end
91
-
92
- # Shorthand for to_string defined by Java classes.
93
- def to_s; to_string; end
94
-
95
- # Provide Ruby-style iterators to wrap Java iterators.
96
- def each
97
- if !java_methods.include?('iterator()')
98
- raise 'This object cannot be iterated.'
99
- else
100
- i = self.iterator
101
- while i.has_next; yield i.next;end
102
- end
103
- end
104
- end
105
-
96
+
106
97
  end
@@ -1,5 +1,4 @@
1
1
  module StanfordCoreNLP
2
-
3
2
  class JarLoader
4
3
 
5
4
  require 'rjb'
@@ -7,28 +6,32 @@ module StanfordCoreNLP
7
6
  # Configuration options.
8
7
  class << self
9
8
  # An array of flags to pass to the JVM.
10
- attr_accessor :jvm_flags
9
+ attr_accessor :jvm_args
11
10
  attr_accessor :jar_path
12
11
  attr_accessor :log_file
13
12
  end
14
13
 
15
14
  # An array of string flags to supply to the JVM, e.g. ['-Xms512M', '-Xmx1024M']
16
- self.jvm_flags = []
15
+ self.jvm_args = []
17
16
  # The path in which to look for Jars.
18
17
  self.jar_path = ''
19
- # The name of the file to log to.
20
- # Setting this before the parser automatically calls self.redirect_to_log
18
+ # By default, disable logging.
21
19
  self.log_file = nil
22
-
20
+
23
21
  # Load Rjb and create Java VM.
24
22
  def self.rjb_initialize
25
23
  return if ::Rjb::loaded?
26
- ::Rjb::load(nil, self.jvm_flags)
27
- redirect_to_log if self.log_file
24
+ ::Rjb::load(nil, self.jvm_args)
25
+ set_java_logging if self.log_file
28
26
  end
29
-
30
- # Redirect the output of the JVM to self.log_file.
31
- def self.redirect_to_log
27
+
28
+ # Enable logging.
29
+ def self.log(file = 'log.txt')
30
+ self.log_file = file
31
+ end
32
+
33
+ # Redirect the output of the JVM to supplied log file.
34
+ def self.set_java_logging
32
35
  const_set(:System, Rjb::import('java.lang.System'))
33
36
  const_set(:PrintStream, Rjb::import('java.io.PrintStream'))
34
37
  const_set(:File2, Rjb::import('java.io.File'))
@@ -37,7 +40,7 @@ module StanfordCoreNLP
37
40
  System.setOut(ps)
38
41
  System.setErr(ps)
39
42
  end
40
-
43
+
41
44
  # Load a jar.
42
45
  def self.load(jar)
43
46
  self.rjb_initialize
@@ -48,14 +51,5 @@ module StanfordCoreNLP
48
51
  ::Rjb::add_jar(jar)
49
52
  end
50
53
 
51
- # Silence output and log to file.
52
- def self.log(file = 'log.txt')
53
- @@log_file = file
54
- end
55
-
56
- # Whether the output is logged or not.
57
- def self.log?; @@log_file; end
58
-
59
54
  end
60
-
61
55
  end
@@ -0,0 +1,37 @@
1
+ module StanfordCoreNLP
2
+
3
+ # Modify the Rjb JavaProxy class to add our own methods to every Java object.
4
+ Rjb::Rjb_JavaProxy.class_eval do
5
+
6
+ # Dynamically defined on all proxied Java objects.
7
+ # Shorthand for to_string defined by Java classes.
8
+ def to_s; to_string; end
9
+
10
+ # Dynamically defined on all proxied Java iterators.
11
+ # Provide Ruby-style iterators to wrap Java iterators.
12
+ def each
13
+ if !java_methods.include?('iterator()')
14
+ raise 'This object cannot be iterated.'
15
+ else
16
+ i = self.iterator
17
+ while i.has_next; yield i.next; end
18
+ end
19
+ end
20
+
21
+ # Dynamically defined on all proxied annotation classes.
22
+ # Get an annotation using the annotation bridge.
23
+ def get(annotation)
24
+ if !java_methods.include?('get(Ljava.lang.Class;)')
25
+ raise 'No annotation can be retrieved on this object.'
26
+ else
27
+ base_class = (annotation.to_s.split('_')[0] == 'coref') ?
28
+ 'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
29
+ 'edu.stanford.nlp.ling.CoreAnnotations$'
30
+ anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
31
+ url = "#{base_class}#{anno_class}Annotation"
32
+ AnnotationBridge.getAnnotation(self, url)
33
+ end
34
+ end
35
+
36
+ end
37
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stanford-core-nlp
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-01-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rjb
16
- requirement: &70234870930100 !ruby/object:Gem::Requirement
16
+ requirement: &70258761325680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70234870930100
25
- description: ! ' High-level Ruby bindings to the Stanford CoreNLP package, a set natural
26
- language processing tools for English, including tokenization, part-of-speech tagging,
27
- lemmatization, named entity recognition, parsing, and coreference resolution. '
24
+ version_requirements: *70258761325680
25
+ description: ! " High-level Ruby bindings to the Stanford CoreNLP package, a set of natural
26
+ language processing \ntools for English, including tokenization, part-of-speech
27
+ tagging, lemmatization, named entity recognition, \nparsing, and coreference resolution. "
28
28
  email:
29
29
  - louis.mullie@gmail.com
30
30
  executables: []
@@ -32,6 +32,7 @@ extensions: []
32
32
  extra_rdoc_files: []
33
33
  files:
34
34
  - lib/stanford-core-nlp/jar-loader.rb
35
+ - lib/stanford-core-nlp/java-wrapper.rb
35
36
  - lib/stanford-core-nlp.rb
36
37
  - bin/bridge.jar
37
38
  - bin/classifiers/all.3class.distsim.crf.ser.gz
@@ -86,5 +87,5 @@ rubyforge_project:
86
87
  rubygems_version: 1.8.15
87
88
  signing_key:
88
89
  specification_version: 3
89
- summary: Ruby bindings to the Stanford CoreNLP tools.
90
+ summary: Ruby bindings to the Stanford Core NLP tools.
90
91
  test_files: []