RubyGems - stanford-core-nlp - Versions diffs - 0.1 → 0.1.1 - Mend

stanford-core-nlp 0.1 → 0.1.1

Files changed (5) hide show

data/README +2 -2
data/lib/stanford-core-nlp.rb +53 -62
data/lib/stanford-core-nlp/jar-loader.rb +15 -21
data/lib/stanford-core-nlp/java-wrapper.rb +37 -0
metadata +8 -7

data/README CHANGED Viewed

@@ -1,3 +1,3 @@
-Treat - Text Retrieval and Annotation Toolkit
+Ruby bindings for the Stanford CoreNLP package
-See the wiki for more information at https://github.com/louismullie/treat/wiki/.
+See the wiki for more information at https://github.com/louismullie/stanford-core-nlp/wiki/.

data/lib/stanford-core-nlp.rb CHANGED Viewed

@@ -1,65 +1,82 @@
 module StanfordCoreNLP
-  VERSION = '0.1'
+  VERSION = '0.1.1'
   require 'stanford-core-nlp/jar-loader.rb'
+  require 'stanford-core-nlp/java-wrapper.rb'
   class << self
     # The path in which to look for the Stanford JAR files.
     # This is passed to JarLoader.
     attr_accessor :jar_path
     # The flags for starting the JVM.
     # Parser and named entity recognizer are very memory consuming.
-    attr_accessor :jvm_flags
+    attr_accessor :jvm_args
+    # A file to redirect JVM output to.
+    attr_accessor :log_file
+    # The model files. Use #set_model to modify these.
+    attr_accessor :model_files
   end
+  # The default JAR path is the gem's bin folder.
   self.jar_path = File.dirname(__FILE__) + '/../bin/'
-  self.jvm_flags = ['-Xms512M', '-Xmx1024M']
-  # Return the default properties (English models with
-  # tokenizer and sentence splitter).
-  def self.default_properties
-    {
-      'annotators' => 'tokenize, ssplit',
-      'pos.model' => self.jar_path + 'taggers/english-left3words-distsim.tagger',
-      'ner.model.3class' => self.jar_path + 'classifiers/all.3class.distsim.crf.ser.gz',
-      'ner.model.7class' => self.jar_path + 'classifiers/muc.7class.distsim.crf.ser.gz',
-      'ner.model.MISCclass' => self.jar_path + 'classifiers/conll.4class.distsim.crf.ser.gz',
-      'parser.model' => self.jar_path + 'grammar/englishPCFG.ser.gz',
-      'dcoref.demonym' => self.jar_path + 'dcoref/demonyms.txt',
-      'dcoref.animate' => self.jar_path + 'dcoref/animate.unigrams.txt',
-      'dcoref.female' => self.jar_path + 'dcoref/female.unigrams.txt',
-      'dcoref.inanimate' => self.jar_path + 'dcoref/inanimate.unigrams.txt',
-      'dcoref.male' => self.jar_path + 'dcoref/male.unigrams.txt',
-      'dcoref.neutral' => self.jar_path + 'dcoref/neutral.unigrams.txt',
-      'dcoref.plural' => self.jar_path + 'dcoref/plural.unigrams.txt',
-      'dcoref.singular' => self.jar_path + 'dcoref/singular.unigrams.txt',
-      'dcoref.states' => self.jar_path + 'dcoref/state-abbreviations.txt',
-      'dcoref.countries' => self.jar_path + 'dcoref/unknown.txt',     # Fix - can somebody provide this file?
-      'dcoref.states.provinces' => self.jar_path + 'dcoref/unknown.txt',   # Fix - can somebody provide this file?
-      'dcoref.extra.gender' => self.jar_path + 'dcoref/namegender.combine.txt'
-    }
+  # Load the JVM with a minimum heap size of 512MB and a
+  # maximum heap size of 1024MB.
+  self.jvm_args = ['-Xms512M', '-Xmx1024M']
+  # Turn logging off by default.
+  self.log_file = nil
+  # Default model files.
+  self.model_files = {
+    'pos.model' => 'taggers/english-left3words-distsim.tagger',
+    'ner.model.3class' => 'classifiers/all.3class.distsim.crf.ser.gz',
+    'ner.model.7class' => 'classifiers/muc.7class.distsim.crf.ser.gz',
+    'ner.model.MISCclass' => 'classifiers/conll.4class.distsim.crf.ser.gz',
+    'parser.model' => 'grammar/englishPCFG.ser.gz',
+    'dcoref.demonym' => 'dcoref/demonyms.txt',
+    'dcoref.animate' => 'dcoref/animate.unigrams.txt',
+    'dcoref.female' => 'dcoref/female.unigrams.txt',
+    'dcoref.inanimate' => 'dcoref/inanimate.unigrams.txt',
+    'dcoref.male' => 'dcoref/male.unigrams.txt',
+    'dcoref.neutral' => 'dcoref/neutral.unigrams.txt',
+    'dcoref.plural' => 'dcoref/plural.unigrams.txt',
+    'dcoref.singular' => 'dcoref/singular.unigrams.txt',
+    'dcoref.states' => 'dcoref/state-abbreviations.txt',
+    'dcoref.countries' => 'dcoref/unknown.txt',     # Fix - can somebody provide this file?
+    'dcoref.states.provinces' => 'dcoref/unknown.txt',   # Fix - can somebody provide this file?
+    'dcoref.extra.gender' => 'dcoref/namegender.combine.txt'
+  }
+  # Set a model file.
+  def self.set_model(name, file)
+    self.model_files[name] = file
   end
   # Load a StanfordCoreNLP pipeline with the specified JVM flags and
   # StanfordCoreNLP properties (hash of property => values).
-  def self.load(properties)
-    self.load_jars(jvm_flags, self.jar_path)
+  def self.load(*annotators)
+    self.load_jars(self.jvm_args, self.jar_path, self.log_file)
     self.create_classes
-    properties = default_properties.merge(properties)
+    # Prepend the JAR path to the model files.
+    properties = {}
+    self.model_files.each { |k,v| properties[k] = self.jar_path + v }
+    properties['annotators'] =
+    annotators.map { |x| x.to_s }.join(', ')
     CoreNLP.new(get_properties(properties))
   end
   # Load the jars.
-  def self.load_jars(jvm_flags, jar_path)
-    JarLoader.jvm_flags = jvm_flags
+  def self.load_jars(jvm_args, jar_path, log_file)
+    JarLoader.jvm_args = jvm_args
     JarLoader.jar_path = jar_path
+    JarLoader.log(log_file) if log_file
     JarLoader.load('joda-time.jar')
     JarLoader.load('xom.jar')
     JarLoader.load('stanford-corenlp.jar')
     JarLoader.load('bridge.jar')
   end
-  # Create the Ruby classes for core classes.
+  # Create the Ruby classes corresponding to the StanfordNLP
+  # core classes.
   def self.create_classes
     const_set(:CoreNLP, Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP'))
     const_set(:Annotation, Rjb::import('edu.stanford.nlp.pipeline.Annotation'))
@@ -76,31 +93,5 @@ module StanfordCoreNLP
     end
     props
   end
-  Rjb::Rjb_JavaProxy.class_eval do
-    # Get an annotation using the annotation bridge.
-    def get(annotation)
-      base_class = (annotation.to_s.split('_')[0] == 'coref') ?
-      'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
-      'edu.stanford.nlp.ling.CoreAnnotations$'
-      anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
-      url = "#{base_class}#{anno_class}Annotation"
-      AnnotationBridge.getAnnotation(self, url)
-    end
-    # Shorthand for to_string defined by Java classes.
-    def to_s; to_string; end
-    # Provide Ruby-style iterators to wrap Java iterators.
-    def each
-      if !java_methods.include?('iterator()')
-        raise 'This object cannot be iterated.'
-      else
-        i = self.iterator
-        while i.has_next; yield i.next;end
-      end
-    end
-  end
 end

data/lib/stanford-core-nlp/jar-loader.rb CHANGED Viewed

@@ -1,5 +1,4 @@
 module StanfordCoreNLP
   class JarLoader
     require 'rjb'
@@ -7,28 +6,32 @@ module StanfordCoreNLP
     # Configuration options.
     class << self
       # An array of flags to pass to the JVM.
-      attr_accessor :jvm_flags
+      attr_accessor :jvm_args
       attr_accessor :jar_path
       attr_accessor :log_file
     end
     # An array of string flags to supply to the JVM, e.g. ['-Xms512M', '-Xmx1024M']
-    self.jvm_flags = []
+    self.jvm_args = []
     # The path in which to look for Jars.
     self.jar_path = ''
-    # The name of the file to log to.
-    # Setting this before the parser automatically calls self.redirect_to_log
+    # By default, disable logging.
     self.log_file = nil
     # Load Rjb and create Java VM.
     def self.rjb_initialize
       return if ::Rjb::loaded?
-      ::Rjb::load(nil, self.jvm_flags)
-      redirect_to_log if self.log_file
+      ::Rjb::load(nil, self.jvm_args)
+      set_java_logging if self.log_file
     end
-    # Redirect the output of the JVM to self.log_file.
-    def self.redirect_to_log
+    # Enable logging.
+    def self.log(file = 'log.txt')
+      self.log_file = file
+    end
+    # Redirect the output of the JVM to supplied log file.
+    def self.set_java_logging
       const_set(:System, Rjb::import('java.lang.System'))
       const_set(:PrintStream, Rjb::import('java.io.PrintStream'))
       const_set(:File2, Rjb::import('java.io.File'))
@@ -37,7 +40,7 @@ module StanfordCoreNLP
       System.setOut(ps)
       System.setErr(ps)
     end
     # Load a jar.
     def self.load(jar)
       self.rjb_initialize
@@ -48,14 +51,5 @@ module StanfordCoreNLP
       ::Rjb::add_jar(jar)
     end
-    # Silence output and log to file.
-    def self.log(file = 'log.txt')
-      @@log_file = file
-    end
-    # Whether the output is logged or not.
-    def self.log?; @@log_file; end
   end
 end

data/lib/stanford-core-nlp/java-wrapper.rb ADDED Viewed

@@ -0,0 +1,37 @@
+module StanfordCoreNLP
+  # Modify the Rjb JavaProxy class to add our own methods to every Java object.
+  Rjb::Rjb_JavaProxy.class_eval do
+    # Dynamically defined on all proxied Java objects.
+    # Shorthand for to_string defined by Java classes.
+    def to_s; to_string; end
+    # Dynamically defined on all proxied Java iterators.
+    # Provide Ruby-style iterators to wrap Java iterators.
+    def each
+      if !java_methods.include?('iterator()')
+        raise 'This object cannot be iterated.'
+      else
+        i = self.iterator
+        while i.has_next; yield i.next; end
+      end
+    end
+    # Dynamically defined on all proxied annotation classes.
+    # Get an annotation using the annotation bridge.
+    def get(annotation)
+      if !java_methods.include?('get(Ljava.lang.Class;)')
+        raise 'No annotation can be retrieved on this object.'
+      else
+        base_class = (annotation.to_s.split('_')[0] == 'coref') ?
+        'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
+        'edu.stanford.nlp.ling.CoreAnnotations$'
+        anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
+        url = "#{base_class}#{anno_class}Annotation"
+        AnnotationBridge.getAnnotation(self, url)
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: stanford-core-nlp
 version: !ruby/object:Gem::Version
-  version: '0.1'
+  version: 0.1.1
   prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-01-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rjb
-  requirement: &70234870930100 !ruby/object:Gem::Requirement
+  requirement: &70258761325680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70234870930100
-description: ! ' High-level Ruby bindings to the Stanford CoreNLP package, a set natural
-  language processing tools for English, including tokenization, part-of-speech tagging,
-  lemmatization, named entity recognition, parsing, and coreference resolution. '
+  version_requirements: *70258761325680
+description: ! " High-level Ruby bindings to the Stanford CoreNLP package, a set of natural
+  language processing \ntools for English, including tokenization, part-of-speech
+  tagging, lemmatization, named entity recognition, \nparsing, and coreference resolution. "
 email:
 - louis.mullie@gmail.com
 executables: []
@@ -32,6 +32,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/stanford-core-nlp/jar-loader.rb
+- lib/stanford-core-nlp/java-wrapper.rb
 - lib/stanford-core-nlp.rb
 - bin/bridge.jar
 - bin/classifiers/all.3class.distsim.crf.ser.gz
@@ -86,5 +87,5 @@ rubyforge_project:
 rubygems_version: 1.8.15
 signing_key:
 specification_version: 3
-summary: Ruby bindings to the Stanford CoreNLP tools.
+summary: Ruby bindings to the Stanford Core NLP tools.
 test_files: []